diff --git a/inflight_batcher_llm/src/libtriton_tensorrtllm.ldscript b/.github/workflows/pre-commit.yml similarity index 83% rename from inflight_batcher_llm/src/libtriton_tensorrtllm.ldscript rename to .github/workflows/pre-commit.yml index 748714d1..4fa18732 100644 --- a/inflight_batcher_llm/src/libtriton_tensorrtllm.ldscript +++ b/.github/workflows/pre-commit.yml @@ -1,4 +1,4 @@ -# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -23,8 +23,16 @@ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -{ - global: - TRITONBACKEND_*; - local: *; -}; + +name: pre-commit + +on: + pull_request: + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5.0.0 + - uses: actions/setup-python@v6.0.0 + - uses: pre-commit/action@v3.0.1 diff --git a/.gitignore b/.gitignore index f4c2f069..a8cb1c8d 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ build/ .coverage *.onnx tmp/ +.idea diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index caca92b3..ee803e26 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,29 @@ +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ repos: - repo: https://github.com/pycqa/isort rev: 5.12.0 @@ -7,12 +33,12 @@ repos: rev: v1.1.13 hooks: - id: remove-crlf -- repo: https://github.com/pre-commit/mirrors-yapf - rev: v0.32.0 +- repo: https://github.com/google/yapf + rev: v0.43.0 hooks: - id: yapf - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.1.0 + rev: v6.0.0 hooks: - id: check-added-large-files exclude: 'tensorrt_llm/' diff --git a/README.md b/README.md index 9d9e0088..9c64ed57 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ tensorrt_llm --> postprocessing in this case), we - # expect each duration metric for the ensemble model to be greater the - # corresponding sum for that metric across each of the submodels. - duration_keys = [ - "request_duration_us", "compute_input_duration_us", - "compute_infer_duration_us", "compute_output_duration_us" - ] - for stat in duration_keys: - composing_stat_duration = sum([ - int(model_metrics[model][stat]) for model in model_metrics - if model != "ensemble" - ]) - ensemble_stat_duration = int(model_metrics["ensemble"][stat]) - self.assertTrue(composing_stat_duration > 0) - self.assertTrue(ensemble_stat_duration > 0) - self.assertTrue(ensemble_stat_duration >= composing_stat_duration) - - def test_end_to_end(self): - try: - client = utils.create_inference_server_client("http", - "localhost:8000", - concurrency=128, - verbose=True) - except Exception as e: - print("channel creation failed: " + str(e)) - sys.exit(1) - - max_input_len = 500 - op_tokens_per_word = 1.3 - dataset = "./simple_data.json" - - prompts = [] - output_lens = [] - with open(dataset, "r") as f: - data_dict = json.load(f) - for req in data_dict: - prompt = req["input"] + " " + req["instruction"] - output = req["output"] - # 1.3 is a magic number that converts number of words to number of tokens - if int(len(prompt.split(" ")) / - op_tokens_per_word) > max_input_len: - continue - prompts.append(prompt) - output_lens.append( - int(len(output.split(" ")) * op_tokens_per_word)) - - self._run_infer(client, prompts, output_lens) - metrics = self._get_metrics() - filename = "./base_metrics.out" - with open(filename, "w+") as metrics_file: - metrics_file.write(metrics) - self._verify_base_metrics(filename) - - -if __name__ == "__main__": - unittest.main() diff --git a/ci/L0_backend_trtllm/custom_metrics_verification_tests.py b/ci/L0_backend_trtllm/custom_metrics_verification_tests.py deleted file mode 100644 index ad6c539f..00000000 --- a/ci/L0_backend_trtllm/custom_metrics_verification_tests.py +++ /dev/null @@ -1,174 +0,0 @@ -#!/usr/bin/python -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import json -import os -import re -import unittest -from datetime import datetime, timedelta - -AVAILABLE_GPUS = int(os.environ.get("AVAILABLE_GPUS", "1")) - -metric_to_stat_dict = { - "request_type=context": "Context Requests", - "request_type=scheduled": "Scheduled Requests", - "request_type=max": "Max Request Count", - "request_type=active": "Active Request Count", - "memory_type=pinned": "Runtime Pinned Memory Usage", - "memory_type=gpu": "Runtime GPU Memory Usage", - "memory_type=cpu": "Runtime CPU Memory Usage", - "kv_cache_block_type=tokens_per": "Tokens per KV cache block", - "kv_cache_block_type=used": "Used KV cache blocks", - "kv_cache_block_type=free": "Free KV cache blocks", - "kv_cache_block_type=max": "Max KV cache blocks", - "inflight_batcher_specific_metric=total_context_tokens": - "Total Context Tokens", - "inflight_batcher_specific_metric=micro_batch_id": "MicroBatch ID", - "inflight_batcher_specific_metric=generation_requests": - "Generation Requests", - "inflight_batcher_specific_metric=paused_requests": "Paused Requests", - "v1_specific_metric=total_context_tokens": "Total Context Tokens", - "v1_specific_metric=total_generation_tokens": "Total Generation Tokens", - "v1_specific_metric=empty_generation_slots": "Empty Generation Slots", - "general_type=iteration_counter": "Iteration Counter", - "general_type=timestamp": "Timestamp", -} - - -class CustomMetricsTest(unittest.TestCase): - - def _parse_log_file(self, filename): - with open(filename) as log_file: - for line in reversed(list(log_file)): - if "Active Request Count" in line: - match = re.search(r'({.*})', line) - if match: - json_string = match.group(1) - try: - json_string = json_string.replace('\\"', '"') - except json.JSONDecodeError as e: - raise Exception("Error parsing the JSON string: ", - e) - else: - raise Exception("No JSON found in the log file") - - return json.loads(json_string) - - def _parse_triton_metrics(self, filename, is_v1): - curl_counts = {} - with open(filename) as metrics_file: - for line in metrics_file: - metric_value = "" - if line[0] != "#" and "nv_trt_llm" in line: - metric_output = re.sub(r"^.*?{", "{", line).split() - metric_key = metric_output[0] - metric_value = metric_output[1] - key = self._convert_metric_key_to_stats_key( - metric_key, is_v1) - curl_counts[key] = metric_value - return curl_counts - - def _convert_metric_key_to_stats_key(self, metric_output, is_v1): - # Converts: - # '{model="tensorrt_llm",request_type="context",version="1"}' - # to: - # ['model=tensorrt_llm', 'request_type=context', 'version=1'] - base = metric_output.replace('"', "").strip("{}").split(",") - key = [ - i for i in base - if not i.startswith('model') and not i.startswith('version') - ][0] - self.assertIn(key, metric_to_stat_dict) - 
if (is_v1): - self.assertNotIn("inflight_batcher_specific_metric", key) - else: - self.assertNotIn("v1_specific_metric", key) - return metric_to_stat_dict[key] - - def _base_test(self, stats_file, metrics_file, is_v1): - stats = self._parse_log_file(stats_file) - metrics = self._parse_triton_metrics(metrics_file, is_v1) - self.assertEqual(len(stats.keys()), len(metrics.keys())) - self.assertEqual( - list(stats.keys()).sort(), - list(metrics.keys()).sort()) - for metric_key in stats.keys(): - if metric_key != "Timestamp": - self.assertEqual(int(stats[metric_key]), - int(metrics[metric_key])) - else: - dt_log = datetime.strptime(stats[metric_key], - '%m-%d-%Y %H:%M:%S') - dt_curl = datetime.utcfromtimestamp( - int(metrics[metric_key]) // 1000000) - difference = dt_log - dt_curl - self.assertTrue( - timedelta(seconds=-1) <= difference, - difference <= timedelta(seconds=1)) - - def test_1_gpu_v1(self): - self._base_test("1gpu_v1_no_streaming_server.log", - "1gpu_v1_no_stream_metrics.out", True) - - def test_1_gpu_IFB_no_stream(self): - self._base_test("1gpu_IFB_no_streaming_server.log", - "1gpu_IFB_no_stream_metrics.out", False) - - def test_1_gpu_IFB_stream(self): - self._base_test("1gpu_IFB_streaming_server.log", - "1gpu_IFB_stream_metrics.out", False) - - if AVAILABLE_GPUS >= 2: - - def test_2_gpu_v1(self): - self._base_test("2gpu_v1_no_streaming_server.log", - "2gpu_v1_no_stream_metrics.out", True) - - def test_2_gpu_IFB_no_stream(self): - self._base_test("2gpu_IFB_no_streaming_server.log", - "2gpu_IFB_no_stream_metrics.out", False) - - def test_2_gpu_IFB_stream(self): - self._base_test("2gpu_IFB_streaming_server.log", - "2gpu_IFB_stream_metrics.out", False) - - if AVAILABLE_GPUS >= 4: - - def test_4_gpu_v1(self): - self._base_test("4gpu_v1_no_streaming_server.log", - "4gpu_v1_no_stream_metrics.out", True) - - def test_4_gpu_IFB_no_stream(self): - self._base_test("4gpu_IFB_no_streaming_server.log", - "4gpu_IFB_no_stream_metrics.out", False) - - def test_4_gpu_IFB_stream(self): - self._base_test("4gpu_IFB_streaming_server.log", - "4gpu_IFB_stream_metrics.out", False) - - -if __name__ == "__main__": - unittest.main() diff --git a/ci/L0_backend_trtllm/generate_engines.sh b/ci/L0_backend_trtllm/generate_engines.sh deleted file mode 100644 index e51bcbc1..00000000 --- a/ci/L0_backend_trtllm/generate_engines.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -BASE_DIR=/opt/tritonserver/tensorrtllm_backend/ci/L0_backend_trtllm -GPT_DIR=/opt/tritonserver/tensorrtllm_backend/tensorrt_llm/examples/gpt -TRTLLM_DIR=/opt/tritonserver/tensorrtllm_backend/tensorrt_llm/ - -function build_base_model { - local NUM_GPUS=$1 - cd ${GPT_DIR} - rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 - pushd gpt2 && rm pytorch_model.bin model.safetensors && wget -q https://huggingface.co/gpt2-medium/resolve/main/pytorch_model.bin && popd - python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size ${NUM_GPUS} --output_dir ./c-model/gpt2/${NUM_GPUS}-gpu/ - cd ${BASE_DIR} -} - -function build_tensorrt_engine_inflight_batcher { - local NUM_GPUS=$1 - cd ${GPT_DIR} - local GPT_MODEL_DIR=./c-model/gpt2/${NUM_GPUS}-gpu/ - local OUTPUT_DIR=inflight_${NUM_GPUS}_gpu/ - # ./c-model/gpt2/ must already exist (it will if build_base_model - # has already been run) - extra_args="" - # If no nvlink, disable custom all reduce. - if [ "$(nvidia-smi nvlink -s | wc -l)" -eq "0" ] || [ $(nvidia-smi nvlink --status | grep inActive | wc -l) -ge 1 ]; then - extra_args+="--use_custom_all_reduce=disable" - fi - trtllm-build --checkpoint_dir "${GPT_MODEL_DIR}" \ - --gpt_attention_plugin float16 \ - --remove_input_padding enable \ - --paged_kv_cache enable \ - --gemm_plugin float16 \ - --workers "${NUM_GPUS}" \ - --output_dir "${OUTPUT_DIR}" \ - ${extra_args} - cd ${BASE_DIR} -} - -# Downgrade to legacy version to accommodate Triton CI runners -pip install pynvml==11.4.0 - -# Generate the TRT_LLM model engines -NUM_GPUS_TO_TEST=("1" "2" "4") -for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do - AVAILABLE_GPUS=$(nvidia-smi -L | wc -l) - if [ "$AVAILABLE_GPUS" -lt "$NUM_GPU" ]; then - continue - fi - - build_base_model "${NUM_GPU}" - build_tensorrt_engine_inflight_batcher "${NUM_GPU}" -done - -# Move the TRT_LLM model engines to the CI directory -mkdir engines -mv ${GPT_DIR}/inflight_*_gpu/ engines/ - -# Move the tokenizer into the CI directory -mkdir tokenizer -mv ${GPT_DIR}/gpt2/* tokenizer/ diff --git a/ci/L0_backend_trtllm/simple_data.json b/ci/L0_backend_trtllm/simple_data.json deleted file mode 100644 index 9b7bebca..00000000 --- a/ci/L0_backend_trtllm/simple_data.json +++ /dev/null @@ -1,67 +0,0 @@ -[ - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. 
Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. 
After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. 
Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. 
Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - } -] diff --git a/ci/L0_backend_trtllm/test.sh b/ci/L0_backend_trtllm/test.sh deleted file mode 100644 index b947971a..00000000 --- a/ci/L0_backend_trtllm/test.sh +++ /dev/null @@ -1,410 +0,0 @@ -#!/bin/bash -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-SERVER_IPADDR=${TRITONSERVER_IPADDR:=localhost} -SERVER_TIMEOUT=${SERVER_TIMEOUT:=120} -DATASET="$PWD/simple_data.json" -TOOLS_DIR='/opt/tritonserver/tensorrtllm_backend/tools' -STREAM_DIR='/opt/tritonserver/tensorrtllm_backend/inflight_batcher_llm/client' -MODEL_DIR="$PWD/triton_model_repo" -SERVER=/opt/tritonserver/bin/tritonserver -TOKENIZER_DIR=/opt/tritonserver/tensorrtllm_backend/ci/L0_backend_trtllm/tokenizer -BASE_DIR=/opt/tritonserver/tensorrtllm_backend/ci/L0_backend_trtllm -BASE_METRICS_VERIFICATION_TEST=base_metrics_verification_tests.py -BASE_METRICS_VERIFICATION_LOG="base_metrics_verification.log" -CUSTOM_METRICS_VERIFICATION_TEST=custom_metrics_verification_tests.py -CUSTOM_METRICS_VERIFICATION_LOG="custom_metrics_verification.log" -SERVER_PID=0 -SLEEP_DURATION=3 - -# Force environment to use python version 3 -apt update -q=2 \ - && apt install -y python-is-python3 - -# Helpers =============================== -function replace_config_tags { - tag_to_replace="${1}" - new_value="${2}" - config_file_path="${3}" - sed -i "s|${tag_to_replace}|${new_value}|g" ${config_file_path} - -} - -function run_server { - SERVER_ARGS="${1}" - python3 /opt/tritonserver/tensorrtllm_backend/scripts/launch_triton_server.py ${SERVER_ARGS} > ${SERVER_LOG} 2>&1 & - sleep 2 # allow time to obtain the pid(s) - # Read PIDs into an array, trimming whitespaces - readarray -t SERVER_PID < <(pgrep "tritonserver") -} - -# Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on -# success, 1 on failure -function wait_for_server_ready() { - local wait_time_secs="${1:-30}"; shift - local spids=("$@"); - - WAIT_RET=0 - - local wait_secs=$wait_time_secs - until test $wait_secs -eq 0 ; do - # Multi-GPU will spawn multiple pids - for pid in "${spids[@]}"; do - if ! kill -0 $pid > /dev/null 2>&1; then - echo "=== Server not running." - WAIT_RET=1 - return - fi - done - - sleep 1; - - set +e - code=`curl -s -w %{http_code} ${SERVER_IPADDR}:8000/v2/health/ready` - set -e - if [ "$code" == "200" ]; then - code=`curl -s -w %{http_code} -o ./curl.out -d'{"log_verbose_level":1}' localhost:8000/v2/logging` - assert_curl_success "Failed to change log settings necessary for verification" ${BASH_LINENO} - return - fi - - ((wait_secs--)); - done - - echo "=== Timeout $wait_time_secs secs. Server not ready." - WAIT_RET=1 -} - -function reset_model_repo { - rm -rf triton_model_repo/ - mkdir ${MODEL_DIR} -} - -function kill_server { - pgrep tritonserver | xargs kill -SIGINT - if pgrep -x "trtllmExecutorWorker" > /dev/null; then - pkill -SIGINT -f "trtllmExecutorWorker" - fi -} - -function wait_for_server_terminated { - local spids=("$@"); - for pid in "${spids[@]}"; do - echo "Waiting for proc ${pid} to terminate..." - while true; do - if ! 
(kill -0 $pid) > /dev/null 2>&1; then - break - fi - sleep 1 - done - done -} - -function assert_curl_success { - message="${1}" - original_line_no="${2}" - RET=0 - if [ "$code" != "200" ]; then - cat ./curl.out - cat ${SERVER_LOG} - echo -e "\n***\n*** ${message} : line ${original_line_no}\n***" - RET=1 - fi - return ${RET} -} - -# ======================================= - -rm -f *.log *.out -# Generate TRT_LLM engines and install dependencies -source ./generate_engines.sh -python3 -m pip install --upgrade pip && \ - pip3 install tritonclient[all] && \ - pip3 install pandas && \ - pip3 install tabulate - -export AVAILABLE_GPUS=$(nvidia-smi -L | wc -l) - -RET=0 - -NUM_GPUS_TO_TEST=("1" "2" "4") -for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do - if [ "$AVAILABLE_GPUS" -lt "$NUM_GPU" ]; then - break - fi - - SERVER_ARGS="--world_size=${NUM_GPU} --model_repo=${MODEL_DIR}" - - reset_model_repo - - cp -r /opt/tritonserver/tensorrtllm_backend/all_models/inflight_batcher_llm/* ${MODEL_DIR} - rm -rf ${MODEL_DIR}/tensorrt_llm_bls - replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/ensemble/config.pbtxt" - replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/preprocessing/config.pbtxt" - replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_DIR}/preprocessing/config.pbtxt" - replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_DIR}/preprocessing/config.pbtxt" - replace_config_tags '${decoupled_mode}' 'False' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - replace_config_tags '${batching_strategy}' 'INVALID' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - replace_config_tags '${engine_dir}' "${MODEL_DIR}/tensorrt_llm/1/inflight_${NUM_GPU}_gpu/" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - replace_config_tags '${max_queue_delay_microseconds}' "50000" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/postprocessing/config.pbtxt" - replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_DIR}/postprocessing/config.pbtxt" - replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_DIR}/postprocessing/config.pbtxt" - - # Copy the engine and place it into the model folder - cp -r ${BASE_DIR}/engines/inflight_${NUM_GPU}_gpu/ triton_model_repo/tensorrt_llm/1 - - # Invalid GPT model Type - SERVER_LOG="./${NUM_GPU}gpu_invalid_batch_strat.log" - - run_server "${SERVER_ARGS}" - wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} - - # Expect invalid GPT model type error to be gracefully handled - if [ `grep -c "Invalid gpt_model_type" $SERVER_LOG` == "0" ]; then - echo -e "\n***\n*** GPT model type error not handled gracefully: line ${LINENO}\n***" - cat $SERVER_LOG - exit 1 - fi - - wait_for_server_terminated ${SERVER_PID[@]} - - # inflight batching OFF (V1) - # streaming OFF - SERVER_LOG="./${NUM_GPU}gpu_v1_no_streaming_server.log" - replace_config_tags 'INVALID' 'V1' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - - run_server "${SERVER_ARGS}" - wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} - if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill $SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - set -e - python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \ - --max-input-len=500 \ - dataset 
--dataset=${DATASET} \ - --tokenizer-dir=${TOKENIZER_DIR} - - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing v1 benchmark_core_model test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 - fi - set +e - - set -e - python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \ - --max-input-len=500 \ - --dataset=${DATASET} - - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing v1 end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 - fi - set +e - - # Make sure the metrics is retrieved after the server has updated the metrics internally - sleep ${SLEEP_DURATION} - curl localhost:8002/metrics -o ${NUM_GPU}gpu_v1_no_stream_metrics.out - - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - - # inflight batching ON - # streaming OFF - SERVER_LOG="./${NUM_GPU}gpu_IFB_no_streaming_server.log" - replace_config_tags 'V1' 'inflight_fused_batching' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - - run_server "${SERVER_ARGS}" - wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} - if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill $SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - set -e - python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \ - --max-input-len=500 \ - dataset --dataset=${DATASET} \ - --tokenizer-dir=${TOKENIZER_DIR} - - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching benchmark_core_model test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 - fi - set +e - - set -e - python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \ - --max-input-len=500 \ - --dataset=${DATASET} - - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 - fi - set +e - - # Make sure the metrics is retrieved after the server has updated the metrics internally - sleep ${SLEEP_DURATION} - curl localhost:8002/metrics -o ${NUM_GPU}gpu_IFB_no_stream_metrics.out - - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - - # Start a clean server to verify base metrics are being - # reported correctly - SERVER_LOG="./${NUM_GPU}gpu_IFB_no_streaming_base_metrics.log" - run_server "${SERVER_ARGS}" - wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} - if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill $SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - set -e - - python3 ${BASE_METRICS_VERIFICATION_TEST} >> ${BASE_METRICS_VERIFICATION_LOG} 2>&1 - if [ $? 
-ne 0 ]; then - cat ${BASE_METRICS_VERIFICATION_LOG} - echo -e "\n***\n*** Error executing base metrics verification test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" - RET=1 - fi - set +e - - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - - # World size must be 1 when using multi-model - if [ "${NUM_GPU}" == "0" ]; then - # Multi-model - SERVER_LOG="./${NUM_GPU}gpu_multi_model.log" - run_server "${SERVER_ARGS} --multi-model" - wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} - if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill $SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - set -e - - python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \ - --max-input-len=500 \ - --dataset=${DATASET} - - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 - fi - set +e - - # Make sure the metrics is retrieved after the server has updated the metrics internally - sleep ${SLEEP_DURATION} - curl localhost:8002/metrics -o ${NUM_GPU}gpu_multi_model_metrics.out - - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - fi - - # inflight batching ON - # streaming ON - SERVER_LOG="./${NUM_GPU}gpu_IFB_streaming_server.log" - replace_config_tags 'decoupled: False' 'decoupled: True' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - - run_server "${SERVER_ARGS}" - wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} - if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill $SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - set -e - python3 ${STREAM_DIR}/end_to_end_grpc_client.py \ - --prompt="My name is" - - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching end-to-end streaming test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 - fi - set +e - - # Make sure the metrics is retrieved after the server has updated the metrics internally - sleep ${SLEEP_DURATION} - curl localhost:8002/metrics -o ${NUM_GPU}gpu_IFB_stream_metrics.out - - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - -done - -# Verify TRT LLM statistics are being properly reported as custom metrics -python3 ${CUSTOM_METRICS_VERIFICATION_TEST} >> ${CUSTOM_METRICS_VERIFICATION_LOG} 2>&1 -if [ $? -ne 0 ]; then - cat ${CUSTOM_METRICS_VERIFICATION_LOG} - RET=1 -fi - -if [ $RET -eq 0 ]; then - echo -e "\n***\n*** Test Passed\n***" -else - echo -e "\n***\n*** Test FAILED\n***" -fi - -exit $RET diff --git a/ci/README.md b/ci/README.md deleted file mode 100644 index 8a20dfb0..00000000 --- a/ci/README.md +++ /dev/null @@ -1,112 +0,0 @@ - - -# Testing TensorRT-LLM backend - -Tests in this CI directory can be run manually to provide extensive testing. - -## Run QA Tests - -Before the Triton 23.10 release, you can launch the Triton 23.09 container -`nvcr.io/nvidia/tritonserver:23.09-py3` and add the directory -`/opt/tritonserver/backends/tensorrtllm` within the container following the -instructions in [Option 3 Build via CMake](../README.md#option-3-build-via-cmake). - -Run the testing within the Triton container. 
- -```bash -docker run --rm -it --net host --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 --gpus all -v /path/to/tensorrtllm_backend:/opt/tritonserver/tensorrtllm_backend nvcr.io/nvidia/tritonserver:23.10-trtllm-python-py3 bash - -# Change directory to the test and run the test.sh script -cd /opt/tritonserver/tensorrtllm_backend/ci/ -bash -x ./test.sh -``` - -## Run the e2e/benchmark_core_model to benchmark - -These two tests are ran in the [L0_backend_trtllm](./L0_backend_trtllm/) -test. Below are the instructions to run the tests manually. - -### Generate the model repository - -Follow the instructions in the -[Create the model repository](../README.md#create-the-model-repository) -section to prepare the model repository. - -### Modify the model configuration - -Follow the instructions in the -[Modify the model configuration](../README.md#modify-the-model-configuration) -section to modify the model configuration based on the needs. - -### End to end test - -[End to end test script](../tools/inflight_batcher_llm/end_to_end_test.py) sends -requests to the deployed `ensemble` model. - -Ensemble model is ensembled by three models: `preprocessing`, `tensorrt_llm` and `postprocessing`: -- "preprocessing": This model is used for tokenizing, meaning the conversion from prompts(string) to input_ids(list of ints). -- "tensorrt_llm": This model is a wrapper of your TensorRT-LLM model and is used for inferencing -- "postprocessing": This model is used for de-tokenizing, meaning the conversion from output_ids(list of ints) to outputs(string). - -The end to end latency includes the total latency of the three parts of an ensemble model. - -```bash -cd tools/inflight_batcher_llm -python3 end_to_end_test.py --dataset -``` - -Expected outputs -``` -[INFO] Functionality test succeed. -[INFO] Warm up for benchmarking. -[INFO] Start benchmarking on 125 prompts. -[INFO] Total Latency: 11099.243 ms -``` - -### benchmark_core_model - -[benchmark_core_model script](../tools/inflight_batcher_llm/benchmark_core_model.py) -sends requests directly to the deployed `tensorrt_llm` model, the benchmark_core_model -latency indicates the inference latency of TensorRT-LLM, not including the -pre/post-processing latency which is usually handled by a third-party library -such as HuggingFace. - -```bash -cd tools/inflight_batcher_llm -python3 benchmark_core_model.py dataset --dataset -``` - -Expected outputs - -``` -[INFO] Warm up for benchmarking. -[INFO] Start benchmarking on 125 prompts. 
-[INFO] Total Latency: 10213.462 ms -``` -*Please note that the expected outputs in that document are only for reference, specific performance numbers depend on the GPU you're using.* diff --git a/dockerfile/Dockerfile.triton.trt_llm_backend b/dockerfile/Dockerfile.triton.trt_llm_backend index 524ca41a..e36da3ec 100644 --- a/dockerfile/Dockerfile.triton.trt_llm_backend +++ b/dockerfile/Dockerfile.triton.trt_llm_backend @@ -1,53 +1,84 @@ -ARG BASE_IMAGE # Use NGC PyTorch image as base image +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.06-py3-min +ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:25.06-py3 +ARG NVRTC_VER=12.9.86-1 +ARG TRT_VER=10.11.0.33 +ARG NCCL_VER=2.27.5-1+cuda12.9 +ARG RELEASE_URL_TRT_x86=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.11.0/tars/TensorRT-${TRT_VER}.Linux.x86_64-gnu.cuda-12.9.tar.gz +ARG RELEASE_URL_TRT_ARM=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.11.0/tars/TensorRT-${TRT_VER}.Linux.aarch64-gnu.cuda-12.9.tar.gz + +# Versions of packages to copy from pytorch image +ARG TORCH_VER=2.8.0a0+5228986c39.nv25.6 +ARG TORCHVISION_VER=0.22.0a0+95f10a4e +ARG SETUPTOOLS_VER=78.1.1 +ARG PYTORCH_TRITON_VER=3.3.0+git96316ce52.nvinternal +ARG JINJA2_VER=3.1.6 +ARG NETWORKX_VER=3.5 +ARG SYMPY_VER=1.14.0 +ARG PACKAGING_VER=23.2 +ARG FLASH_ATTN_VER=2.7.4.post1 + +ARG TENSORRTLLM_REPO=https://github.com/NVIDIA/TensorRT-LLM.git +ARG TENSORRTLLM_REPO_TAG=release/1.0 +ARG TENSORRTLLM_VER=1.0.0 + +FROM ${PYTORCH_IMAGE} AS pytorch_image +FROM ${BASE_IMAGE} AS install_dependencies -FROM ${BASE_IMAGE} as install_dependencies +WORKDIR /workspace + +# Might not need to copy cusparseLt in the future once it's included in DLFW cuda container +COPY --from=pytorch_image /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/ +ENV PIP_BREAK_SYSTEM_PACKAGES=1 RUN apt-get update -q=2 \ && apt-get install -y --no-install-recommends \ + python3-dev \ python3-pip \ - ccache \ git-lfs \ + # Remove previous TRT installation + && apt-get purge -y "libnvinfer*" \ + && pip3 uninstall -y tensorrt \ && rm -rf /var/lib/apt/lists/* -# Remove previous TRT installation -# We didn't remove libnvinfer* here because tritonserver depends on the pre-installed libraries. -RUN apt-get remove -y tensorrt* -RUN pip3 uninstall -y tensorrt ARG TRT_VER +ARG NVRTC_VER +ARG NCCL_VER ENV TRT_VERSION=$TRT_VER \ TRT_VER=$TRT_VER \ CUDA_VER=$CUDA_VERSION \ CUDNN_VER=$CUDNN_VERSION \ - NCCL_VER=$NCCL_VERSION \ - CUBLAS_VER=$CUBLAS_VERSION + NCCL_VER=$NCCL_VER \ + CUBLAS_VER=$CUBLAS_VERSION \ + NVRTC_VER="${NVRTC_VER}" -LABEL TRT_VERSION $TRT_VER +LABEL TRT_VERSION=$TRT_VER +LABEL NCCL_VER=$NCCL_VER -# Download & install internal TRT release +# Install NVRTC RUN [ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" \ - && curl -o /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$arch/cuda-keyring_1.0-1_all.deb \ + && curl -o /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$arch/cuda-keyring_1.1-1_all.deb \ && apt install /tmp/cuda-keyring.deb \ && rm /tmp/cuda-keyring.deb \ - && apt-get update -q=2 \ - && rm -rf /var/lib/apt/lists/* - -ARG NVRTC_VER="12.4.99-1" -ENV NVRTC_VER="${NVRTC_VER}" - -RUN apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev* -RUN CUDA_VER_SHORT=$(echo $CUDA_VER | awk -F. 
'{print $1"."$2}') \ - && NVRTC_CUDA_VERSION=$(echo $CUDA_VER_SHORT | sed 's/\./-/g') \ - && apt update -qq \ - && apt-get install -y --no-install-recommends cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER} \ + && apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev* \ + && CUDA_VER_SHORT=${CUDA_VER: 0:4} \ + && NVRTC_CUDA_VERSION=${CUDA_VER_SHORT/./-} \ + && apt-get update -qq \ + && apt-get install -y --no-install-recommends \ + cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER} \ + libnccl2=${NCCL_VER} \ + libnccl-dev=${NCCL_VER} \ && rm -rf /var/lib/apt/lists/* +# Download & install TRT release ARG RELEASE_URL_TRT_x86 ARG RELEASE_URL_TRT_ARM RUN [ "$(uname -m)" != "x86_64" ] && RELEASE_URL_TRT=${RELEASE_URL_TRT_ARM} || RELEASE_URL_TRT=${RELEASE_URL_TRT_x86} \ && curl -fSL -o /tmp/tensorrt.tar.gz ${RELEASE_URL_TRT} \ - && tar xzvf /tmp/tensorrt.tar.gz -C /usr/local \ + # Extract the tarball, excluding Windows libraries and static libraries as + # they are not needed for Linux build + && tar xzvf /tmp/tensorrt.tar.gz --exclude="lib*win.so*" --exclude="*.a" -C /usr/local \ && rm /tmp/tensorrt.tar.gz \ && find /usr/local -maxdepth 1 -name Tens* -type d -exec ln -s {} /usr/local/tensorrt \; @@ -56,20 +87,110 @@ RUN pip3 install /usr/local/tensorrt/python/tensorrt-*-cp$( python3 -c "import s ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH} ENV TRT_ROOT=/usr/local/tensorrt -FROM install_dependencies as tensorrt_llm_build -RUN pip3 install --no-cache-dir \ - cmake \ - polygraphy==0.49.0 \ - mpi4py==3.1.5 +FROM install_dependencies AS tensorrt_llm_code WORKDIR /workspace -COPY scripts scripts -COPY tensorrt_llm tensorrt_llm -RUN cd tensorrt_llm && python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" --clean --job_count 18 && cd .. 
-FROM install_dependencies as base +ARG TENSORRTLLM_REPO +ARG TENSORRTLLM_REPO_TAG +RUN git-lfs install \ + && git clone --single-branch --recurse-submodules --depth=1 -b ${TENSORRTLLM_REPO_TAG} ${TENSORRTLLM_REPO} tensorrt_llm + +# Final stage to build the TRT-LLM container +FROM ${BASE_IMAGE} AS final_stage + +ARG TORCH_VER +ARG TORCHVISION_VER +ARG SETUPTOOLS_VER +ARG PYTORCH_TRITON_VER +ARG JINJA2_VER +ARG NETWORKX_VER +ARG SYMPY_VER +ARG PACKAGING_VER +ARG FLASH_ATTN_VER +# Copy necessary files from the base stage +COPY --from=pytorch_image /usr/local/lib/lib* /usr/local/lib/ +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torch /usr/local/lib/python3.12/dist-packages/torch +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torch-${TORCH_VER}.dist-info /usr/local/lib/python3.12/dist-packages/torch-${TORCH_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchgen /usr/local/lib/python3.12/dist-packages/torchgen +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision /usr/local/lib/python3.12/dist-packages/torchvision +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision-${TORCHVISION_VER}.dist-info /usr/local/lib/python3.12/dist-packages/torchvision-${TORCHVISION_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision.libs /usr/local/lib/python3.12/dist-packages/torchvision.libs +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/setuptools /usr/local/lib/python3.12/dist-packages/setuptools +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/setuptools-${SETUPTOOLS_VER}.dist-info /usr/local/lib/python3.12/dist-packages/setuptools-${SETUPTOOLS_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/functorch /usr/local/lib/python3.12/dist-packages/functorch +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info /usr/local/lib/python3.12/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/triton /usr/local/lib/python3.12/dist-packages/triton +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/jinja2 /usr/local/lib/python3.12/dist-packages/jinja2 +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/jinja2-${JINJA2_VER}.dist-info /usr/local/lib/python3.12/dist-packages/jinja2-${JINJA2_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/networkx /usr/local/lib/python3.12/dist-packages/networkx +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/networkx-${NETWORKX_VER}.dist-info /usr/local/lib/python3.12/dist-packages/networkx-${NETWORKX_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/sympy /usr/local/lib/python3.12/dist-packages/sympy +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/sympy-${SYMPY_VER}.dist-info /usr/local/lib/python3.12/dist-packages/sympy-${SYMPY_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/packaging /usr/local/lib/python3.12/dist-packages/packaging +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/packaging-${PACKAGING_VER}.dist-info /usr/local/lib/python3.12/dist-packages/packaging-${PACKAGING_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn /usr/local/lib/python3.12/dist-packages/flash_attn +COPY 
--from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info /usr/local/lib/python3.12/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn_2_cuda.cpython-312-*-linux-gnu.so /usr/local/lib/python3.12/dist-packages/ + +# Might not need to copy cusparseLt in the future once it's included in DLFW cuda container +COPY --from=pytorch_image /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/ + +ARG NVRTC_VER +ARG NCCL_VER +ENV CUDA_VER=$CUDA_VERSION \ + NVRTC_VER="${NVRTC_VER}" \ + NCCL_VER="${NCCL_VER}" + +# Install the necessary dependencies and remove previous TRT installation in the +# final image +ENV PIP_BREAK_SYSTEM_PACKAGES=1 +RUN apt-get update -q=2 \ + && apt-get install -y --no-install-recommends \ + python3-dev \ + python3-pip \ + git-lfs \ + perl \ + # Remove previous TRT installation + && apt-get purge -y "libnvinfer*" \ + && pip3 uninstall -y tensorrt \ + && rm -rf /var/lib/apt/lists/* \ + && pip3 install --no-cache-dir polygraphy==0.49.9 mpi4py==3.1.5 + +# Install NVRTC +RUN [ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" \ + && curl -o /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$arch/cuda-keyring_1.1-1_all.deb \ + && apt install /tmp/cuda-keyring.deb \ + && rm /tmp/cuda-keyring.deb \ + && apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev* \ + && CUDA_VER_SHORT=${CUDA_VER: 0:4} \ + && NVRTC_CUDA_VERSION=${CUDA_VER_SHORT/./-} \ + && apt-get update -qq \ + && apt-get install -y --no-install-recommends \ + cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER} \ + libnccl2=${NCCL_VER} \ + libnccl-dev=${NCCL_VER} \ + && rm -rf /var/lib/apt/lists/* + +# Install TRT +COPY --from=install_dependencies /usr/local/tensorrt /usr/local/tensorrt +RUN pip3 install /usr/local/tensorrt/python/tensorrt-*-cp$( python3 -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))" )* -WORKDIR /tmp -COPY --from=tensorrt_llm_build /workspace/tensorrt_llm/build/tensorrt_llm*whl . 
+# Set environment variables +ARG TRT_VER +ENV TRT_VERSION=$TRT_VER +ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH} +ENV TRT_ROOT=/usr/local/tensorrt -RUN pip3 install --no-cache-dir --extra-index-url https://pypi.nvidia.com tensorrt_llm*.whl +# Install TRT-LLM wheel after all the dependencies are installed +ARG TENSORRTLLM_VER +RUN --mount=type=secret,id=pypi_extra_values,env=PYPI_EXTRA_VALUES \ + pip install --no-cache-dir ${PYPI_EXTRA_VALUES} tensorrt_llm==${TENSORRTLLM_VER} + +# Copying the Tensorrt LLM scripts and applications +WORKDIR /app +COPY --from=tensorrt_llm_code /workspace/tensorrt_llm/triton_backend/scripts scripts +COPY --from=tensorrt_llm_code /workspace/tensorrt_llm/triton_backend/all_models all_models +COPY --from=tensorrt_llm_code /workspace/tensorrt_llm/triton_backend/inflight_batcher_llm/client client +COPY --from=tensorrt_llm_code /workspace/tensorrt_llm/triton_backend/tools tools +COPY --from=tensorrt_llm_code /workspace/tensorrt_llm/examples examples diff --git a/dockerfile/Dockerfile.trt_llm_backend b/dockerfile/Dockerfile.trt_llm_backend deleted file mode 100644 index de520c6d..00000000 --- a/dockerfile/Dockerfile.trt_llm_backend +++ /dev/null @@ -1,67 +0,0 @@ -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver -ARG BASE_TAG=24.05-py3 - -FROM ${BASE_IMAGE}:${BASE_TAG} as base - -RUN apt-get update && apt-get install -y --no-install-recommends rapidjson-dev python-is-python3 ccache git-lfs - -COPY requirements.txt /tmp/ -RUN pip3 install -r /tmp/requirements.txt --extra-index-url https://pypi.ngc.nvidia.com - -# Remove previous TRT installation -# We didn't remove libnvinfer* here because tritonserver depends on the pre-installed libraries. -RUN apt-get remove --purge -y tensorrt* -RUN pip uninstall -y tensorrt - -FROM base as dev - -# Download & install internal TRT release -COPY tensorrt_llm/docker/common/install_tensorrt.sh /tmp/ -RUN bash /tmp/install_tensorrt.sh && rm /tmp/install_tensorrt.sh -ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH} -ENV TRT_ROOT=/usr/local/tensorrt - -# Install latest Polygraphy -COPY tensorrt_llm/docker/common/install_polygraphy.sh /tmp/ -RUN bash /tmp/install_polygraphy.sh && rm /tmp/install_polygraphy.sh - -# CMake -COPY tensorrt_llm/docker/common/install_cmake.sh /tmp/ -RUN bash /tmp/install_cmake.sh && rm /tmp/install_cmake.sh -ENV PATH="/usr/local/cmake/bin:${PATH}" - -# Install mpi4py -COPY tensorrt_llm/docker/common/install_mpi4py.sh /tmp/ -RUN bash /tmp/install_mpi4py.sh && rm /tmp/install_mpi4py.sh - -# Use "pypi" (default) for x86_64 arch and "src_non_cxx11_abi" for aarch64 arch -ARG TORCH_INSTALL_TYPE="pypi" -COPY tensorrt_llm/docker/common/install_pytorch.sh install_pytorch.sh -RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh - -FROM dev as trt_llm_builder - -WORKDIR /app -COPY scripts scripts -COPY tensorrt_llm tensorrt_llm -RUN cd tensorrt_llm && python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" -i -c && cd .. - -FROM trt_llm_builder as trt_llm_backend_builder - -WORKDIR /app/ -COPY inflight_batcher_llm inflight_batcher_llm -RUN cd inflight_batcher_llm && bash scripts/build.sh && cd .. 
- -FROM trt_llm_backend_builder as final - -# Install TensorRT-LLM -WORKDIR /app/ -COPY --from=trt_llm_builder /app/tensorrt_llm/build /app/tensorrt_llm/build -RUN cd /app/tensorrt_llm/build && pip3 install *.whl - -# Install TensorRT-LLM backend -RUN mkdir /opt/tritonserver/backends/tensorrtllm -ENV LD_LIBRARY_PATH=/opt/tritonserver/backends/tensorrtllm:${LD_LIBRARY_PATH} -COPY --from=trt_llm_backend_builder /app/inflight_batcher_llm/build/libtriton_tensorrtllm.so /opt/tritonserver/backends/tensorrtllm -COPY --from=trt_llm_backend_builder /app/inflight_batcher_llm/build/libtriton_tensorrtllm_common.so /opt/tritonserver/backends/tensorrtllm -COPY --from=trt_llm_backend_builder /app/inflight_batcher_llm/build/trtllmExecutorWorker /opt/tritonserver/backends/tensorrtllm diff --git a/docs/baichuan.md b/docs/baichuan.md index fef7f5eb..3d383330 100644 --- a/docs/baichuan.md +++ b/docs/baichuan.md @@ -12,7 +12,7 @@ python build.py --model_dir ${HF_BAICHUAN_MODEL} \ --enable_context_fmha \ --use_gemm_plugin float16 \ --output_dir /tmp/baichuan/13B/trt_engines/fp16/1-gpu/ \ - --paged_kv_cache \ + --kv_cache_type paged \ --max_batch_size 64 [11/29/2023-08:20:34] [TRT] [I] Total Host Persistent Memory: 77008 @@ -38,20 +38,20 @@ python build.py --model_dir ${HF_BAICHUAN_MODEL} \ * Prepare configs ```bash -cp all_models/inflight_batcher_llm/ baichuan_ifb -r +cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ baichuan_ifb -r -python3 tools/fill_template.py -i baichuan_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 -python3 tools/fill_template.py -i baichuan_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 -python3 tools/fill_template.py -i baichuan_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False -python3 tools/fill_template.py -i baichuan_ifb/ensemble/config.pbtxt triton_max_batch_size:64 -python3 tools/fill_template.py -i baichuan_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:/tmp/baichuan/13B/trt_engines/fp16/1-gpu/,max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0 -``` +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/tensorrt_llm/config.pbtxt 
triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:/tmp/baichuan/13B/trt_engines/fp16/1-gpu/,max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 +```` * Launch server ```bash pip install SentencePiece -python3 scripts/launch_triton_server.py --world_size 1 --model_repo=baichuan_ifb/ +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=baichuan_ifb/ ``` this setting requires about 35GB @@ -145,7 +145,7 @@ curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What * Send request by `inflight_batcher_llm_client.py` (Remember to add `trust_remote_code=True` in tokenizer of `inflight_batcher_llm_client.py`) ```bash -python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_BAICHUAN_MODEL} +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_BAICHUAN_MODEL} ========= Input sequence: [16448, 677, 5611, 31136, 21309, 4746, 31125, 694, 1033, 653, 8808, 754, 650] @@ -160,7 +160,7 @@ Output sequence: [16814, 677, 5621, 1412, 4514, 678, 2835, 677, 31106, 53, 60, * Run test on dataset ``` -python3 tools/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_trtllm/simple_data.json --max-input-len 500 +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_trtllm/simple_data.json --max-input-len 500 [INFO] Start testing on 13 prompts. [INFO] Functionality test succeed. 
@@ -172,19 +172,19 @@ python3 tools/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_tr * Run with decoupled mode (streaming) ```bash -cp all_models/inflight_batcher_llm/ baichuan_ifb -r +cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ baichuan_ifb -r -python3 tools/fill_template.py -i baichuan_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 -python3 tools/fill_template.py -i baichuan_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 -python3 tools/fill_template.py -i baichuan_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:True -python3 tools/fill_template.py -i baichuan_ifb/ensemble/config.pbtxt triton_max_batch_size:64 -python3 tools/fill_template.py -i baichuan_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:/tmp/baichuan/13B/trt_engines/fp16/1-gpu/,max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:True,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:/tmp/baichuan/13B/trt_engines/fp16/1-gpu/,max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 pip install SentencePiece # please add `trust_remote_code=True` in tokenizer of preprocessing and postprocessing. Considering the security, we don't add it by default. -python3 scripts/launch_triton_server.py --world_size 1 --model_repo=baichuan_ifb/ +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=baichuan_ifb/ -python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_BAICHUAN_MODEL} --streaming +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_BAICHUAN_MODEL} --streaming ```
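+
+As a minimal sketch, the same decoupled deployment can also be exercised over HTTP with Triton's `generate_stream` endpoint (assuming the default ensemble inputs used in this document; the prompt and `max_tokens` below are arbitrary):
+
+```bash
+curl -X POST localhost:8000/v2/models/ensemble/generate_stream \
+    -d '{"text_input": "What is machine learning?", "max_tokens": 64, "bad_words": "", "stop_words": "", "stream": true}'
+```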
diff --git a/docs/build.md b/docs/build.md
new file mode 100644
index 00000000..48d13fb4
--- /dev/null
+++ b/docs/build.md
@@ -0,0 +1,33 @@
+# Building from Source
+
+This document describes how to build the TensorRT-LLM backend and the Triton
+TRT-LLM container from source. The Triton container includes TensorRT-LLM,
+along with the TensorRT-LLM backend and the Python backend.
+
+## Build the TensorRT-LLM Backend from source
+
+Make sure TensorRT-LLM is installed before building the backend. Since the
+versions of TensorRT-LLM and the TensorRT-LLM backend have to be aligned, it is
+recommended to directly use the Triton TRT-LLM container from NGC or build the
+whole container from source as described below in the Build the Docker Container
+section.
+
+```bash
+cd tensorrt_llm/triton_backend/inflight_batcher_llm
+bash scripts/build.sh
+```
+
+## Build the Docker Container
+
+> [!CAUTION]
+> [build.sh](../build.sh) is currently not working and will be fixed in the next weekly update.
+
+#### Build via Docker
+
+You can build the container using the instructions in the [TensorRT-LLM Docker Build](../tensorrt_llm/docker/README.md)
+with the `tritonrelease` stage. Please make sure to add the `CUDA_ARCHS` flag for your GPU; for example, if the compute capability of your GPU is 89:
+
+```bash
+cd tensorrt_llm/
+make -C docker tritonrelease_build CUDA_ARCHS='89-real'
+```
diff --git a/docs/encoder_decoder.md b/docs/encoder_decoder.md
new file mode 100755
index 00000000..40b89cae
--- /dev/null
+++ b/docs/encoder_decoder.md
@@ -0,0 +1,402 @@
+# End to end workflow to run an Encoder-Decoder model
+
+### Support Matrix
+For the specific models supported by the encoder-decoder family, please visit [TensorRT-LLM encoder-decoder examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/enc_dec#encoder-decoder-model-support). The following two model types are supported:
+* T5
+* BART
+
+## Run Encoder-Decoder with Tritonserver
+### Tritonserver setup steps
+
+#### 1. Make sure that you have initialized the TRT-LLM submodule:
+
+```
+ git clone https://github.com/triton-inference-server/tensorrtllm_backend.git && cd tensorrtllm_backend
+ git lfs install
+ git submodule update --init --recursive
+```
+
+#### 2. Start the Triton Server Docker container within the `tensorrtllm_backend` repo:
+
+If you're using the [Triton TRT-LLM NGC container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver/tags):
+
+```
+ # Replace with the version of Triton you want to use. Here using 24.08.
+ # The commands below assume that the current directory is the
+ # TRT-LLM backend root git repository.
+
+ docker run --gpus all --ipc=host --ulimit memlock=-1 --shm-size=20g -v `pwd`:/workspace -w /workspace nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3 bash
+```
+
+If [building your own TensorRT-LLM Backend container](https://github.com/triton-inference-server/tensorrtllm_backend#option-2-build-via-docker), then you can run the `tensorrtllm_backend` container:
+
+```
+ docker run --gpus all --ipc=host --ulimit memlock=-1 --shm-size=20g -v `pwd`:/workspace -w /workspace triton_trt_llm bash
+```
+
+#### 3. Build the engines:
+
+Clone the target model repository from HuggingFace. Here we use the [T5-small model](https://huggingface.co/google-t5/t5-small) as an example, but you can also follow the same steps for a BART model.
+
+```
+ git lfs install
+ git clone https://huggingface.co/google-t5/t5-small /workspace/hf_models/t5-small
+```
+
+Build TensorRT-LLM engines.
+ +``` + export MODEL_NAME=t5-small # or bart-base + export MODEL_TYPE=t5 # or bart + export HF_MODEL_PATH=/workspace/hf_models/${MODEL_NAME} + export UNIFIED_CKPT_PATH=/workspace/ckpt/${MODEL_NAME} + export ENGINE_PATH=/workspace/engines/${MODEL_NAME} + export INFERENCE_PRECISION=float16 + export TP_SIZE=1 + export MAX_BEAM_WIDTH=1 + export MAX_BATCH_SIZE=8 + export INPUT_LEN=1024 + export OUTPUT_LEN=201 + + python3 tensorrt_llm/examples/models/core/enc_dec/convert_checkpoint.py \ + --model_type ${MODEL_TYPE} \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype ${INFERENCE_PRECISION} \ + --tp_size ${TP_SIZE} + + trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH}/encoder \ + --output_dir ${ENGINE_PATH}/encoder \ + --kv_cache_type disabled \ + --moe_plugin disable \ + --max_beam_width ${MAX_BEAM_WIDTH} \ + --max_input_len ${INPUT_LEN} \ + --max_batch_size ${MAX_BATCH_SIZE} \ + --gemm_plugin ${INFERENCE_PRECISION} \ + --bert_attention_plugin ${INFERENCE_PRECISION} \ + --gpt_attention_plugin ${INFERENCE_PRECISION} \ + --context_fmha disable # remove for BART + + trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH}/decoder \ + --output_dir ${ENGINE_PATH}/decoder \ + --moe_plugin disable \ + --max_beam_width ${MAX_BEAM_WIDTH} \ + --max_batch_size ${MAX_BATCH_SIZE} \ + --gemm_plugin ${INFERENCE_PRECISION} \ + --bert_attention_plugin ${INFERENCE_PRECISION} \ + --gpt_attention_plugin ${INFERENCE_PRECISION} \ + --max_input_len 1 \ + --max_encoder_input_len ${INPUT_LEN} \ + --max_seq_len ${OUTPUT_LEN} \ + --context_fmha disable # remove for BART +``` + +> **NOTE** +> +> If you want to build multi-GPU engine using Tensor Parallelism then you can set `--tp_size` in convert_checkpoint.py. For example, for TP=2 on 2-GPU you can set `--tp_size=2`. If you want to use beam search then set `--max_beam_width` to higher value than 1. The `--max_input_len` in encoder trtllm-build controls the model input length and should be same as `--max_encoder_input_len` in decoder trtllm-build. Additionally, to control the model output len you should set `--max_seq_len` in decoder trtllm-build to `desired output length + 1`. It is also advisable to tune [`--max_num_tokens`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-best-practices.md#max_num_tokens) as the default value of 8192 might be too large or too small depending on your input, output len and use-cases. For BART family models, make sure to remove `--context_fmha disable` from both encoder and decoder trtllm-build commands. Please refer to [TensorRT-LLM enc-dec example](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/enc_dec#build-tensorrt-engines) for more details. + +#### 4. 
Prepare Tritonserver configs + +``` + cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ enc_dec_ifb -r + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:False,max_beam_width:${MAX_BEAM_WIDTH},engine_dir:${ENGINE_PATH}/decoder,encoder_engine_dir:${ENGINE_PATH}/encoder,kv_cache_free_gpu_mem_fraction:0.8,cross_kv_cache_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,enable_chunked_context:False,max_queue_size:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},preprocessing_instance_count:1 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},postprocessing_instance_count:1 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/ensemble/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 + +``` + +> **NOTE** +> +> Currently, encoder-decoder models don't support running with chunked context. + +#### 5. Launch Tritonserver + +``` +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=enc_dec_ifb/ +``` + +### Send requests +#### 1. Send request with CURL + +``` +curl -X POST localhost:8000/v2/models/ensemble/generate -d "{\"text_input\": \"Summarize the following news article: (CNN)Following last year's successful U.K. tour, Prince and 3rdEyeGirl are bringing the Hit & Run Tour to the U.S. for the first time. The first -- and so far only -- scheduled show will take place in Louisville, Kentucky, the hometown of 3rdEyeGirl drummer Hannah Welton. Slated for March 14, tickets will go on sale Monday, March 9 at 10 a.m. local time. Prince crowns dual rock charts . A venue has yet to be announced. When the Hit & Run worked its way through the U.K. in 2014, concert venues were revealed via Twitter prior to each show. Portions of the ticket sales will be donated to various Louisville charities. See the original story at Billboard.com. ©2015 Billboard. All Rights Reserved.\", \"max_tokens\": 1024, \"bad_words\": \"\", \"stop_words\": \"\"}" + + {"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":0.0,"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":":::: (CNN): (CNN): (CNN) the Hit & Run Tour to the U.S. for the first time. the Hit & Run Tour will take place in Louisville, Kentucky, the hometown of 3rdEyeGirl drummer Hannah Welton. Tickets will go on sale Monday, March 9 at 10 a.m. local time."} +``` + +#### 2. Send request with `bad_words` and `stop_words` + +After applying the `stop_words` and `bad_words`, the output avoids the bad words and stops at the first generated stop word. 
+ +``` +curl -X POST localhost:8000/v2/models/ensemble/generate -d "{\"text_input\": \"Summarize the following news article: (CNN)Following last year's successful U.K. tour, Prince and 3rdEyeGirl are bringing the Hit & Run Tour to the U.S. for the first time. The first -- and so far only -- scheduled show will take place in Louisville, Kentucky, the hometown of 3rdEyeGirl drummer Hannah Welton. Slated for March 14, tickets will go on sale Monday, March 9 at 10 a.m. local time. Prince crowns dual rock charts . A venue has yet to be announced. When the Hit & Run worked its way through the U.K. in 2014, concert venues were revealed via Twitter prior to each show. Portions of the ticket sales will be donated to various Louisville charities. See the original story at Billboard.com. ©2015 Billboard. All Rights Reserved.\", \"max_tokens\": 1024, \"bad_words\": [\"drummer\", \"hometown\"], \"stop_words\": [\"Tickets\", \"sale\"]}" + + {"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":0.0,"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":":::: (CNN): (CNN): (CNN) the Hit & Run Tour to the U.S. for the first time. the Hit & Run Tour will take place in Louisville, Kentucky, the home of 3rdEyeGirl's Hannah Welton."} +``` + +#### 3. Send request by `inflight_batcher_llm_client.py` +If not already installed, install `tritonclient` + +``` + pip install tritonclient[all] + python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py --text "translate English to German: This is good" --request-output-len 200 --exclude-input-in-output --tokenizer-dir ${HF_MODEL_PATH} --beam-width ${MAX_BEAM_WIDTH} + + ======== + Using pad_id: 0 + Using end_id: 1 + Input sequence: [13959, 1566, 12, 2968, 10, 100, 19, 207, 1] + [TensorRT-LLM][WARNING] decoder_input_ids is not present in the request for encoder-decoder model. The decoder input tokens will be set to [padId] + Got completed request + Input: translate English to German: This is good + Output beam 0: Das is gut. + Output sequence: [644, 229, 1806, 5] +``` + +> **NOTE** +> +> Please ignore any exception thrown with the output. It's a known issue to be fixed. + +#### 4. Run test on dataset + +``` + python3 tensorrt_llm/triton_backend/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 + + [INFO] Start testing on 13 prompts. + [INFO] Functionality test succeed. + [INFO] Warm up for benchmarking. + [INFO] Start benchmarking on 13 prompts. + [INFO] Total Latency: 155.756 ms +``` + +#### 5. Run several requests at the same time + +``` +echo "{\"text_input\": \"Summarize the following news article: (CNN)Following last year's successful U.K. tour, Prince and 3rdEyeGirl are bringing the Hit & Run Tour to the U.S. for the first time. The first -- and so far only -- scheduled show will take place in Louisville, Kentucky, the hometown of 3rdEyeGirl drummer Hannah Welton. Slated for March 14, tickets will go on sale Monday, March 9 at 10 a.m. local time. Prince crowns dual rock charts . A venue has yet to be announced. When the Hit & Run worked its way through the U.K. in 2014, concert venues were revealed via Twitter prior to each show. Portions of the ticket sales will be donated to various Louisville charities. See the original story at Billboard.com. ©2015 Billboard. 
All Rights Reserved.\", \"max_tokens\": 1024, \"bad_words\": [\"drummer\", \"hometown\"], \"stop_words\": [\"Tickets\", \"sale\"]}" > tmp.txt + +printf '%s\n' {1..20} | xargs -I % -P 20 curl -X POST localhost:8000/v2/models/ensemble/generate -d @tmp.txt +``` +#### 6. Evaluating performance with Gen-AI Perf + +Gen-AI Perf is a command line tool for measuring the throughput and latency of generative AI models as served through an inference server. You can read more about installing Gen-AI Perf [here](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/genai-perf/README.html#installation). + +To use Gen-AI Perf, run the following command: + +``` +genai-perf profile \ + -m ensemble \ + --service-kind triton \ + --backend tensorrtllm \ + --num-prompts 100 \ + --random-seed 123 \ + --synthetic-input-tokens-mean 200 \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean 100 \ + --output-tokens-stddev 0 \ + --tokenizer ${HF_MODEL_PATH} \ + --concurrency 1 \ + --measurement-interval 4000 \ + --profile-export-file my_profile_export.json \ + --url localhost:8001 +``` + +You should expect an output that looks like this (the output below was obtained on A100-80GB with TRT-LLM v0.12): + +``` LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━┩ +│ Request latency (ms) │ 80.92 │ 78.84 │ 323.55 │ 85.14 │ 79.90 │ 79.64 │ +│ Output sequence length │ 95.83 │ 65.00 │ 100.00 │ 100.00 │ 99.00 │ 98.00 │ +│ Input sequence length │ 200.01 │ 200.00 │ 201.00 │ 200.00 │ 200.00 │ 200.00 │ +└────────────────────────┴────────┴────────┴────────┴────────┴────────┴────────┘ +Output token throughput (per sec): 1182.70 +Request throughput (per sec): 12.34 +``` + +#### 7. Run with decoupled mode (streaming) + +To enable streaming, we set `decoupled_mode:True` in config.pbtxt of `tensorrt_llm` and `tensorrt_llm_bls` model (if you are using BLS instead of ensemble). 
+ +``` + cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ enc_dec_ifb -r + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:True,max_beam_width:${MAX_BEAM_WIDTH},engine_dir:${ENGINE_PATH}/decoder,encoder_engine_dir:${ENGINE_PATH}/encoder,kv_cache_free_gpu_mem_fraction:0.8,cross_kv_cache_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,enable_chunked_context:False,max_queue_size:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},preprocessing_instance_count:1 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},postprocessing_instance_count:1 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/ensemble/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:True,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 + +``` + +We launch Tritonserver + +``` +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=enc_dec_ifb/ +``` + +Then send request by `inflight_batcher_llm_client.py` + +``` +pip install tritonclient[all] +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py --text "translate English to German: This is good" --request-output-len 200 --exclude-input-in-output --tokenizer-dir ${HF_MODEL_PATH} --beam-width ${MAX_BEAM_WIDTH} --streaming +``` + +To use Gen-AI Perf to benchmark streaming/decoupled mode, run the following command: + +``` +genai-perf profile \ + -m ensemble \ + --service-kind triton \ + --backend tensorrtllm \ + --num-prompts 100 \ + --random-seed 123 \ + --synthetic-input-tokens-mean 200 \ + --synthetic-input-tokens-stddev 0 \ + --streaming \ + --output-tokens-mean 100 \ + --output-tokens-stddev 0 \ + --tokenizer ${HF_MODEL_PATH} \ + --concurrency 1 \ + --measurement-interval 4000 \ + --profile-export-file my_profile_export.json \ + --url localhost:8001 +``` + +You should see output like this (the output below was obtained on A100-80GB with TRT-LLM v0.12) + +``` + LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━┩ +│ Time to first token (ms) │ 4.69 │ 3.99 │ 14.05 │ 5.70 │ 5.04 │ 4.76 │ +│ Inter token latency (ms) │ 0.63 │ 0.38 │ 1.04 │ 0.98 │ 0.70 │ 0.66 │ +│ Request latency (ms) │ 75.32 │ 46.34 │ 114.27 │ 90.35 │ 79.27 │ 79.11 │ +│ Output sequence length │ 116.50 │ 58.00 │ 197.00 │ 197.00 │ 132.00 │ 128.00 │ +│ Input sequence length │ 200.01 │ 200.00 │ 201.00 │ 200.10 │ 200.00 │ 200.00 │ +└──────────────────────────┴────────┴────────┴────────┴────────┴────────┴────────┘ +Output token throughput (per sec): 1542.81 +Request throughput (per sec): 13.24 +``` + +## Running multiple instances of encoder-decoder model on multiple GPUs + 
+In this section, we demonstrate how you can use
+[Leader Mode](../README.md#leader-mode) for running multiple instances of an encoder-decoder model on different GPUs.
+
+For this section, let's assume that we have four GPUs and the CUDA device ids
+are 0, 1, 2, and 3. We will be launching two instances of the T5-small model
+with tensor parallelism 2 (TP=2). The first instance will run on GPUs 0 and 1
+and the second instance will run on GPUs 2 and 3. We will launch two separate `mpirun` commands to launch two separate Triton servers, one for each model instance (4 Triton Server processes in total, one per GPU). We also need to use a reverse proxy in front of them to load balance the requests between the servers.
+
+[Orchestrator Mode](../README.md#orchestrator-mode) is currently not supported.
+
+
+### Triton setup steps
+1. Build the model, but add `--tp_size 2` when converting checkpoints. The rest of the steps are the same as [Tritonserver Setup
+](#tritonserver-setup-steps).
+
+```
+ export MODEL_NAME=t5-small
+ export MODEL_TYPE=t5 # or bart
+ export HF_MODEL_PATH=/workspace/hf_models/${MODEL_NAME}
+ export UNIFIED_CKPT_PATH=/workspace/ckpt/${MODEL_NAME}-2tp-2gpu
+ export ENGINE_PATH=/workspace/engines/${MODEL_NAME}-2tp-2gpu
+
+ python tensorrt_llm/examples/models/core/enc_dec/convert_checkpoint.py \
+ --model_type ${MODEL_TYPE} \
+ --model_dir ${HF_MODEL_PATH} \
+ --output_dir ${UNIFIED_CKPT_PATH} \
+ --dtype float16 \
+ --tp_size 2
+
+ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH}/encoder \
+ --output_dir ${ENGINE_PATH}/encoder \
+ --kv_cache_type disabled \
+ --moe_plugin disable \
+ --max_batch_size 64 \
+ --gemm_plugin float16 \
+ --bert_attention_plugin float16 \
+ --gpt_attention_plugin float16 \
+ --max_input_len 2048 \
+ --context_fmha disable
+
+ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH}/decoder \
+ --output_dir ${ENGINE_PATH}/decoder \
+ --moe_plugin disable \
+ --max_batch_size 64 \
+ --gemm_plugin float16 \
+ --bert_attention_plugin float16 \
+ --gpt_attention_plugin float16 \
+ --context_fmha disable \
+ --max_input_len 1 \
+ --max_encoder_input_len 2048
+```
+
+2. Setup Tritonserver config with the same commands in [step 4](#prepare-tritonserver-configs) above.
+
+3. Launch the servers:
+
+```
+ CUDA_VISIBLE_DEVICES=0,1 python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=enc_dec_ifb/ --http_port 8000 --grpc_port 8001 --metrics_port 8004
+ CUDA_VISIBLE_DEVICES=2,3 python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=enc_dec_ifb/ --http_port 8002 --grpc_port 8003 --metrics_port 8005
+```
+
+4. Install NGINX:
+
+```
+ apt update
+ apt install nginx -y
+```
+
+5. Setup the NGINX configuration and store it in `/etc/nginx/sites-available/tritonserver`:
+
+```
+ upstream tritonserver {
+ server localhost:8000;
+ server localhost:8002;
+ }
+
+ server {
+ listen 8080;
+
+ location / {
+ proxy_pass http://tritonserver;
+ }
+ }
+```
+
+6. Create a symlink and restart NGINX to enable the configuration:
+
+```
+ ln -s /etc/nginx/sites-available/tritonserver /etc/nginx/sites-enabled/tritonserver
+ service nginx restart
+```
+
+### Send the request
+
+1.
Run test on dataset + +``` + # Test the load on all the servers + python3 tensorrt_llm/triton_backend/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8080 + + # Test the load on one of the servers + python3 tensorrt_llm/triton_backend/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8000 +``` + +### Kill the server +``` +pgrep mpirun | xargs kill +``` diff --git a/docs/gemma.md b/docs/gemma.md index fed782ae..f8959ec6 100644 --- a/docs/gemma.md +++ b/docs/gemma.md @@ -14,20 +14,20 @@ ENGINE_PATH=/tmp/gemma/2B/bf16/1-gpu/ Note that we use `tokenizer_type=sp` (sentencepiece) tokenizer. ```bash -cp all_models/inflight_batcher_llm/ gemma -r +cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ gemma -r -python3 tools/fill_template.py -i gemma/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},tokenizer_type:sp,triton_max_batch_size:64,preprocessing_instance_count:1,add_special_tokens:True -python3 tools/fill_template.py -i gemma/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},tokenizer_type:sp,triton_max_batch_size:64,postprocessing_instance_count:1 -python3 tools/fill_template.py -i gemma/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False -python3 tools/fill_template.py -i gemma/ensemble/config.pbtxt triton_max_batch_size:64 -python3 tools/fill_template.py -i gemma/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,batch_scheduler_policy:guaranteed_no_evict +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},tokenizer_type:sp,triton_max_batch_size:64,preprocessing_instance_count:1,add_special_tokens:True +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},tokenizer_type:sp,triton_max_batch_size:64,postprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,batch_scheduler_policy:guaranteed_no_evict,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 ``` * Launch server ```bash -python3 scripts/launch_triton_server.py --world_size 1 --model_repo=gemma/ +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=gemma/ ``` diff 
--git a/docs/guided_decoding.md b/docs/guided_decoding.md new file mode 100644 index 00000000..90854451 --- /dev/null +++ b/docs/guided_decoding.md @@ -0,0 +1,128 @@ +# End-to-End Workflow for Guided Decoding with TensorRT-LLM Backend + +This document outlines the process for running guided decoding using the TensorRT-LLM backend. Guided decoding ensures that generated outputs adhere to specified formats, such as JSON. Currently, this feature is supported through the [XGrammar](https://github.com/mlc-ai/xgrammar) backend. + +For more information, refer to the [guided decoding documentation](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/executor.md#structured-output-with-guided-decoding) from TensorRT-LLM. Additionally, you can explore another example of [guided decoding + LLM API example](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_guided_decoding.html). + +## Overview of Guided Decoding +Guided decoding ensures that generated outputs conform to specific constraints or formats. Supported guide types include: +- **None**: No constraints. +- **JSON**: Outputs in JSON format. +- **JSON Schema**: JSON format with schema validation. +- **Regex**: Outputs matching a regular expression. +- **EBNF Grammar**: Outputs adhering to extended Backus-Naur form (EBNF) grammar rules. + +# Build TensorRT-LLM engine and launch Tritonserver + +From this point, we assume you installed all requirements for tensorrtllm_backend. You can refer to [build.md](build.md) for installation and docker launch. + +## Build TensorRT-LLM engine +```bash +# Clone model from Hugging Face +export MODEL_NAME=TinyLlama-1.1B-Chat-v1.0 +git clone https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0 hf_models/${MODEL_NAME} + +export HF_MODEL_PATH=hf_models/${MODEL_NAME} +export UNIFIED_CKPT_PATH=trt_ckpts/tiny_llama_1b/1-gpu/fp16 +export ENGINE_PATH=trt_engines/tiny_llama_1b/1-gpu/fp16 + +python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + +trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --remove_input_padding enable \ + --gpt_attention_plugin float16 \ + --context_fmha enable \ + --gemm_plugin float16 \ + --output_dir ${ENGINE_PATH} \ + --kv_cache_type paged \ + --max_batch_size 64 +``` +## Launch Tritonserver + +## Python Backend +```bash +export GUIDED_DECODING_BACKEND=xgrammar +export TRITON_BACKEND=python + +cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ llama_ifb -r + +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:64,preprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:64,postprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt 
triton_backend:${TRITON_BACKEND},triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,tokenizer_dir:${HF_MODEL_PATH},guided_decoding_backend:${GUIDED_DECODING_BACKEND}
+
+python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/
+```
+
+## C++ Backend
+To run with `TRITON_BACKEND=tensorrtllm` (the C++ backend), you need an extra step to extract the tokenizer information into JSON format. `generate_xgrammar_tokenizer_info.py` creates `xgrammar_tokenizer_info.json` under the given `--output_dir`, and we then fill the `xgrammar_tokenizer_info_path` parameter in `tensorrt_llm/config.pbtxt`.
+```bash
+export XGRAMMAR_TOKENIZER_INFO_DIR=tokenizer_info/${MODEL_NAME}
+
+python3 tensorrt_llm/examples/generate_xgrammar_tokenizer_info.py --model_dir ${HF_MODEL_PATH} --output_dir ${XGRAMMAR_TOKENIZER_INFO_DIR}
+
+export XGRAMMAR_TOKENIZER_INFO_PATH=tokenizer_info/${MODEL_NAME}/xgrammar_tokenizer_info.json
+export GUIDED_DECODING_BACKEND=xgrammar
+export TRITON_BACKEND=tensorrtllm
+
+cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ llama_ifb -r
+
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:64,preprocessing_instance_count:1
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:64,postprocessing_instance_count:1
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:${TRITON_BACKEND},triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,guided_decoding_backend:${GUIDED_DECODING_BACKEND},xgrammar_tokenizer_info_path:${XGRAMMAR_TOKENIZER_INFO_PATH}
+
+python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/
+```
+# Sending Guided Decoding Requests
+
+Use the provided gRPC client to send requests with different guide types.
+```bash
+# Set the prompt
+PROMPT="What is the year after 2024? Answer:"
+
+# 0. Guide type: None
+python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble
+
+# Output:
+# 0: 2025
+#
+# Question 3: What is the year after 2025? Answer: 2026
+#
+
+# 1.
Guide type: json +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble --guided-decoding-guide-type json + +# Output: +# 0: [2025] + +# 2. Guide type: json_schema +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble --guided-decoding-guide-type json_schema --guided-decoding-guide '{"properties": {"answer": {"title": "Answer", "type": "integer"}}, "required": ["answer"], "title": "Answer", "type": "object"}' + +# Output: +# 0: {"answer": 2026} + +# 3. Guide type: regex +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble --guided-decoding-guide-type regex --guided-decoding-guide '\d+' + +# Output: +# 0: 2025 + +# 4. Guide type: ebnf_grammar +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble --guided-decoding-guide-type ebnf_grammar --guided-decoding-guide 'root ::= [0-9]+' + +# Output: +# 0: 2025 +``` + +Use curl method to send requests +```bash +curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is the year after 2024? Answer:", "max_tokens": 20, "bad_words": "", "stop_words": "", "pad_id": 2, "end_id": 2, "guided_decoding_guide_type":"json"}' + +# Output: +# {"model_name":"ensemble","model_version":"1","sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"[2025]"} +``` diff --git a/docs/llama.md b/docs/llama.md index d1ae08ed..c67c31b3 100644 --- a/docs/llama.md +++ b/docs/llama.md @@ -36,7 +36,7 @@ docker run --rm -ti -v `pwd`:/mnt -w /mnt -v ~/.cache/huggingface:~/.cache/huggi export HF_LLAMA_MODEL=`python3 -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('meta-llama/Llama-2-7b-hf', filename='config.json')).parent)"` export UNIFIED_CKPT_PATH=/tmp/ckpt/llama/7b/ export ENGINE_PATH=/tmp/engines/llama/7b/ -python tensorrt_llm/examples/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \ +python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \ --output_dir ${UNIFIED_CKPT_PATH} \ --dtype float16 @@ -46,27 +46,27 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ --context_fmha enable \ --gemm_plugin float16 \ --output_dir ${ENGINE_PATH} \ - --paged_kv_cache enable \ + --kv_cache_type paged \ --max_batch_size 64 ``` * Prepare configs ```bash -cp all_models/inflight_batcher_llm/ llama_ifb -r +cp tensorrt_llm/triton_backend/ci/all_models/inflight_batcher_llm/ llama_ifb -r -python3 tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 -python3 tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 -python3 tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False -python3 tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64 -python3 tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt 
triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 ``` * Launch server ```bash pip install SentencePiece -python3 scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/ +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/ ``` this setting requires about 25GB @@ -114,7 +114,7 @@ curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What * Send request by `inflight_batcher_llm_client.py` ```bash -python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_LLAMA_MODEL} +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_LLAMA_MODEL} ========= [[1, 19298, 297, 6641, 29899, 23027, 3444, 29892, 1105, 7598, 16370, 408, 263]] @@ -128,7 +128,7 @@ output_ids = [14547, 297, 3681, 322, 4517, 1434, 8401, 304, 1570, 3088, 297, 29 * Run test on dataset ``` -python3 tools/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_trtllm/simple_data.json --max-input-len 500 +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 [INFO] Start testing on 13 prompts. [INFO] Functionality test succeed. 
@@ -142,18 +142,18 @@ python3 tools/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_tr
 
 * Run with decoupled mode (streaming)
 
 ```bash
-cp all_models/inflight_batcher_llm/ llama_ifb -r
+cp tensorrt_llm/triton_backend/ci/all_models/inflight_batcher_llm/ llama_ifb -r
 
-python3 tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1
-python3 tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
-python3 tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:True
-python3 tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64
-python3 tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_batching,max_queue_delay_microseconds:0
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:True,logits_datatype:TYPE_FP32
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
 
 pip install SentencePiece
-python3 scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/
+python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/
 
-python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_LLAMA_MODEL} --streaming
+python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_LLAMA_MODEL} --streaming
 ```
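+
+Before benchmarking the decoupled setup, it can help to confirm that the server and all composing models came up correctly. This is a small sketch using Triton's standard HTTP health and model-repository endpoints (ports as configured above):
+
+```bash
+# Wait until the server reports ready, then list the loaded models
+curl -sf localhost:8000/v2/health/ready && echo "server ready"
+curl -s -X POST localhost:8000/v2/repository/index | python3 -m json.tool
+```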
diff --git a/docs/llama_multi_instance.md b/docs/llama_multi_instance.md index f7dbd3b9..5dce2a91 100644 --- a/docs/llama_multi_instance.md +++ b/docs/llama_multi_instance.md @@ -74,7 +74,7 @@ same GPU. 4. Run the test client to measure performance: ```bash -python3 tools/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_trtllm/simple_data.json --max-input-len 500 +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 ``` If you plan to use the BLS version instead of the ensemble model, you might also @@ -111,7 +111,7 @@ export UNIFIED_CKPT_PATH=/tmp/ckpt/llama/7b-2tp-2gpu/ export ENGINE_PATH=/tmp/engines/llama/7b-2tp-2gpu/ # Create the checkpoint -python tensorrt_llm/examples/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \ +python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \ --output_dir ${UNIFIED_CKPT_PATH} \ --dtype float16 \ --tp_size 2 @@ -123,7 +123,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ --context_fmha enable \ --gemm_plugin float16 \ --output_dir ${ENGINE_PATH} \ - --paged_kv_cache enable \ + --kv_cache_type paged \ --max_batch_size 64 ``` @@ -131,13 +131,13 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ ```bash # Setup the model repository for the first instance. -cp all_models/inflight_batcher_llm/ llama_ifb -r +cp tensorrt_llm/triton_backend/ci/all_models/inflight_batcher_llm/ llama_ifb -r -python3 tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 -python3 tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 -python3 tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False -python3 tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64 -python3 tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt 
triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 ``` ### Leader Mode @@ -150,8 +150,8 @@ between the servers. 3a. Launch the servers: ```bash -CUDA_VISIBLE_DEVICES=0,1 python3 scripts/launch_triton_server.py --world_size 2 --model_repo=llama_ifb/ --http_port 8000 --grpc_port 8001 --metrics_port 8004 -CUDA_VISIBLE_DEVICES=2,3 python3 scripts/launch_triton_server.py --world_size 2 --model_repo=llama_ifb/ --http_port 8002 --grpc_port 8003 --metrics_port 8005 +CUDA_VISIBLE_DEVICES=0,1 python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=llama_ifb/ --http_port 8000 --grpc_port 8001 --metrics_port 8004 +CUDA_VISIBLE_DEVICES=2,3 python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=llama_ifb/ --http_port 8002 --grpc_port 8003 --metrics_port 8005 ``` 4a. Install NGINX: @@ -191,10 +191,10 @@ service nginx restart pip3 install tritonclient[all] # Test the load on all the servers -python3 tools/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8080 +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8080 # Test the load on one of the servers -python3 tools/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8000 +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8000 ``` 8a. Kill the server: @@ -205,10 +205,14 @@ pgrep mpirun | xargs kill ### Orchestrator Mode -In this mode, we will create a copy of the TensorRT-LLM model and use the -`gpu_device_ids` field to specify which GPU should be used by each model -instance. Then, we need to modify the client to distribute the requests between -different models. +With orchestrator mode, there are two options for running multiple instances +of a single model: + +1. Creating separate Triton models + +2. Starting from the 24.08 release, you can use Triton `instance_group` field to specify the number TRT-LLM model instances. With that option, the load balancing decision will be done in Triton core. + +#### 1. Creating Separate Triton Models 3b. Create a copy of the `tensorrt_llm` model: @@ -234,7 +238,22 @@ sed -i 's/name: "tensorrt_llm"/name: "tensorrt_llm_2"/g' llama_ifb/tensorrt_llm_ 5b. Launch the server: ```bash -python3 scripts/launch_triton_server.py --multi-model --model_repo=llama_ifb/ +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --multi-model --model_repo=llama_ifb/ +``` + +Alternatively, you can start all MPI ranks at once and avoid dynamic process spawning +by using the `--disable-spawn-processes`. 
The config file must specify which ranks each +model should use: + +```bash +sed -i 's/\${participant_ids}/1,2/g' llama_ifb/tensorrt_llm/config.pbtxt +sed -i 's/\${participant_ids}/3,4/g' llama_ifb/tensorrt_llm_2/config.pbtxt +``` + +Note that rank 0 is reserved for the orchestrator rank. + +```bash +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --multi-model --model_repo=llama_ifb/ --disable-spawn-processes --world_size=5 ``` 6b. Run the test client to measure performance: @@ -243,11 +262,11 @@ python3 scripts/launch_triton_server.py --multi-model --model_repo=llama_ifb/ pip3 install tritonclient[all] # We will only benchmark the core tensorrtllm models. -python3 tools/inflight_batcher_llm/benchmark_core_model.py --max-input-len 500 \ - dataset --dataset ci/L0_backend_trtllm/simple_data.json \ - --tokenizer-dir $HF_LLAMA_MODEL \ - --tesnorrt-llm-model-name tensorrtllm \ - --tensorrt-llm-model-name tensorrtllm_2 +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/benchmark_core_model.py --max-input-len 500 \ + --tensorrt-llm-model-name tensorrt_llm \ + --tensorrt-llm-model-name tensorrt_llm_2 \ + dataset --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json \ + --tokenizer-dir $HF_LLAMA_MODEL ``` 7b. Kill the server: @@ -256,14 +275,52 @@ python3 tools/inflight_batcher_llm/benchmark_core_model.py --max-input-len 500 \ pgrep mpirun | xargs kill ``` +#### 2. Using Triton Core's Load Balancing + +In order to use Triton core's load balancing for multiple instances, you can +increase the number of instances in the `instance_group` field and use the +`gpu_device_ids` parameter to specify which GPUs will be used by each model +instance. + +For example, if you're running a TP=2 model on a 4-GPU system and you want +to run one instance on GPUs 0 and 1 and the other instance on GPUs 2 and 3, +you can use the following model configuration: + +``` +instance_group [ + {kind: KIND_CPU, count: 2} +] + +parameters: { + key: "gpu_device_ids" + value: { + string_value: "0,1;2,3" + } +} +``` + +Please note that the number of set of GPU device ids must equal the number of instances. + ### Orchestrator Mode vs Leader Mode Summary The table below summarizes the differences between the orchestrator mode and leader mode: -| | Orchestrator Mode | Leader Mode | -| ----------------------------------| :----------------: | :----------:| -| Multi-node Support | ❌ | ✅ | -| Requires Reverse Proxy | ❌ | ✅ | -| Requires Client Changes | ✅ | ❌ | -| Requires `MPI_Comm_Spawn` Support | ✅ | ❌ | +| | Orchestrator Mode (Separate Models) | Orchestrator Mode (Triton Load Balancing) |Leader Mode | +| ----------------------------------| :----------------: | :----------------: |:----------:| +| Requires Reverse Proxy | ❌ | ❌ | ✅ | +| Requires Client Changes | ✅ | ❌ | ❌ | + +Orchestrator mode by default uses `MPI_Comm_Spawn` to create the child +processes. If `MPI_Comm_Spawn` is used, it is not possible to distribute +the model across multiple nodes. + +It is also possible to use orchestrator mode with MPI processes that have been +pre-spawned. In order to do that, you need to set `--disable-spawn-processes` +when using the [launch_triton_server.py](../scripts/launch_triton_server.py) +script or `export TRTLLM_ORCHESTRATOR_SPAWN_PROCESSES=0`. In this mode, +it is possible to run the server across different nodes in orchestrator mode. 
+ +In order to use the orchestrator mode itself, you need to set the `--multi-model` +flag when using the [launch_triton_server.py](../scripts/launch_triton_server.py) +script or `export TRTLLM_ORCHESTRATOR=1`. diff --git a/docs/llmapi.md b/docs/llmapi.md new file mode 100644 index 00000000..b9ee16da --- /dev/null +++ b/docs/llmapi.md @@ -0,0 +1,109 @@ +## End to end workflow to use the pytorch LLMAPI workflow + +* Start the Triton Server Docker container: + +```bash +# Replace with the version of Triton you want to use. +# The command below assumes the the current directory is the +# TRT-LLM backend root git repository. + +docker run --rm -ti -v `pwd`:/mnt -w /mnt -v ~/.cache/huggingface:~/.cache/huggingface --gpus all nvcr.io/nvidia/tritonserver:\-trtllm-python-py3 bash +``` + +* Prepare config + +```bash + cp -R tensorrt_llm/triton_backend/all_models/llmapi/ llmapi_repo/ +``` + +Edit `llmapi_repo/tensorrt_llm/1/model.yaml` to change the model. You can either use a HuggingFace path or a local path. The following is based on `meta-llama/Llama-3.1-8B`. + +This configuration file also allows you to enable CUDA graphs support and set pipeline parallelism and tensor parallelism sizes. + +* Launch server + +```bash +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --model_repo=llmapi_repo/ +``` + +* Send request + +```bash +curl -X POST localhost:8000/v2/models/tensorrt_llm/generate -d '{"text_input": "The future of AI is", "sampling_param_max_tokens":10}' | jq +``` + +* Optional: include performance metrics + +To retrieve detailed performance metrics per request such as KV cache usage, timing breakdowns, and speculative decoding statistics - add `"sampling_param_return_perf_metrics": true` to your request payload: + +```bash +curl -X POST localhost:8000/v2/models/tensorrt_llm/generate -d '{"text_input": "Please explain to me what is machine learning?", "sampling_param_max_tokens":10, "sampling_param_return_perf_metrics":true}' | jq +``` + +Sample response with performance metrics +```json +{ + "acceptance_rate": "0.0", + "arrival_time_ns": "76735247746000", + "first_scheduled_time_ns": "76735248284000", + "first_token_time_ns": "76735374300000", + "kv_cache_alloc_new_blocks": "1", + "kv_cache_alloc_total_blocks": "1", + "kv_cache_hit_rate": "0.0", + "kv_cache_missed_block": "1", + "kv_cache_reused_block": "0", + "last_token_time_ns": "76736545324000", + "model_name": "tensorrt_llm", + "model_version": "1", + "text_output": "Please explain to me what is machine learning? \n\nMachine learning is a field of computer science that involves the development of algorithms and models that can learn from data without being explicitly programmed. It is a", + "total_accepted_draft_tokens": "0", + "total_draft_tokens": "0" +} +``` + +`inflight_batcher_llm_client.py` is not supported yet. + +* Run test on dataset + +```bash +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 --test-llmapi --model-name tensorrt_llm + +[INFO] Start testing on 13 prompts. +[INFO] Functionality test succeeded. +[INFO] Warm up for benchmarking. +FLAGS.model_name: tensorrt_llm +[INFO] Start benchmarking on 13 prompts. 
+[INFO] Total Latency: 377.254 ms +``` + +* Run benchmark + +```bash + python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/benchmark_core_model.py --max-input-len 500 \ + --tensorrt-llm-model-name tensorrt_llm \ + --test-llmapi \ + dataset --dataset ./tensorrt_llm/triton_backend/tools/dataset/mini_cnn_eval.json \ + --tokenizer-dir meta-llama/Llama-3.1-8B + +dataset +Tokenizer: Tokens per word = 1.308 +[INFO] Warm up for benchmarking. +[INFO] Start benchmarking on 39 prompts. +[INFO] Total Latency: 1446.623 ms +``` + +### Start the server on a multi-node configuration + +The `srun` tool can be used to start the server in a multi-node environment: + +``` +srun -N 2 \ + --ntasks-per-node=8 \ + --mpi=pmix \ + --container-image= \ + --container-mounts=$(pwd)/tensorrt_llm/:/code \ + trtllm-llmapi-launch /opt/tritonserver/bin/tritonserver --model-repository llmapi_repo + +``` + +Note: inter-node tensor parallelism is not yet supported. diff --git a/docs/lora.md b/docs/lora.md new file mode 100644 index 00000000..28452556 --- /dev/null +++ b/docs/lora.md @@ -0,0 +1,269 @@ +# Running LoRA inference with inflight batching + +Below is an example of how to run LoRA inference with inflight batching. See the +[LoRA documentation](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/lora.md) +in the TensorRT-LLM repository for more information about running gpt-2b with +LoRA using inflight batching. + +## Launch Triton TensorRT-LLM container + +```bash +docker run --rm -it --net host --shm-size=2g \ + --ulimit memlock=-1 --ulimit stack=67108864 --gpus all \ + -v :/tensorrtllm_backend \ + -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \ + -v :/engines \ + nvcr.io/nvidia/tritonserver:-trtllm-python-py3 +``` + +## Prepare TensorRT-LLM engines with LoRA enable + +(Optional) Download the LLaMa model from HuggingFace if you haven't already. + +```bash +huggingface-cli login +huggingface-cli download meta-llama/Llama-2-7b-hf +``` + +> **NOTE** +> +> Make sure that you have access to https://huggingface.co/meta-llama/Llama-2-7b-hf. + +```bash +cd /tensorrtllm_backend/tensorrt_llm/examples/models/core/llama +BASE_LLAMA_MODEL=/path/to/llama-7b-hf + +python3 convert_checkpoint.py --model_dir ${BASE_LLAMA_MODEL} \ + --output_dir ./c-model/llama/fp16/1-gpu \ + --dtype float16 + +trtllm-build --checkpoint_dir ./c-model/llama/fp16/1-gpu \ + --output_dir /engines/llama_7b_with_lora_qkv/fp16/1-gpu \ + --gemm_plugin float16 \ + --max_batch_size 8 \ + --max_seq_len 562 \ + --gpt_attention_plugin float16 \ + --kv_cache_type paged \ + --remove_input_padding enable \ + --use_paged_context_fmha enable \ + --lora_plugin float16 \ + --lora_target_modules attn_q attn_k attn_v \ + --max_lora_rank 8 +``` + +Note that you still need to use `hf_lora_convert.py` to convert the lora weights and store in `/tmp/lora_prefetch`. But users don't need to send the `--lora-path` when you run the inference at the first time. + +## Generate LoRA tensors + +Now generate LoRA tensors that will be passed in with each request to triton. 
+
+```bash
+git-lfs clone https://huggingface.co/qychen/luotuo-lora-7b-0.1
+git-lfs clone https://huggingface.co/kunishou/Japanese-Alpaca-LoRA-7b-v0
+
+python3 ../hf_lora_convert.py -i luotuo-lora-7b-0.1 -o luotuo-lora-7b-0.1-weights --storage-type float16
+python3 ../hf_lora_convert.py -i Japanese-Alpaca-LoRA-7b-v0 -o Japanese-Alpaca-LoRA-7b-v0-weights --storage-type float16
+```
+
+## Create a Triton model repository and launch the Triton server
+
+Create a Triton model repository following the instructions
+[here](../README.md#prepare-the-model-repository), and modify the model
+configuration following the steps
+[here](../README.md#modify-the-model-configuration).
+
+## LoRA Cache
+
+As LoRA weights are passed to the backend, they are cached in a host cache.
+As requests are scheduled, those weights will be prefetched to a GPU cache.
+After a LoRA is loaded into the cache, only `lora_task_id` is needed for inference.
+
+### lora_cache_optimal_adapter_size
+
+Optimal adapter size used to size cache pages. Typically, optimally sized
+adapters will fit exactly into 1 cache page. (default: 8)
+
+```
+parameters: {
+  key: "lora_cache_optimal_adapter_size"
+  value: {
+    string_value: "${lora_cache_optimal_adapter_size}"
+  }
+}
+```
+
+### lora_cache_max_adapter_size
+
+Used to set the minimum size of a cache page. Pages must be at least large enough to fit a single module, single layer, adapter_size `maxAdapterSize` row of weights. (default: 64)
+
+```
+parameters: {
+  key: "lora_cache_max_adapter_size"
+  value: {
+    string_value: "${lora_cache_max_adapter_size}"
+  }
+}
+```
+
+### lora_cache_gpu_memory_fraction
+
+Fraction of GPU memory used for the LoRA cache, computed as a fraction of the memory left over after the engine and the KV cache are loaded. (default: 0.05)
+
+```
+parameters: {
+  key: "lora_cache_gpu_memory_fraction"
+  value: {
+    string_value: "${lora_cache_gpu_memory_fraction}"
+  }
+}
+```
+
+### lora_cache_host_memory_bytes
+
+Size of the host LoRA cache in bytes. (default: 1G)
+
+```
+parameters: {
+  key: "lora_cache_host_memory_bytes"
+  value: {
+    string_value: "${lora_cache_host_memory_bytes}"
+  }
+}
+```
+
+### Prefetch the LoRA cache while initializing the model instance
+
+If you want to load LoRA models while the model instance is being initialized,
+instead of passing the LoRA weights as request inputs, you can store the weights in a folder
+and pass its path to the model instance through the `lora_prefetch_dir` parameter shown below.
+The model instance will then try to load the LoRA weights from that folder,
+which should contain one subfolder per LoRA task.
+For example, assume we want to store LoRA weights in `/tmp/lora_prefetch` and
+there are three LoRA tasks `0`, `1` and `3`; the folder layout would look like
+
+```bash
+/tmp/lora_prefetch
+├── 0
+│   ├── model.lora_config.npy
+│   └── model.lora_weights.npy
+├── 1
+│   ├── model.lora_config.npy
+│   └── model.lora_weights.npy
+└── 3
+    ├── model.lora_config.npy
+    └── model.lora_weights.npy
+```
+
+Note that you must name the subfolders with digits because the LoRA cache manager treats these names as LoRA task IDs.
+
+```pbtxt
+parameters: {
+  key: "lora_prefetch_dir"
+  value: {
+    string_value: "${lora_prefetch_dir}"
+  }
+}
+```
+
+## Launch tritonserver
+
+```bash
+MODEL_FOLDER=/path/to/triton_model_repo
+# 'world_size' is the number of GPUs you want to use for serving. This should
+# be aligned with the number of GPUs used to build the TensorRT-LLM engine.
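+# The example engine above was built for a single GPU (a 1-gpu output dir with no
+# tensor parallelism), which is why --world_size=1 is used below.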
+python3 /tensorrtllm_backend/tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size=1 --model_repo=${MODEL_FOLDER} +``` + +Run Multi-LoRA example by issuing multiple concurrent requests. +The inflight batcher will execute mixed batches with multiple LoRAs in the same batch. + +First we cache the LoRAs by sending dummy requests for each adapter. The TASK_IDS are uniq to the adapter + +```bash +pip3 install tritonclient[all] + +TASK_IDS=("1" "2") +LORA_PATHS=("luotuo-lora-7b-0.1-weights" "Japanese-Alpaca-LoRA-7b-v0-weights") +INFLIGHT_BATCHER_LLM_CLIENT=/tensorrtllm_backend/tensorrt_llm/triton_backend/tools/inflight_batcher_llm/inflight_batcher_llm_client.py + +for index in ${!TASK_IDS[@]}; do + text="dummy" + lora_path=${LORA_PATHS[$index]} + task_id=${TASK_IDS[$index]} + lora_arg="--lora-path ${lora_path} --lora-task-id ${task_id}" + + python3 ${INFLIGHT_BATCHER_LLM_CLIENT} \ + --top-k 0 \ + --top-p 0.5 \ + --request-output-len 10 \ + --text "${text}" \ + --tokenizer-dir /path/to/llama-7b-hf \ + ${lora_arg} & +done +``` + +Now perform inference with just `--lora-task-id` + +```bash +INPUT_TEXT=("美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:") +TASK_IDS=("" "1" "2" "" "1" "2") + +for index in ${!INPUT_TEXT[@]}; do + text=${INPUT_TEXT[$index]} + task_id=${TASK_IDS[$index]} + lora_arg="" + if [ "${task_id}" != "" ]; then + lora_arg="--lora-task-id ${task_id}" + fi + + python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py \ + --top-k 0 \ + --top-p 0.5 \ + --request-output-len 10 \ + --text "${text}" \ + --tokenizer-dir /home/scratch.trt_llm_data/llm-models/llama-models/llama-7b-hf \ + ${lora_arg} & +done + +wait +``` + +Example Output: + +``` +Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] +Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] +Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] +Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] +Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] +Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] +Got completed request +Input: アメリカ合衆国の首都はどこですか? \n答え: +Output beam 0: ワシントン D.C. +Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 29871, 31028, 30373, 30203, 30279, 30203, 360, 29889, 29907, 29889] +Got completed request +Input: 美国的首都在哪里? \n答案: +Output beam 0: Washington, D.C. +What is the +Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 7660, 29892, 360, 29889, 29907, 29889, 13, 5618, 338, 278] +Got completed request +Input: 美国的首都在哪里? \n答案: +Output beam 0: Washington D.C. 
+Washington D. +Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 7660, 360, 29889, 29907, 29889, 13, 29956, 7321, 360, 29889] +Got completed request +Input: アメリカ合衆国の首都はどこですか? \n答え: +Output beam 0: Washington, D.C. +Which of +Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 7660, 29892, 360, 29889, 29907, 29889, 13, 8809, 436, 310] +Got completed request +Input: アメリカ合衆国の首都はどこですか? \n答え: +Output beam 0: Washington D.C. +1. ア +Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 7660, 360, 29889, 29907, 29889, 13, 29896, 29889, 29871, 30310] +Got completed request +Input: 美国的首都在哪里? \n答案: +Output beam 0: 华盛顿 +W +Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 29871, 31266, 234, 158, 158, 236, 164, 194, 13, 29956] +``` diff --git a/docs/model_config.md b/docs/model_config.md new file mode 100644 index 00000000..b5e05d0c --- /dev/null +++ b/docs/model_config.md @@ -0,0 +1,376 @@ +# Model Configuration + +## Model Parameters + +The following tables show the parameters in the `config.pbtxt` of the models in +[all_models/inflight_batcher_llm](../tensorrt_llm/triton_backend/all_models/inflight_batcher_llm). +that can be modified before deployment. For optimal performance or custom +parameters, please refer to +[perf_best_practices](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-best-practices.md). + +The names of the parameters listed below are the values in the `config.pbtxt` +that can be modified using the +[`fill_template.py`](../tensorrt_llm/triton_backend/tools/fill_template.py) script. + +**NOTE** For fields that have comma as the value (e.g. `gpu_device_ids`, +`participant_ids`), you need to escape the comma with +a backslash. For example, if you want to set `gpu_device_ids` to `0,1` you need +to run `python3 fill_template.py -i config.pbtxt "gpu_device_ids:0\,1".` + +The mandatory parameters must be set for the model to run. The optional +parameters are not required but can be set to customize the model. + +### ensemble model + +See +[here](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#ensemble-models) +to learn more about ensemble models. + +*Mandatory parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `triton_max_batch_size` | The maximum batch size that the Triton model instance will run with. Note that for the `tensorrt_llm` model, the actual runtime batch size can be larger than `triton_max_batch_size`. The runtime batch size will be determined by the TRT-LLM scheduler based on a number of parameters such as number of available requests in the queue, and the engine build `trtllm-build` parameters (such `max_num_tokens` and `max_batch_size`). | +| `logits_datatype` | The data type for context and generation logits. | + +### preprocessing model + +*Mandatory parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `triton_max_batch_size` | The maximum batch size that Triton should use with the model. 
| +| `tokenizer_dir` | The path to the tokenizer for the model. | +| `preprocessing_instance_count` | The number of instances of the model to run. | +| `max_queue_delay_microseconds` | The maximum queue delay in microseconds. Setting this parameter to a value greater than 0 can improve the chances that two requests arriving within `max_queue_delay_microseconds` will be scheduled in the same TRT-LLM iteration. | +| `max_queue_size` | The maximum number of requests allowed in the TRT-LLM queue before rejecting new requests. | + +*Optional parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `add_special_tokens` | The `add_special_tokens` flag used by [HF tokenizers](https://huggingface.co/transformers/v2.11.0/main_classes/tokenizer.html#transformers.PreTrainedTokenizer.add_special_tokens). | +| `multimodal_model_path` | The vision engine path used in multimodal workflow. | +| `engine_dir` | The path to the engine for the model. This parameter is only needed for *multimodal processing* to extract the `vocab_size` from the engine_dir's config.json for `fake_prompt_id` mappings. | + + +### multimodal_encoders model + +*Mandatory parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `triton_max_batch_size` | The maximum batch size that Triton should use with the model. | +| `max_queue_delay_microseconds` | The maximum queue delay in microseconds. Setting this parameter to a value greater than 0 can improve the chances that two requests arriving within `max_queue_delay_microseconds` will be scheduled in the same TRT-LLM iteration. | +| `max_queue_size` | The maximum number of requests allowed in the TRT-LLM queue before rejecting new requests. | +| `multimodal_model_path` | The vision engine path used in multimodal workflow. | +| `hf_model_path` | The Huggingface model path used for `llava_onevision` and `mllama` models. | + + +### postprocessing model + +*Mandatory parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `triton_max_batch_size` | The maximum batch size that Triton should use with the model. | +| `tokenizer_dir` | The path to the tokenizer for the model. | +| `postprocessing_instance_count` | The number of instances of the model to run. | + +*Optional parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `skip_special_tokens` | The `skip_special_tokens` flag used by [HF detokenizers](https://huggingface.co/transformers/v2.11.0/main_classes/tokenizer.html#transformers.PreTrainedTokenizer.decode). | + +### tensorrt_llm model + +The majority of the `tensorrt_llm` model parameters and input/output tensors +can be mapped to parameters in the TRT-LLM C++ runtime API defined in +[`executor.h`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/include/tensorrt_llm/executor/executor.h). +Please refer to the Doxygen comments in `executor.h` for a more detailed +description of the parameters below. + +*Mandatory parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `triton_backend` | The backend to use for the model. Set to `tensorrtllm` to utilize the C++ TRT-LLM backend implementation. Set to `python` to utlize the TRT-LLM Python runtime. | +| `triton_max_batch_size` | The maximum batch size that the Triton model instance will run with. 
Note that for the `tensorrt_llm` model, the actual runtime batch size can be larger than `triton_max_batch_size`. The runtime batch size will be determined by the TRT-LLM scheduler based on a number of parameters such as number of available requests in the queue, and the engine build `trtllm-build` parameters (such `max_num_tokens` and `max_batch_size`). | +| `decoupled_mode` | Whether to use decoupled mode. Must be set to `true` for requests setting the `stream` tensor to `true`. | +| `max_queue_delay_microseconds` | The maximum queue delay in microseconds. Setting this parameter to a value greater than 0 can improve the chances that two requests arriving within `max_queue_delay_microseconds` will be scheduled in the same TRT-LLM iteration. | +| `max_queue_size` | The maximum number of requests allowed in the TRT-LLM queue before rejecting new requests. | +| `engine_dir` | The path to the engine for the model. | +| `batching_strategy` | The batching strategy to use. Set to `inflight_fused_batching` when enabling in-flight batching support. To disable in-flight batching, set to `V1` | +| `encoder_input_features_data_type` | The dtype for the input tensor `encoder_input_features`. For the mllama model, this must be `TYPE_BF16`. For other models like whisper, this is `TYPE_FP16`. | +| `logits_datatype` | The data type for context and generation logits. | + +*Optional parameters* + +- General + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `encoder_engine_dir` | When running encoder-decoder models, this is the path to the folder that contains the model configuration and engine for the encoder model. | +| `max_attention_window_size` | When using techniques like sliding window attention, the maximum number of tokens that are attended to generate one token. Defaults attends to all tokens in sequence. (default=max_sequence_length) | +| `sink_token_length` | Number of sink tokens to always keep in attention window. | +| `exclude_input_in_output` | Set to `true` to only return completion tokens in a response. Set to `false` to return the prompt tokens concatenated with the generated tokens. (default=`false`) | +| `cancellation_check_period_ms` | The time for cancellation check thread to sleep before doing the next check. It checks if any of the current active requests are cancelled through triton and prevent further execution of them. (default=100) | +| `stats_check_period_ms` | The time for the statistics reporting thread to sleep before doing the next check. (default=100) | +| `recv_poll_period_ms` | The time for the receiving thread in orchestrator mode to sleep before doing the next check. (default=0) | +| `iter_stats_max_iterations` | The maximum number of iterations for which to keep statistics. (default=ExecutorConfig::kDefaultIterStatsMaxIterations) | +| `request_stats_max_iterations` | The maximum number of iterations for which to keep per-request statistics. (default=executor::kDefaultRequestStatsMaxIterations) | +| `normalize_log_probs` | Controls if log probabilities should be normalized or not. Set to `false` to skip normalization of `output_log_probs`. (default=`true`) | +| `gpu_device_ids` | Comma-separated list of GPU IDs to use for this model. Use semicolons to separate multiple instances of the model. If not provided, the model will use all visible GPUs. (default=unspecified) | +| `participant_ids` | Comma-separated list of MPI ranks to use for this model. 
Mandatory when using orchestrator mode with -disable-spawn-process (default=unspecified) | +| `num_nodes` | Number of MPI nodes to use for this model. (default=1) | +| `gpu_weights_percent` | Set to a number between 0.0 and 1.0 to specify the percentage of weights that reside on GPU instead of CPU and streaming load during runtime. Values less than 1.0 are only supported for an engine built with `weight_streaming` on. (default=1.0) | + +- KV cache + +Note that the parameter `enable_trt_overlap` has been removed from the +config.pbtxt. This option allowed to overlap execution of two micro-batches to +hide CPU overhead. Optimization work has been done to reduce the CPU overhead +and it was found that the overlapping of micro-batches did not provide +additional benefits. + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `max_tokens_in_paged_kv_cache` | The maximum size of the KV cache in number of tokens. If unspecified, value is interpreted as 'infinite'. KV cache allocation is the min of max_tokens_in_paged_kv_cache and value derived from kv_cache_free_gpu_mem_fraction below. (default=unspecified) | +| `kv_cache_free_gpu_mem_fraction` | Set to a number between 0 and 1 to indicate the maximum fraction of GPU memory (after loading the model) that may be used for KV cache. (default=0.9) | +| `cross_kv_cache_fraction` | Set to a number between 0 and 1 to indicate the maximum fraction of KV cache that may be used for cross attention, and the rest will be used for self attention. Optional param and should be set for encoder-decoder models ONLY. (default=0.5) | +| `kv_cache_host_memory_bytes` | Enable offloading to host memory for the given byte size. | +| `enable_kv_cache_reuse` | Set to `true` to reuse previously computed KV cache values (e.g. for system prompt) | + +- LoRA cache + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `lora_cache_optimal_adapter_size` | Optimal adapter size used to size cache pages. Typically optimally sized adapters will fix exactly into 1 cache page. (default=8) | +| `lora_cache_max_adapter_size` | Used to set the minimum size of a cache page. Pages must be at least large enough to fit a single module, single later adapter_size `maxAdapterSize` row of weights. (default=64) | +| `lora_cache_gpu_memory_fraction` | Fraction of GPU memory used for LoRA cache. Computed as a fraction of left over memory after engine load, and after KV cache is loaded. (default=0.05) | +| `lora_cache_host_memory_bytes` | Size of host LoRA cache in bytes. (default=1G) | +| `lora_prefetch_dir` | Folder to store the LoRA weights we hope to load during engine initialization. | + +- Decoding mode + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `max_beam_width` | The beam width value of requests that will be sent to the executor. (default=1) | +| `decoding_mode` | Set to one of the following: `{top_k, top_p, top_k_top_p, beam_search, medusa, redrafter, lookahead, eagle}` to select the decoding mode. The `top_k` mode exclusively uses Top-K algorithm for sampling, The `top_p` mode uses exclusively Top-P algorithm for sampling. The top_k_top_p mode employs both Top-K and Top-P algorithms, depending on the runtime sampling params of the request. Note that the `top_k_top_p option` requires more memory and has a longer runtime than using `top_k` or `top_p` individually; therefore, it should be used only when necessary. `beam_search` uses beam search algorithm. 
If not specified, the default is to use `top_k_top_p` if `max_beam_width == 1`; otherwise, `beam_search` is used. When Medusa model is used, `medusa` decoding mode should be set. However, TensorRT-LLM detects loaded Medusa model and overwrites decoding mode to `medusa` with warning. Same applies to the ReDrafter, Lookahead and Eagle. | + +- Optimization + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `enable_chunked_context` | Set to `true` to enable context chunking. (default=`false`) | +| `multi_block_mode` | Set to `false` to disable multi block mode. (default=`true`) | +| `enable_context_fmha_fp32_acc` | Set to `true` to enable FMHA runner FP32 accumulation. (default=`false`) | +| `cuda_graph_mode` | Set to `true` to enable cuda graph. (default=`false`) | +| `cuda_graph_cache_size` | Sets the size of the CUDA graph cache, in numbers of CUDA graphs. (default=0) | + +- Scheduling + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `batch_scheduler_policy` | Set to `max_utilization` to greedily pack as many requests as possible in each current in-flight batching iteration. This maximizes the throughput but may result in overheads due to request pause/resume if KV cache limits are reached during execution. Set to `guaranteed_no_evict` to guarantee that a started request is never paused. (default=`guaranteed_no_evict`) | + +- Medusa + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `medusa_choices` | To specify Medusa choices tree in the format of e.g. "{0, 0, 0}, {0, 1}". By default, `mc_sim_7b_63` choices are used. | + +- Eagle + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `eagle_choices` | To specify default per-server Eagle choices tree in the format of e.g. "{0, 0, 0}, {0, 1}". By default, `mc_sim_7b_63` choices are used. | + +- Guided decoding + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `guided_decoding_backend` | Set to `xgrammar` to activate guided decoder. | +| `tokenizer_dir` | The guided decoding of tensorrt_llm python backend requires tokenizer's information. | +| `xgrammar_tokenizer_info_path` | The guided decoding of tensorrt_llm C++ backend requires xgrammar's tokenizer's info in 'json' format. | + +### tensorrt_llm_bls model + +See +[here](https://github.com/triton-inference-server/python_backend#business-logic-scripting) +to learn more about BLS models. + +*Mandatory parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `triton_max_batch_size` | The maximum batch size that the model can handle. | +| `decoupled_mode` | Whether to use decoupled mode. | +| `bls_instance_count` | The number of instances of the model to run. When using the BLS model instead of the ensemble, you should set the number of model instances to the maximum batch size supported by the TRT engine to allow concurrent request execution. | +| `logits_datatype` | The data type for context and generation logits. | + +*Optional parameters* + +- General + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `accumulate_tokens` | Used in the streaming mode to call the postprocessing model with all accumulated tokens, instead of only one token. This might be necessary for certain tokenizers. | + +- Speculative decoding + +The BLS model supports speculative decoding. 
Target and draft triton models are set with the parameters `tensorrt_llm_model_name` `tensorrt_llm_draft_model_name`. Speculative decodingis performed by setting `num_draft_tokens` in the request. `use_draft_logits` may be set to use logits comparison speculative decoding. Note that `return_generation_logits` and `return_context_logits` are not supported when using speculative decoding. Also note that requests with batch size greater than 1 is not supported with speculative decoding right now. + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `tensorrt_llm_model_name` | The name of the TensorRT-LLM model to use. | +| `tensorrt_llm_draft_model_name` | The name of the TensorRT-LLM draft model to use. | + +### Model Input and Output + +Below is the lists of input and output tensors for the `tensorrt_llm` and +`tensorrt_llm_bls` models. + +#### Common Inputs + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `end_id` | [1] | `int32` | End token ID. If not specified, defaults to -1 | +| `pad_id` | [1] | `int32` | Padding token ID | +| `temperature` | [1] | `float32` | Sampling Config param: `temperature` | +| `repetition_penalty` | [1] | `float` | Sampling Config param: `repetitionPenalty` | +| `min_tokens` | [1] | `int32_t` | Sampling Config param: `minTokens` | +| `presence_penalty` | [1] | `float` | Sampling Config param: `presencePenalty` | +| `frequency_penalty` | [1] | `float` | Sampling Config param: `frequencyPenalty` | +| `seed` | [1] | `uint64_t` | Sampling Config param: `seed` | +| `return_log_probs` | [1] | `bool` | When `true`, include log probs in the output. Note: This requires at least one sampling parameter to be set (e.g., `runtime_top_k`, `runtime_top_p` for `tensorrt_llm` model, or `top_k`, `top_p` for `tensorrt_llm_bls` model). | +| `return_context_logits` | [1] | `bool` | When `true`, include context logits in the output | +| `return_generation_logits` | [1] | `bool` | When `true`, include generation logits in the output | +| `num_return_sequences` | [1] | `int32_t` | Number of generated sequences per request. (Default=1) | +| `beam_width` | [1] | `int32_t` | Beam width for this request; set to 1 for greedy sampling (Default=1) | +| `prompt_embedding_table` | [1] | `float16` (model data type) | P-tuning prompt embedding table | +| `prompt_vocab_size` | [1] | `int32` | P-tuning prompt vocab size | +| `return_perf_metrics` | [1] | `bool` | When `true`, include perf metrics in the output, such as kv cache reuse stats | +| `guided_decoding_guide_type` | [1] | `string` | Guided decoding param: `guide_type` | +| `guided_decoding_guide` | [1] | `string` | Guided decoding param: `guide` | + +The following inputs for lora are for both `tensorrt_llm` and `tensorrt_llm_bls` +models. The inputs are passed through the `tensorrt_llm` model and the +`tensorrt_llm_bls` model will refer to the inputs from the `tensorrt_llm` model. + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `lora_task_id` | [1] | `uint64` | The unique task ID for the given LoRA. To perform inference with a specific LoRA for the first time, `lora_task_id`, `lora_weights`, and `lora_config` must all be given. The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`. If the cache is full, the oldest LoRA will be evicted to make space for new ones. 
An error is returned if `lora_task_id` is not cached | +| `lora_weights` | [ num_lora_modules_layers, D x Hi + Ho x D ] | `float` (model data type) | Weights for a LoRA adapter. See the config file for more details. | +| `lora_config` | [ num_lora_modules_layers, 3] | `int32t` | Module identifier. See the config file for more details. | + +#### Common Outputs + +Note: the timing metrics oputputs are represented as the number of nanoseconds since epoch. + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `cum_log_probs` | [-1] | `float` | Cumulative probabilities for each output | +| `output_log_probs` | [beam_width, -1] | `float` | Per-token log probabilities for each output. Only returned when `return_log_probs` is `true` and sampling parameters are set. | +| `context_logits` | [-1, vocab_size] | `float` | Context logits for input | +| `generation_logits` | [beam_width, seq_len, vocab_size] | `float` | Generation logits for each output | +| `batch_index` | [1] | `int32` | Batch index | +| `kv_cache_alloc_new_blocks` | [1] | `int32` | KV cache reuse metrics. Number of newly allocated blocks per request. Set the optional input `return_perf_metrics` to `true` to include `kv_cache_alloc_new_blocks` in the outputs. | +| `kv_cache_reused_blocks` | [1] | `int32` | KV cache reuse metrics. Number of reused blocks per request. Set the optional input `return_perf_metrics` to `true` to include `kv_cache_reused_blocks` in the outputs. | +| `kv_cache_alloc_total_blocks` | [1] | `int32` | KV cache reuse metrics. Number of total allocated blocks per request. Set the optional input `return_perf_metrics` to `true` to include `kv_cache_alloc_total_blocks` in the outputs. | +| `arrival_time_ns` | [1] | `float` | Time when the request was received by TRT-LLM. Set the optional input `return_perf_metrics` to `true` to include `arrival_time_ns` in the outputs. | +| `first_scheduled_time_ns` | [1] | `float` | Time when the request was first scheduled. Set the optional input `return_perf_metrics` to `true` to include `first_scheduled_time_ns` in the outputs. | +| `first_token_time_ns` | [1] | `float` | Time when the first token was generated. Set the optional input `return_perf_metrics` to `true` to include `first_token_time_ns` in the outputs. | +| `last_token_time_ns` | [1] | `float` | Time when the last token was generated. Set the optional input `return_perf_metrics` to `true` to include `last_token_time_ns` in the outputs. | +| `acceptance_rate` | [1] | `float` | Acceptance rate of the speculative decoding model. Set the optional input `return_perf_metrics` to `true` to include `acceptance_rate` in the outputs. | +| `total_accepted_draft_tokens` | [1] | `int32` | Number of tokens accepted by the target model in speculative decoding. Set the optional input `return_perf_metrics` to `true` to include `total_accepted_draft_tokens` in the outputs. | +| `total_draft_tokens` | [1] | `int32` | Maximum number of draft tokens acceptable by the target model in speculative decoding. Set the optional input `return_perf_metrics` to `true` to include `total_draft_tokens` in the outputs. 
| + +#### Unique Inputs for tensorrt_llm model + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `input_ids` | [-1] | `int32` | Input token IDs | +| `input_lengths` | [1] | `int32` | Input lengths | +| `request_output_len` | [1] | `int32` | Requested output length | +| `draft_input_ids` | [-1] | `int32` | Draft input IDs | +| `decoder_input_ids` | [-1] | `int32` | Decoder input IDs | +| `decoder_input_lengths` | [1] | `int32` | Decoder input lengths | +| `draft_logits` | [-1, -1] | `float32` | Draft logits | +| `draft_acceptance_threshold` | [1] | `float32` | Draft acceptance threshold | +| `stop_words_list` | [2, -1] | `int32` | List of stop words | +| `bad_words_list` | [2, -1] | `int32` | List of bad words | +| `embedding_bias` | [-1] | `string` | Embedding bias words | +| `runtime_top_k` | [1] | `int32` | Top-k value for runtime top-k sampling | +| `runtime_top_p` | [1] | `float32` | Top-p value for runtime top-p sampling | +| `runtime_top_p_min` | [1] | `float32` | Minimum value for runtime top-p sampling | +| `runtime_top_p_decay` | [1] | `float32` | Decay value for runtime top-p sampling | +| `runtime_top_p_reset_ids` | [1] | `int32` | Reset IDs for runtime top-p sampling | +| `len_penalty` | [1] | `float32` | Controls how to penalize longer sequences in beam search (Default=0.f) | +| `early_stopping` | [1] | `bool` | Enable early stopping | +| `beam_search_diversity_rate` | [1] | `float32` | Beam search diversity rate | +| `stop` | [1] | `bool` | Stop flag | +| `streaming` | [1] | `bool` | Enable streaming | + +#### Unique Outputs for tensorrt_llm model + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `output_ids` | [-1, -1] | `int32` | Output token IDs | +| `sequence_length` | [-1] | `int32` | Sequence length | + +#### Unique Inputs for tensorrt_llm_bls model + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `text_input` | [-1] | `string` | Prompt text | +| `decoder_text_input` | [1] | `string` | Decoder input text | +| `image_input` | [3, 224, 224] | `float16` | Input image | +| `max_tokens` | [-1] | `int32` | Number of tokens to generate | +| `bad_words` | [2, num_bad_words] | `int32` | Bad words list | +| `stop_words` | [2, num_stop_words] | `int32` | Stop words list | +| `top_k` | [1] | `int32` | Sampling Config param: `topK` | +| `top_p` | [1] | `float32` | Sampling Config param: `topP` | +| `length_penalty` | [1] | `float32` | Sampling Config param: `lengthPenalty` | +| `stream` | [1] | `bool` | When `true`, stream out tokens as they are generated. When `false` return only when the full generation has completed (Default=`false`) | +|`embedding_bias_words` | [-1] | `string` | Embedding bias words | +| `embedding_bias_weights` | [-1] | `float32` | Embedding bias weights | +| `num_draft_tokens` | [1] | `int32` | Number of tokens to get from draft model during speculative decoding | +| `use_draft_logits` | [1] | `bool` | Use logit comparison during speculative decoding | + +#### Unique Outputs for tensorrt_llm_bls model + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `text_output` | [-1] | `string` | Text output | + +## Some tips for model configuration + +Below are some tips for configuring models for optimal performance. These +recommendations are based on our experiments and may not apply to all use cases. 
+For guidance on other parameters, please refer to the +[perf_best_practices](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-best-practices.md). + +- **Setting the `instance_count` for models to better utilize inflight batching** + + The `instance_count` parameter in the config.pbtxt file specifies the number + of instances of the model to run. Ideally, this should be set to match the + maximum batch size supported by the TRT engine, as this allows for concurrent + request execution and reduces performance bottlenecks. However, it will also + consume more CPU memory resources. While the optimal value isn't something we + can determine in advance, it generally shouldn't be set to a very small + value, such as 1. + For most use cases, we have found that setting `instance_count` to 5 works + well across a variety of workloads in our experiments. + +- **Adjusting `max_batch_size` and `max_num_tokens` to optimize inflight batching** + + `max_batch_size` and `max_num_tokens` are important parameters for optimizing + inflight batching. You can modify `max_batch_size` in the model configuration + file, while `max_num_tokens` is set during the conversion to a TRT-LLM engine + using the `trtllm-build` command. Tuning these parameters is necessary for + different scenarios, and experimentation is currently the best approach to + finding optimal values. Generally, the total number of requests should be + lower than `max_batch_size`, and the total tokens should be less than + `max_num_tokens`. diff --git a/docs/multimodal.md b/docs/multimodal.md new file mode 100755 index 00000000..a088ecf0 --- /dev/null +++ b/docs/multimodal.md @@ -0,0 +1,422 @@ +# End to end workflow to run a Multimodal model + +### Support Matrix +The following multimodal model is supported in tensorrtllm_backend: +* BLIP2-OPT +* LLAVA +* VILA +* LLaVA OneVision +* MLLAMA +* Qwen2-VL + +For more multimodal models supported in TensorRT-LLM, please visit [TensorRT-LLM multimodal examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal). + +## Run Multimodal with single-GPU Tritonserver +### Tritonserver setup steps +0. Make sure that you have initialized the TRT-LLM submodule: + + ```bash + git clone https://github.com/triton-inference-server/tensorrtllm_backend.git && cd tensorrtllm_backend + git lfs install + git submodule update --init --recursive + ``` + +1. Start the Triton Server Docker container: + + 1-1. If you're using Tritonserver from nvcr.io + ```bash + # Replace with the version of Triton you want to use. + # The command below assumes the the current directory is the + # TRT-LLM backend root git repository. + + docker run --rm -ti --net=host -v `pwd`:/mnt -w /mnt --gpus all nvcr.io/nvidia/tritonserver:\-trtllm-python-py3 bash + ``` + 1-2. If you are using `tensorrtllm_backend` container: + ```bash + docker run --rm -ti --net=host -v `pwd`:/mnt -w /mnt --gpus all triton_trt_llm + ``` + +2. Build the engine: + + 2-1. 
Clone the target model repository + ```bash + # For BLIP-OPT2 + export MODEL_NAME="blip2-opt-2.7b" + git clone https://huggingface.co/Salesforce/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + # For LLAVA + export MODEL_NAME="llava-1.5-7b-hf" + git clone https://huggingface.co/llava-hf/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + # For VILA + pip install -r all_models/multimodal/requirements-vila.txt + + export MODEL_NAME="vila1.5-3b" + git clone https://huggingface.co/Efficient-Large-Model/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + export VILA_PATH="tmp/hf_models/VILA" + git clone https://github.com/Efficient-Large-Model/VILA.git ${VILA_PATH} + + # For LLaVA OneVision + pip install -r all_models/multimodal/requirements-llava-onevision.txt + + export MODEL_NAME="llava-onevision-qwen2-7b-ov-hf" + git clone https://huggingface.co/llava-hf/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + # For MLLAMA + pip install -r all_models/multimodal/requirements-mllama.txt + + export MODEL_NAME="Llama-3.2-11B-Vision" + git clone https://huggingface.co/meta-llama/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + # For Qwen2-VL + pip install -r all_models/multimodal/requirements-qwen2vl.txt + + export MODEL_NAME="Qwen2-VL-7B-Instruct" + git clone https://huggingface.co/Qwen/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + export + ``` + 2-2. Build TensorRT-LLM engines + ```bash + export HF_MODEL_PATH=tmp/hf_models/${MODEL_NAME} + export UNIFIED_CKPT_PATH=tmp/trt_models/${MODEL_NAME}/fp16/1-gpu + export ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu + export MULTIMODAL_ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/multimodal_encoder + + # For BLIP-OPT2 + python tensorrt_llm/examples/models/contrib/opt/convert_checkpoint.py --model_type blip2 \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + + trtllm-build \ + --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin float16 \ + --max_beam_width 1 \ + --max_batch_size 8 \ + --max_seq_len 1024 \ + --max_input_len 924 \ + --max_multimodal_len 256 # 8 (max_batch_size) * 32 (num_multimodal_features) for BLIP2 + + python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_type blip2 --model_path ${HF_MODEL_PATH} --max_batch_size 8 + + # For LLAVA + python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + + trtllm-build \ + --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin float16 \ + --max_batch_size 8 \ + --max_input_len 2048 \ + --max_seq_len 2560 \ + --max_multimodal_len 4608 # 8 (max_batch_size) * 576 (num_multimodal_features) for LLaVA + + python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type llava --max_batch_size 8 + + # For VILA + python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + + trtllm-build \ + --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin float16 \ + --max_batch_size 8 \ + --max_input_len 2048 \ + --max_seq_len 2560 \ + --max_multimodal_len 6272 # 8 (max_batch_size) * 196 (num_multimodal_features) * 4 (max_num_images_per_request) + + python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type vila --vila_path ${VILA_PATH} 
--max_batch_size 32 #max_batch_size * max_num_images_per_request since vila support multiple images inference + + # For LLaVA OneVision + python tensorrt_llm/examples/models/contrib/qwen/convert_checkpoint.py \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + + trtllm-build \ + --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin float16 \ + --max_batch_size 1 \ + --max_input_len 7500 \ + --max_seq_len 7600 \ + --max_multimodal_len 7300 # max_batch_size * num_multimodal_features(depends on the image size or the specified video num frame) + + python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type llava_onevision --max_batch_size 16 # max_batch_size * patch for image or frame for video + + # For MLLAMA + python tensorrt_llm/examples/models/core/mllama/convert_checkpoint.py \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype bfloat16 + + trtllm-build \ + --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin auto \ + --max_batch_size 8 \ + --max_seq_len 2048 \ + --max_num_tokens 4096 \ + --max_encoder_input_len 6404 + + python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type mllama --output_dir ${MULTIMODAL_ENGINE_PATH} --max_batch_size 8 #max_batch_size * max_num_images_per_request + + # For Qwen2-VL + python3 ../qwen/convert_checkpoint.py \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + + trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin=float16 \ + --gpt_attention_plugin=float16 \ + --max_batch_size 4 \ + --max_input_len 2048 \ + --max_seq_len 3072 \ + --max_multimodal_len 1296 #(max_batch_size) * 324 (num_multimodal_features), this's for image_shape=[504,504] + + python build_multimodal_engine.py --model_type qwen2_vl --model_path tmp/hf_models/${MODEL_NAME} --output_dir ${MULTIMODAL_ENGINE_PATH} + ``` + + > **NOTE**: + > + > `max_multimodal_len = max_batch_size * num_multimodal_features`, so if you change `max_batch_size`, `max_multimodal_len` **MUST** be changed accordingly. + > For multi-image inference, where a single request could contain multiple images, `max_multimodal_len = max_batch_size * num_multimodal_features * max_num_images_per_request` + > + > The built visual engines are located in `tmp/trt_engines/${MODEL_NAME}/multimodal_encoder`. + +3. 
Prepare Tritonserver configs + + ```bash + cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ multimodal_ifb -r + # Override the ensemble and creates new multimodal_encoders directories for multimodal + cp tensorrt_llm/triton_backend/all_models/multimodal/ensemble multimodal_ifb -r + cp tensorrt_llm/triton_backend/all_models/multimodal/multimodal_encoders multimodal_ifb -r + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:8,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,enable_chunked_context:False,encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},logits_datatype:TYPE_FP32,cross_kv_cache_fraction:0.5 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:8,preprocessing_instance_count:1,multimodal_model_path:${MULTIMODAL_ENGINE_PATH},engine_dir:${ENGINE_PATH},max_num_images:1,max_queue_delay_microseconds:20000 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:8,postprocessing_instance_count:1 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/ensemble/config.pbtxt triton_max_batch_size:8,logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:8,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,tensorrt_llm_model_name:tensorrt_llm,multimodal_encoders_name:multimodal_encoders,logits_datatype:TYPE_FP32 + + # Newly added for multimodal + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/multimodal_encoders/config.pbtxt triton_max_batch_size:8,multimodal_model_path:${MULTIMODAL_ENGINE_PATH},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},hf_model_path:${HF_MODEL_PATH},max_queue_delay_microseconds:20000 + ``` + > **NOTE**: + > + > You can set the `decoupled_mode` option to True to use streaming mode. + > + > You can set the `accumulate_tokens` option to True in streaming mode to call the postprocessing model with all accumulated tokens. + > + > You can set the `enable_kv_cache_reuse` option to True to enable kv cache reuse. Requests with the same image/prompt table/input tokens will reuse the KV cache, which will help reduce latency. The specific performance improvement depends on the length of reuse. + > + > You can set the `max_num_images` to the max number of images per request. The value should be the same as the `max_num_images_per_request` value used at build the engine step above. + > + > Set `${ENCODER_INPUT_FEATURES_DTYPE}` to `TYPE_BF16` for mllama, and `TYPE_FP16` for other models. + > `cross_kv_cache_fraction` is used to determine the paged kv cache memory pool size of enc-dec models. For such case, we distinguish `free_fraction * (1 - cross_kv_cache_fraction)` to self attention kv caches, and `free_fraction * cross_kv_cache_fraction` to cross attention kv caches. + +4. 
Launch Tritonserver + + ```bash + python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=multimodal_ifb/ --tensorrt_llm_model_name tensorrt_llm,multimodal_encoders --multimodal_gpu0_cuda_mem_pool_bytes 300000000 + ``` + + > **NOTE**: + > If there is an error associated with 'MPI_Init_thread', please do `export PMIX_MCA_gds=hash`' + > + > When launching the server, since the prompt_embedding_table is in GPU memory, we need to set the CUDA pool memory for inter-step communication. For example, when we have a shape of (1, 576, 4096) promp_embedding table, we would need 300MB of CUDA pool memory, so we set 30MB to have some GPU buffers. (2(fp16=>2bytes) * 576 * 4096 * 8(max_batch_size) = 18,874,368) + > + > Also, the tensorrt_llm initialization assumes using another GPU, we need to initialize it but not use them. + +### Send requests +1. Send request with `decoupled_mode` set to False + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 + + [beam 0 ]: + Question: which city is this? Answer: singapore + [INFO] Latency: 41.942 ms + ``` +2. Send request with `decoupled_mode` set to True + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 --streaming + + [beam 0 ]: sing + [beam 0 ]: apore + [beam 0 ]: + [INFO] Latency: 43.441 ms + ``` +3. Send request to the `tensorrt_llm_bls` model + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 --use_bls + + [beam 0 ]: + Question: which city is this? Answer: singapore + [INFO] Latency: 44.152 ms + ``` + +4. Send request to the `tensorrt_llm_bls` model with `accumulate_tokens` set to True + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 --use_bls --streaming + + [beam 0 ]: sing + [beam 0 ]: singapore + [beam 0 ]: singapore + [INFO] Latency: 45.48 ms + ``` + +5. Send request with `enable_kv_cache_reuse` set to True + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 --prompt_table_extra_id ${id} + + [beam 0 ]: + Question: which city is this? Answer: singapore + [INFO] Latency: 42.514 ms + ``` +6. Send request with multiple images per request + ```bash + wget -O av.png https://raw.githubusercontent.com/Efficient-Large-Model/VILA/main/demo_images/av.png + + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text '\n\n Please elaborate what you see in the images?' 
--image av.png,'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 68 --model_type vila --hf_model_dir ${HF_MODEL_PATH} + + [beam 0 ]: + A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \n \n Please elaborate what you see in the images? ASSISTANT: The first image shows a busy street scene with a car driving through a crosswalk, surrounded by pedestrians and traffic lights. The second image captures a beautiful sunset with the iconic Merlion statue spouting water into the bay, with the Singapore Flyer and the city skyline in the background. + + [INFO] Latency: 403.879 ms + ``` + +7. Send request with curl + The triton server supports curl requests with an image url in the payload. For example here is a request sent to a Llama-3.2-11B-Vision (mLLama) model: + ``` bash + curl -X POST localhost:8000/v2/models/ensemble/generate_stream \ + -d '{"id": "42", "text_input": "<|image|>If I had to write a haiku for this one", "image_url_input": "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png", "parameters": {"max_tokens": 16, "beam_width": 1, "end_id": 128001, "pad_id": 128004, "top_k": 1, "top_p": 0, "stream": false, "temperature": 0}}' + + # response + data: {"batch_index":0,"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"id":"42","model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_index":0,"sequence_start":false,"text_output":"If I had to write a haiku for this one, it would be:.\\nMerlion spouts water.\\nMarina"} + ``` + You can also send requests with base64 encoded images. Just replace the url above with `data:image/jpeg;base64,`. + +8. Send request with video input + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text "Why is this video funny?" --video sample_demo_1.mp4 --video_num_frames 8 --request-output-len 30 --model_type llava_onevision --end-id 151645 + + [beam 0 ]: + user + Why is this video funny?assistant + The video is funny because the child's actions are playful and exaggerated, as if they are reading the book with great enthusiasm. + [INFO] Latency: 507.537 ms + ``` + +> **NOTE**: +> Please ignore any exception thrown with the output. It's a known issue to be fixed. +> +> When `enable_kv_cache_reuse` is set to true, the `prompt_table_extra_id` must be specified in the requests. The `prompt_table_extra_id` is a unique identifier representing the image (or prompt table), the same image uses the same id. The data type is `uint64`, and the minimum value is 1. + +### Kill the server +```bash +pkill tritonserver +``` + +### Supported image input types +When programmatically preparing your own request for the server, note that `ensemble`: +- `image_input`: a float16 5D tensor of shape `[batch_size, num_images, num_channels, height, width]` or `[batch_size, num_images, height, width, num_channels]` representing a batch of images already processed (via transformers AutoProcessor) for the vision encoder. +- `image_bytes_input`: a uint8 5D tensor of shape `[batch_size, num_images, num_channels, height, width]` or `[batch_size, num_images, height, width, num_channels]` representing a batch of raw images. +- `image_url_input`: a list of strings of shape `[batch_size, num_images]` representing a batch of image urls. 
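+
+For instance, a rough sketch of an HTTP request that passes the image to
+`image_url_input` as a base64 data URI, building on the curl example in step 7 above
+(the file name, prompt, and parameter values are illustrative, and `base64 -w0` is
+GNU coreutils syntax):
+
+```bash
+# Encode a local JPEG and embed it as a data URI in the request payload.
+IMAGE_B64=$(base64 -w0 image.jpg)
+curl -X POST localhost:8000/v2/models/ensemble/generate \
+    -d '{"text_input": "Question: which city is this? Answer:", "image_url_input": "data:image/jpeg;base64,'"${IMAGE_B64}"'", "parameters": {"max_tokens": 16, "beam_width": 1, "top_k": 1, "top_p": 0, "temperature": 0}}'
+```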
+
+### Long multimodal context, FP8 KV cache and tensor parallelism
+
+Follow these steps to enable chunked context inference (using LLaVA as an example) with FP8 KV cache and 2-way tensor parallelism. Ensure you convert the checkpoint using `--tp_size 2` and build the model with `--use_paged_context_fmha enable` and `--use_fp8_context_fmha enable`. Set `enable_chunked_context` to true in the Tritonserver configuration file. The chunk size is determined by the `max_num_tokens` flag when building the engine, which defaults to 8192. When launching the server, change `--world_size` to match your tensor parallelism size.
+1. Build the engine
+```bash
+    export MODEL_NAME="llava-1.5-7b-hf"
+    export HF_MODEL_PATH=tmp/hf_models/${MODEL_NAME}
+
+    # Convert checkpoint
+    # For fp16 KV cache
+    export UNIFIED_CKPT_PATH=tmp/trt_models/${MODEL_NAME}/fp16/2-gpu
+    export ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/fp16/2-gpu
+    export MULTIMODAL_ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/multimodal_encoder
+    python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py \
+        --model_dir ${HF_MODEL_PATH} \
+        --output_dir ${UNIFIED_CKPT_PATH} \
+        --dtype float16 \
+        --tp_size 2
+
+    # For fp8 KV cache
+    export UNIFIED_CKPT_PATH=tmp/trt_models/${MODEL_NAME}/fp8/2-gpu
+    export ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/fp8/2-gpu
+    export MULTIMODAL_ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/multimodal_encoder
+    python ./tensorrt_llm/examples/quantization/quantize.py \
+        --model_dir ${HF_MODEL_PATH} \
+        --dtype float16 \
+        --qformat fp8 \
+        --kv_cache_dtype fp8 \
+        --output_dir ${UNIFIED_CKPT_PATH} \
+        --calib_size 512 \
+        --tp_size 2
+
+    # Build the llm engine
+    # --use_paged_context_fmha and --use_fp8_context_fmha are enabled by default
+    # include --max_num_tokens to set the chunk size
+    trtllm-build \
+        --checkpoint_dir ${UNIFIED_CKPT_PATH} \
+        --output_dir ${ENGINE_PATH} \
+        --gemm_plugin auto \
+        --max_batch_size 8 \
+        --max_input_len 2048 \
+        --max_seq_len 2560 \
+        --max_multimodal_len 4608 # 8 (max_batch_size) * 576 (num_multimodal_features) for LLaVA
+
+    # Build the multimodal engine
+    python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type llava --max_batch_size 8 --output_dir ${MULTIMODAL_ENGINE_PATH}
+```
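+
+Before preparing the Triton configuration, it can be useful to confirm which chunk size (`max_num_tokens`) and context-FMHA options the engine was actually built with. The snippet below is only a sketch: it assumes the engine directory contains a `config.json` with a `build_config` section, and the exact key names may differ between TensorRT-LLM versions.
+
+```python
+# Sketch: report the chunk-size-related build options of a TensorRT-LLM engine.
+# The config.json layout is an assumption and may vary across versions.
+import json
+import os
+import sys
+
+engine_dir = sys.argv[1] if len(sys.argv) > 1 else os.environ.get("ENGINE_PATH", ".")
+
+with open(os.path.join(engine_dir, "config.json")) as f:
+    config = json.load(f)
+
+build_config = config.get("build_config", {})
+plugin_config = build_config.get("plugin_config", {})
+
+# max_num_tokens is the upper bound on tokens processed per context chunk
+print("max_num_tokens:", build_config.get("max_num_tokens"))
+print("use_paged_context_fmha:", plugin_config.get("use_paged_context_fmha"))
+```
+A larger `--max_num_tokens` at build time means fewer, larger chunks during the context prefill phase.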
+2. Prepare the Tritonserver config file
+Set `enable_chunked_context` to True in the config. To make better use of the free GPU memory, we can also set `kv_cache_free_gpu_mem_fraction` to 0.9.
+```bash
+cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ multimodal_ifb -r
+# Override the ensemble and create new multimodal_encoders directories for multimodal
+cp tensorrt_llm/triton_backend/all_models/multimodal/ensemble multimodal_ifb -r
+cp tensorrt_llm/triton_backend/all_models/multimodal/multimodal_encoders multimodal_ifb -r
+
+# Change enable_chunked_context to True and set kv_cache_free_gpu_mem_fraction to 0.9
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:8,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,enable_chunked_context:True,encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},logits_datatype:TYPE_FP32,kv_cache_free_gpu_mem_fraction:0.9
+
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:8,preprocessing_instance_count:1,multimodal_model_path:${MULTIMODAL_ENGINE_PATH},engine_dir:${ENGINE_PATH},max_num_images:1,max_queue_delay_microseconds:20000
+
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:8,postprocessing_instance_count:1
+
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/ensemble/config.pbtxt triton_max_batch_size:8,logits_datatype:TYPE_FP32
+
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:8,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,tensorrt_llm_model_name:tensorrt_llm,multimodal_encoders_name:multimodal_encoders,logits_datatype:TYPE_FP32
+
+# Newly added for multimodal
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/multimodal_encoders/config.pbtxt triton_max_batch_size:8,multimodal_model_path:${MULTIMODAL_ENGINE_PATH},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},hf_model_path:${HF_MODEL_PATH},max_queue_delay_microseconds:20000
+```
+3. Launch the server
+```bash
+# Change --world_size to match your tensor parallelism size
+python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=multimodal_ifb/ --tensorrt_llm_model_name tensorrt_llm,multimodal_encoders --multimodal_gpu0_cuda_mem_pool_bytes 300000000
+```
+
+When you launch the server, you will see logs similar to the following. You can now process long multimodal contexts up to the "max tokens in paged KV cache" value, and the context prefill phase is executed in chunks.
+```bash
+[TensorRT-LLM][INFO] Memory usage when calculating max tokens in paged kv cache: total: 93.10 GiB, available: 85.57 GiB
+...
+[TensorRT-LLM][INFO] [MemUsageChange] Allocated 77.02 GiB for max tokens in paged KV cache (315488).
+```
diff --git a/docs/whisper.md b/docs/whisper.md
new file mode 100644
index 00000000..29f33af0
--- /dev/null
+++ b/docs/whisper.md
@@ -0,0 +1,142 @@
+# End to end workflow to run a Whisper model
+
+### Support Matrix
+The following speech recognition models are supported in tensorrtllm_backend:
+* Whisper
+* Distil-Whisper
+
+## Run Whisper with single-GPU Tritonserver
+### Tritonserver setup steps
+0. Make sure that you have initialized the TRT-LLM submodule:
+
+    ```bash
+    git clone https://github.com/triton-inference-server/tensorrtllm_backend.git && cd tensorrtllm_backend
+    git lfs install
+    git submodule update --init --recursive
+    ```
+
+1. Start the Triton Server Docker container:
+
+    1-1. If you're using Tritonserver from nvcr.io
+    ```bash
+    # Replace <xx.yy> with the version of Triton you want to use.
+    # The command below assumes the current directory is the
+    # TRT-LLM backend root git repository.
+
+    docker run --rm -ti --net=host -v `pwd`:/mnt -w /mnt --gpus all nvcr.io/nvidia/tritonserver:<xx.yy>-trtllm-python-py3 bash
+    ```
+    1-2. If you are using the `tensorrtllm_backend` container:
+    ```bash
+    docker run --rm -ti --net=host -v `pwd`:/mnt -w /mnt --gpus all triton_trt_llm
+    ```
+
+2. Build the engine:
+
+    2-1. Download the whisper models
+    ```bash
+    wget --directory-prefix=assets https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/multilingual.tiktoken
+    wget --directory-prefix=assets https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz
+    wget --directory-prefix=assets https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav
+    # take the large-v3 model as an example
+    wget --directory-prefix=assets https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt
+    ```
+    2-2. Build TensorRT-LLM engines
+    ```bash
+    INFERENCE_PRECISION=float16
+    MAX_BEAM_WIDTH=4
+    MAX_BATCH_SIZE=64
+    checkpoint_dir=tllm_checkpoint
+    output_dir=whisper_large_v3_max_batch_${MAX_BATCH_SIZE}
+
+    python3 convert_checkpoint.py --model_dir ${MODEL_DIR} --output_dir ${checkpoint_dir}
+
+    trtllm-build --checkpoint_dir ${checkpoint_dir}/encoder \
+                 --output_dir ${output_dir}/encoder \
+                 --moe_plugin disable \
+                 --max_batch_size ${MAX_BATCH_SIZE} \
+                 --gemm_plugin disable \
+                 --bert_attention_plugin ${INFERENCE_PRECISION} \
+                 --max_input_len 3000 --max_seq_len=3000
+
+    trtllm-build --checkpoint_dir ${checkpoint_dir}/decoder \
+                 --output_dir ${output_dir}/decoder \
+                 --moe_plugin disable \
+                 --max_beam_width ${MAX_BEAM_WIDTH} \
+                 --max_batch_size ${MAX_BATCH_SIZE} \
+                 --max_seq_len 114 \
+                 --max_input_len 14 \
+                 --max_encoder_input_len 3000 \
+                 --gemm_plugin ${INFERENCE_PRECISION} \
+                 --bert_attention_plugin ${INFERENCE_PRECISION} \
+                 --gpt_attention_plugin ${INFERENCE_PRECISION}
+    ```
+
+    > **NOTE**:
+    >
+    > TensorRT-LLM also supports [distil-whisper](https://github.com/huggingface/distil-whisper) models; first convert their parameters and weights from the Hugging Face naming format to the [openai whisper](https://github.com/openai/whisper) naming format. You can do so by running the script [distil_whisper/convert_from_distil_whisper.py](./convert_from_distil_whisper.py).
+
+3.
Prepare Tritonserver configs + + ```bash + cp tensorrt_llm/triton_backend/all_models/whisper/ model_repo_whisper -r + cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm model_repo_whisper -r + wget --directory-prefix=model_repo_whisper/whisper_bls/1 https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/multilingual.tiktoken + wget --directory-prefix=model_repo_whisper/whisper_bls/1 https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz + + BACKEND=tensorrtllm + DECOUPLED_MODE=false + DECODER_ENGINE_PATH=${output_dir}/decoder + ENCODER_ENGINE_PATH=${output_dir}/encoder + MAX_TOKENS_IN_KV_CACHE=24000 + BATCHING_STRATEGY=inflight_fused_batching + KV_CACHE_FREE_GPU_MEM_FRACTION=0.5 + EXCLUDE_INPUT_IN_OUTPUT=True + TRITON_MAX_BATCH_SIZE=8 + MAX_QUEUE_DELAY_MICROSECONDS=0 + MAX_BEAM_WIDTH=1 + MAX_QUEUE_SIZE="0" + ENABLE_KV_CACHE_REUSE=false + ENABLE_CHUNKED_CONTEXT=false + CROSS_KV_CACHE_FRACTION="0.5" + n_mels=128 + zero_pad=false + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i model_repo_whisper/tensorrt_llm/config.pbtxt triton_backend:${BACKEND},engine_dir:${DECODER_ENGINE_PATH},encoder_engine_dir:${ENCODER_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE},max_queue_size:${MAX_QUEUE_SIZE},enable_context_fmha_fp32_acc:${ENABLE_CONTEXT_FMHA_FP32_ACC},cross_kv_cache_fraction:${CROSS_KV_CACHE_FRACTION},encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i model_repo_whisper/whisper_bls/config.pbtxt engine_dir:${ENCODER_ENGINE_PATH},n_mels:$n_mels,zero_pad:$zero_pad,triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE} + ``` + > **NOTE**: + > + > TODO: You can set the `decoupled_mode` option to True to use streaming mode. + +4. Launch Tritonserver + + ```bash + python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=model_repo_whisper/ --tensorrt_llm_model_name tensorrt_llm,whisper_bls --multimodal_gpu0_cuda_mem_pool_bytes 300000000 + ``` + +### Send requests +1. Send request with a single audio file + ```bash + wget -nc https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav + # Test non-streaming + python3 tensorrt_llm/triton_backend/whisper/client.py --audio-path 1221-135766-0002.wav + ``` +2. 
Send requests with a whole audio dataset + ```bash + git clone https://github.com/yuekaizhang/Triton-ASR-Client.git + cd Triton-ASR-Client + num_task=16 + python3 tensorrt_llm/triton_backend/whisper/client.py \ + --server-addr localhost \ + --model-name whisper_bls \ + --num-tasks $num_task \ + --text-prompt "<|startoftranscript|><|zh|><|transcribe|><|notimestamps|>" \ + --manifest-dir ./datasets/aishell1_test \ + --compute-cer + ``` +### Kill the server +```bash +pkill tritonserver +``` diff --git a/inflight_batcher_llm/CMakeLists.txt b/inflight_batcher_llm/CMakeLists.txt deleted file mode 100644 index 21d0c711..00000000 --- a/inflight_batcher_llm/CMakeLists.txt +++ /dev/null @@ -1,347 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: * -# Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. * Redistributions in binary -# form must reproduce the above copyright notice, this list of conditions and -# the following disclaimer in the documentation and/or other materials provided -# with the distribution. * Neither the name of NVIDIA CORPORATION nor the names -# of its contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY EXPRESS -# OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO -# EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, -# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -cmake_minimum_required(VERSION 3.17) -include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/set_ifndef.cmake) - -set_ifndef(TRTLLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm) - -include_directories(${TRTLLM_DIR} ${TRTLLM_DIR}/cpp/include) - -include(${TRTLLM_DIR}/cpp/cmake/modules/find_library_create_target.cmake) - -project(tritontensorrtllmbackend LANGUAGES C CXX) - -add_compile_options("-DENABLE_MULTI_DEVICE=1") -# https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html -option(USE_CXX11_ABI "Using CXX11 ABI of libstdc++" OFF) -message(STATUS "USE_CXX11_ABI: ${USE_CXX11_ABI}") -if(USE_CXX11_ABI) - add_compile_options("-D_GLIBCXX_USE_CXX11_ABI=1") -else() - add_compile_options("-D_GLIBCXX_USE_CXX11_ABI=0") -endif() - -# -# Options -# -# Must include options required for this project as well as any projects -# included in this one by FetchContent. -# -# TRITON_ENABLE_GPU is set to OFF as currently the code does not use any GPU -# related features since TRT-LLM backend manages the usage on GPUs itself. 
-option(TRITON_ENABLE_GPU "Enable GPU support in backend" OFF) -option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) -option(TRITON_ENABLE_METRICS "Include metrics support in server" ON) -option(BUILD_TESTS "Build Google tests" OFF) - -if(TRITON_ENABLE_METRICS AND NOT TRITON_ENABLE_STATS) - message( - FATAL_ERROR "TRITON_ENABLE_METRICS=ON requires TRITON_ENABLE_STATS=ON") -endif() - -set(TRITON_COMMON_REPO_TAG - "main" - CACHE STRING "Tag for triton-inference-server/common repo") -set(TRITON_CORE_REPO_TAG - "main" - CACHE STRING "Tag for triton-inference-server/core repo") -set(TRITON_BACKEND_REPO_TAG - "main" - CACHE STRING "Tag for triton-inference-server/backend repo") - -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Release) -endif() - -set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDA_PATH}/include) -message(STATUS "COMMON_HEADER_DIRS: ${COMMON_HEADER_DIRS}") - -# -# Dependencies -# -# FetchContent requires us to include the transitive closure of all repos that -# we depend on so that we can override the tags. -# -include(FetchContent) - -FetchContent_Declare( - repo-common - GIT_REPOSITORY https://github.com/triton-inference-server/common.git - GIT_TAG ${TRITON_COMMON_REPO_TAG} - GIT_SHALLOW ON) -FetchContent_Declare( - repo-core - GIT_REPOSITORY https://github.com/triton-inference-server/core.git - GIT_TAG ${TRITON_CORE_REPO_TAG} - GIT_SHALLOW ON) -FetchContent_Declare( - repo-backend - GIT_REPOSITORY https://github.com/triton-inference-server/backend.git - GIT_TAG ${TRITON_BACKEND_REPO_TAG} - GIT_SHALLOW ON) -FetchContent_MakeAvailable(repo-common repo-core repo-backend) - -# -# The backend must be built into a shared library. Use an ldscript to hide all -# symbols except for the TRITONBACKEND API. -# -configure_file(src/libtriton_tensorrtllm.ldscript - libtriton_tensorrtllm.ldscript COPYONLY) - -set(COMMON_SRCS src/model_instance_state.cc src/model_state.cc src/utils.cc) - -add_library(triton-tensorrt-llm-common SHARED ${COMMON_SRCS}) - -set(BACKEND_SRCS src/libtensorrtllm.cc) - -add_library(triton-tensorrt-llm-backend SHARED ${BACKEND_SRCS}) - -enable_language(CUDA) - -find_package(CUDA ${CUDA_REQUIRED_VERSION} REQUIRED) -find_package(Python3 COMPONENTS Interpreter Development) - -find_library( - tensorrt_llm libtensorrt_llm.so REQUIRED - PATHS ${Python3_SITEARCH}/tensorrt_llm/libs - ${TRTLLM_DIR}/cpp/build/tensorrt_llm - ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm/cpp/build/tensorrt_llm) - -find_library( - nvinfer_plugin_tensorrt_llm libnvinfer_plugin_tensorrt_llm.so REQUIRED - PATHS - ${Python3_SITEARCH}/tensorrt_llm/libs - ${TRTLLM_DIR}/cpp/build/tensorrt_llm/plugins - ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm/cpp/build/tensorrt_llm/plugins) - -find_program( - TRTLLM_EXECUTOR_WORKER executorWorker REQUIRED - PATHS - ${Python3_SITEARCH}/tensorrt_llm/bin - ${TRTLLM_DIR}/cpp/build/tensorrt_llm/executor_worker - ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm/cpp/build/tensorrt_llm/executor_worker -) -install( - PROGRAMS ${TRTLLM_EXECUTOR_WORKER} - DESTINATION ${CMAKE_BINARY_DIR} - RENAME trtllmExecutorWorker) - -find_library( - CUDNN_LIB cudnn - HINTS ${CUDA_TOOLKIT_ROOT_DIR} ${CUDNN_ROOT_DIR} - PATH_SUFFIXES lib64 lib) -find_library( - CUBLAS_LIB cublas - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib64 lib lib/stubs) -find_library( - CUBLASLT_LIB cublasLt - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib64 lib lib/stubs) -find_library( - CUDART_LIB cudart - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib lib64) -find_library( - CUDA_DRV_LIB 
cuda - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib lib64 lib/stubs lib64/stubs) -find_library( - NVIDIA_ML_LIB nvidia-ml - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib lib64 lib/stubs lib64/stubs) -set(CUDA_LIBRARIES ${CUDART_LIB} ${NVIDIA_ML_LIB}) - -find_package(MPI REQUIRED) -message(STATUS "Using MPI_INCLUDE_PATH: ${MPI_INCLUDE_PATH}") -message(STATUS "Using MPI_LIBRARIES: ${MPI_LIBRARIES}") - -# NCCL dependencies -set_ifndef(NCCL_LIB_DIR /usr/lib/x86_64-linux-gnu/) -set_ifndef(NCCL_INCLUDE_DIR /usr/include/) -find_library(NCCL_LIB nccl HINTS ${NCCL_LIB_DIR}) - -# TRT_LIB_DIR and TRT_INCLUDE_DIR should be aligned with the path in the -# environment_setup.sh script -set_ifndef(TRT_LIB_DIR - /usr/local/tensorrt/targets/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu/lib) -set_ifndef( - TRT_INCLUDE_DIR - /usr/local/tensorrt/targets/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu/include) - -set(TRT_LIB nvinfer) -find_library_create_target(${TRT_LIB} nvinfer SHARED ${TRT_LIB_DIR}) - -file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" VERSION_STRINGS - REGEX "#define NV_TENSORRT_.*") -foreach(TYPE MAJOR MINOR PATCH BUILD) - string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]+" TRT_TYPE_STRING - ${VERSION_STRINGS}) - string(REGEX MATCH "[0-9]+" TRT_${TYPE} ${TRT_TYPE_STRING}) -endforeach(TYPE) - -foreach(TYPE MAJOR MINOR PATCH) - string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]+" TRT_TYPE_STRING - ${VERSION_STRINGS}) - string(REGEX MATCH "[0-9]+" TRT_SO_${TYPE} ${TRT_TYPE_STRING}) -endforeach(TYPE) - -set(TRT_VERSION - "${TRT_MAJOR}.${TRT_MINOR}.${TRT_PATCH}" - CACHE STRING "TensorRT project version") -set(TRT_SOVERSION - "${TRT_SO_MAJOR}" - CACHE STRING "TensorRT library so version") -message( - STATUS - "Building for TensorRT version: ${TRT_VERSION}, library version: ${TRT_SOVERSION}" -) - -if(${TRT_MAJOR} GREATER_EQUAL 10) - add_definitions("-DTRT_LLM_USE_DIM64") - message( - STATUS "TensorRT version ${TRT_MAJOR} >= 10, int64 dimension is enabled") -endif() - -list(APPEND COMMON_HEADER_DIRS ${TORCH_INCLUDE_DIRS} ${TRT_INCLUDE_DIR}) -include_directories(${COMMON_HEADER_DIRS}) - -target_include_directories( - triton-tensorrt-llm-common - PUBLIC ${TRTLLM_DIR}/cpp - ${TRTLLM_DIR}/cpp/include - ${CMAKE_CURRENT_SOURCE_DIR}/src - ${CUDA_INCLUDE_DIRS} - ${CUDNN_ROOT_DIR}/include - ${NCCL_INCLUDE_DIR} - ${3RDPARTY_DIR}/cutlass/include - ${MPI_INCLUDE_PATH} - ${COMMON_HEADER_DIR}) - -target_compile_features(triton-tensorrt-llm-common PRIVATE cxx_std_17) -target_compile_features(triton-tensorrt-llm-backend PRIVATE cxx_std_17) - -set(COMPILE_OPTIONS - $<$,$,$>: - -Wall - -Wextra - -Wno-unused-parameter - -Wno-deprecated-declarations - -Wno-type-limits> - $<$:/Wall - /D_WIN32_WINNT=0x0A00 - /EHsc>) - -target_compile_options(triton-tensorrt-llm-common PRIVATE ${COMPILE_OPTIONS}) -target_compile_options(triton-tensorrt-llm-backend PRIVATE ${COMPILE_OPTIONS}) - -if(TRITON_ENABLE_METRICS) - list(APPEND REPORTER_SRCS - src/custom_metrics_reporter/custom_metrics_reporter.cc) - list(APPEND REPORTER_HDRS - src/custom_metrics_reporter/custom_metrics_reporter.h) - - add_library(triton-custom-metrics-reporter-library EXCLUDE_FROM_ALL - ${REPORTER_SRCS} ${REPORTER_HDRS}) - target_compile_features(triton-custom-metrics-reporter-library - PRIVATE cxx_std_17) - if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - target_compile_options(triton-custom-metrics-reporter-library - PRIVATE /W1 /D_WIN32_WINNT=0x0A00 /EHsc) - else() - target_compile_options( - triton-custom-metrics-reporter-library - PRIVATE -Wall -Wextra -Wno-unused-parameter 
-Wno-deprecated-declarations - -Werror) - endif() - - set_target_properties(triton-custom-metrics-reporter-library - PROPERTIES POSITION_INDEPENDENT_CODE ON) - - target_link_libraries( - triton-custom-metrics-reporter-library - PUBLIC triton-common-json # from repo-common - triton-common-logging # from repo-common - triton-core-serverapi # from repo-core - triton-core-serverstub # from repo-core - triton-backend-utils # from repo-backend - ${tensorrt_llm}) - - target_compile_definitions(triton-tensorrt-llm-common - PRIVATE TRITON_ENABLE_METRICS=1) - target_link_libraries(triton-tensorrt-llm-common - PRIVATE triton-custom-metrics-reporter-library) -endif() - -target_link_libraries( - triton-tensorrt-llm-common - PUBLIC ${tensorrt_llm} - triton-core-serverapi # from repo-core - triton-core-backendapi # from repo-core - triton-core-serverstub # from repo-core - triton-backend-utils # from repo-backend - ${MPI_LIBRARIES} - ${CUDA_LIBRARIES} - nvinfer - ${nvinfer_plugin_tensorrt_llm}) - -target_link_libraries(triton-tensorrt-llm-backend - PRIVATE triton-tensorrt-llm-common) - -FetchContent_Declare( - json - GIT_REPOSITORY https://github.com/nlohmann/json.git - GIT_TAG v3.11.2) - -FetchContent_MakeAvailable(json) - -target_link_libraries(triton-tensorrt-llm-common - PRIVATE nlohmann_json::nlohmann_json) - -if(WIN32) - set_target_properties( - triton-tensorrt-llm-backend PROPERTIES POSITION_INDEPENDENT_CODE ON - OUTPUT_NAME triton_tensorrtllm) - set_target_properties( - triton-tensorrt-llm-common PROPERTIES POSITION_INDEPENDENT_CODE ON - OUTPUT_NAME triton_tensorrtllm_common) -else() - set_target_properties( - triton-tensorrt-llm-backend - PROPERTIES - POSITION_INDEPENDENT_CODE ON - OUTPUT_NAME triton_tensorrtllm - LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_tensorrtllm.ldscript - LINK_FLAGS - "-Wl,--version-script libtriton_tensorrtllm.ldscript -Wl,-rpath,'$ORIGIN' -Wl,--no-undefined" - ) - set_target_properties( - triton-tensorrt-llm-common PROPERTIES POSITION_INDEPENDENT_CODE ON - OUTPUT_NAME triton_tensorrtllm_common) -endif() - -if(BUILD_TESTS) - enable_testing() - add_subdirectory(tests) -endif() diff --git a/inflight_batcher_llm/README.md b/inflight_batcher_llm/README.md deleted file mode 100644 index 849c1947..00000000 --- a/inflight_batcher_llm/README.md +++ /dev/null @@ -1,379 +0,0 @@ -# Instructions to run TRT-LLM in-flight batching Triton backend: - -## Build TensorRT-LLM engine for inflight batching - -To configure a Triton server that runs a model using TensorRT-LLM, it is needed to compile a TensorRT-LLM engine for that model. - -For example, for LLaMA 7B, change to the `tensorrt_llm/examples/llama` directory: - -``` -cd tensorrt_llm/examples/llama -``` -Prepare the checkpoint of the model by following the instructions [here](https://huggingface.co/docs/transformers/main/en/model_doc/llama) and store it in a model directory. Then, create the engine: - -``` -python build.py --model_dir ${model_directory} \ - --dtype bfloat16 \ - --use_gpt_attention_plugin bfloat16 \ - --use_inflight_batching \ - --paged_kv_cache \ - --remove_input_padding \ - --use_gemm_plugin bfloat16 \ - --output_dir engines/bf16/1-gpu/ -``` - -To disable the support for in-flight batching (i.e. use the V1 batching mode), remove `--use_inflight_batching`. 
- -Similarly, for a GPT model, change to `tensorrt_llm/examples/gpt` directory: -``` -cd tensorrt_llm/examples/gpt - -``` -Prepare the model checkpoint following the instructions in the README file, store it in a model directory and build the TRT engine with: - -``` -python3 build.py --model_dir=${model_directory} \ - --dtype float16 \ - --use_inflight_batching \ - --use_gpt_attention_plugin float16 \ - --paged_kv_cache \ - --use_gemm_plugin float16 \ - --remove_input_padding \ - --hidden_act gelu \ - --output_dir=engines/fp16/1-gpu -``` - -## Create a model repository folder - -First run: -``` -rm -rf triton_model_repo -mkdir triton_model_repo -cp -R all_models/inflight_batcher_llm/* triton_model_repo -``` - -Then copy the TRT engine to `triton_model_repo/tensorrt_llm/1/`. For example for the LLaMA 7B example above, run: - -``` -cp -R tensorrt_llm/examples/llama/engines/bf16/1-gpu/ triton_model_repo/tensorrt_llm/1 -``` - -For the GPT example above, run: -``` -cp -R tensorrt_llm/examples/gpt/engines/fp16/1-gpu/ triton_model_repo/tensorrt_llm/1 -``` - - -Edit the `triton_model_repo/tensorrt_llm/config.pbtxt` file and replace `${decoupled_mode}` with `True` or `False`, and `${engine_dir}` with `/triton_model_repo/tensorrt_llm/1/1-gpu/` since the `triton_model_repo` folder created above will be mounted to `/triton_model_repo` in the Docker container. Decoupled mode must be set to true if using the streaming option from the client. - - -To use V1 batching, the `config.pbtxt` should have: -``` -parameters: { - key: "gpt_model_type" - value: { - string_value: "V1" - } -} -``` - -For in-flight batching, use: -``` -parameters: { - key: "gpt_model_type" - value: { - string_value: "inflight_fused_batching" - } -} -``` - -Note that the parameter `enable_trt_overlap` has been removed from the `config.pbtxt`. This option allowed to overlap execution of two micro-batches to hide CPU overhead. -Optimization work has been done to reduce the CPU overhead and it was found that the overlapping of micro-batches did not provide additional benefits. - -To reuse previously computed KV cache values (e.g. for system prompt), set `enable_kv_cache_reuse` -parameter to `True` in the `config.pbtxt` file: - -``` -parameters: { - key: "enable_kv_cache_reuse" - value: { - string_value: "True" - } -} -``` - -Or, equivalently, add `enable_kv_cache_reuse:True` to the invocation of the -`fill_template.py` tool: - -```bash -python3 tools/fill_template.py -i all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt "enable_kv_cache_reuse:True" -``` - -## Launch the Triton server container using the model_repository you just created - -``` -docker run --rm -it --net host --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --gpus='"'device=0'"' -v $(pwd)/triton_model_repo:/triton_model_repo tritonserver:w_trt_llm_backend /bin/bash -c "tritonserver --model-repository=/triton_model_repo" -``` - -## Run the provided client to send a request - -You can test the inflight batcher server with the provided reference python client as following: -``` -python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 -``` - -You can also stop the generation process early by using the `--stop-after-ms` option to send a stop request after a few milliseconds: - -``` -python inflight_batcher_llm_client.py --stop-after-ms 200 --request-output-len 200 -``` - -You will find that the generation process is stopped early and therefore the number of generated tokens is lower than 200. 
- -You can have a look at the client code to see how early stopping is achieved. - -## Running LoRA inference with inflight batching - -Build a model with LoRA enable - - -``` -BASE_LLAMA_MODEL=llama-7b-hf/ - -python convert_checkpoint.py --model_dir ${BASE_LLAMA_MODEL} \ - --output_dir ./tllm_checkpoint_1gpu \ - --dtype float16 - -trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu \ - --output_dir /tmp/llama_7b_with_lora_qkv/trt_engines/fp16/1-gpu/ \ - --gemm_plugin float16 \ - --max_batch_size 8 \ - --max_input_len 512 \ - --max_output_len 50 \ - --gpt_attention_plugin float16 \ - --paged_kv_cache enable \ - --remove_input_padding enable \ - --use_paged_context_fmha enable \ - --use_custom_all_reduce disable - --lora_plugin float16 \ - --lora_target_modules attn_q attn_k attn_v \ - --max_lora_rank 8 -``` - -Create a Triton model repository and launch the Triton server as described above. - -Now generate LoRA tensors that will be passed in with each request to triton. - -``` -git-lfs clone https://huggingface.co/qychen/luotuo-lora-7b-0.1 -git-lfs clone https://huggingface.co/kunishou/Japanese-Alpaca-LoRA-7b-v0 - -python3 tensorrt_llm/examples/hf_lora_convert.py -i Japanese-Alpaca-LoRA-7b-v0 -o Japanese-Alpaca-LoRA-7b-v0-weights --storage-type float16 -python3 tensorrt_llm/examples/hf_lora_convert.py -i luotuo-lora-7b-0.1 -o luotuo-lora-7b-0.1-weights --storage-type float16 -``` - -### LoRA Cache - -As LoRA weights are passed to the backend they will be cached in a host cache. As requests are scheduled, those weights with be prefetched to a gpu cache. After a LoRA is loaded into the cache, only `lora_task_id` is needed for inference. - - -Optimal adapter size used to size cache pages. Typically optimally sized adapters will fix exactly into 1 cache page. (default: 8) -``` -parameters: { - key: "lora_cache_optimal_adapter_size" - value: { - string_value: "${lora_cache_optimal_adapter_size}" - } -} -``` - - -Used to set the minimum size of a cache page. Pages must be at least large enough to fit a single module, single later adapter_size `maxAdapterSize` row of weights. (default: 64) -``` -parameters: { - key: "lora_cache_max_adapter_size" - value: { - string_value: "${lora_cache_max_adapter_size}" - } -} -``` - -Fraction of GPU memory used for LoRA cache. Computed as a fraction of left over memory after engine load, and after KV cache is loaded (default: 0.05) -``` -parameters: { - key: "lora_cache_gpu_memory_fraction" - value: { - string_value: "${lora_cache_gpu_memory_fraction}" - } -} -``` - -Size of host LoRA cache in bytes (default: 1G) -``` -parameters: { - key: "lora_cache_host_memory_bytes" - value: { - string_value: "${lora_cache_host_memory_bytes}" - } -} -``` - -Launch tritonserver as describe above - -Run Multi-LoRA example by issuing multiple concurrent requests. -The inflight batcher will execute mixed batches with multiple LoRAs in the same batch. - -First we cache the LoRAs by sending dummy requests for each adapter. 
The TASK_IDS are uniq to the adapter - -``` -TASK_IDS=("1" "2") -LORA_PATHS=("luotuo-lora-7b-0.1-weights" "Japanese-Alpaca-LoRA-7b-v0-weights") - -for index in ${!TASK_IDS[@]}; do - text="dummy" - lora_path=${LORA_PATHS[$index]} - task_id=${TASK_IDS[$index]} - lora_arg="--lora-path ${lora_path} --lora-task-id ${task_id}" - - python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py \ - --top-k 0 \ - --top-p 0.5 \ - --request-output-len 10 \ - --text "${text}" \ - --tokenizer-dir /home/scratch.trt_llm_data/llm-models/llama-models/llama-7b-hf \ - ${lora_arg} & -done -``` - -Now perform inference with just `--lora-task-id` - -``` -INPUT_TEXT=("美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:") -TASK_IDS=("" "1" "2" "" "1" "2") - -for index in ${!INPUT_TEXT[@]}; do - text=${INPUT_TEXT[$index]} - task_id=${TASK_IDS[$index]} - lora_arg="" - if [ "${task_id}" != "" ]; then - lora_arg="--lora-task-id ${task_id}" - fi - - python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py \ - --top-k 0 \ - --top-p 0.5 \ - --request-output-len 10 \ - --text "${text}" \ - --tokenizer-dir /home/scratch.trt_llm_data/llm-models/llama-models/llama-7b-hf \ - ${lora_arg} & -done - -wait -``` - -Example Output: -``` -Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] -Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] -Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] -Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] -Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] -Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] -Got completed request -Input: アメリカ合衆国の首都はどこですか? \n答え: -Output beam 0: ワシントン D.C. -Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 29871, 31028, 30373, 30203, 30279, 30203, 360, 29889, 29907, 29889] -Got completed request -Input: 美国的首都在哪里? \n答案: -Output beam 0: Washington, D.C. -What is the -Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 7660, 29892, 360, 29889, 29907, 29889, 13, 5618, 338, 278] -Got completed request -Input: 美国的首都在哪里? \n答案: -Output beam 0: Washington D.C. -Washington D. -Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 7660, 360, 29889, 29907, 29889, 13, 29956, 7321, 360, 29889] -Got completed request -Input: アメリカ合衆国の首都はどこですか? \n答え: -Output beam 0: Washington, D.C. 
-Which of -Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 7660, 29892, 360, 29889, 29907, 29889, 13, 8809, 436, 310] -Got completed request -Input: アメリカ合衆国の首都はどこですか? \n答え: -Output beam 0: Washington D.C. -1. ア -Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 7660, 360, 29889, 29907, 29889, 13, 29896, 29889, 29871, 30310] -Got completed request -Input: 美国的首都在哪里? \n答案: -Output beam 0: 华盛顿 -W -Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 29871, 31266, 234, 158, 158, 236, 164, 194, 13, 29956] -``` - -## Run the e2e/benchmark_core_model to benchmark - -### End to end test -End to end test script sends requests to deployed ensemble model. - -Ensemble model is ensembled by three models: preprocessing, tensorrt_llm and postprocessing. -* preprocessing: Tokenizing, meaning the conversion from prompts(string) to input_ids(list of ints). -* tensorrt_llm: Inferencing. -* postprocessing: De-tokenizing, meaning the conversion from output_ids(list of ints) to outputs(string). - -The end to end latency includes the total latency of the three parts of an ensemble model. - -``` -cd tools/inflight_batcher_llm -python3 end_to_end_test.py --dataset -``` -Expected outputs -``` -[INFO] Functionality test succeed. -[INFO] Warm up for benchmarking. -[INFO] Start benchmarking on 125 prompts. -[INFO] Total Latency: 11099.243 ms -``` - -### benchmark core model - -benchmark_core_model script sends requests directly to deployed tensorrt_llm model, the benchmark core model latency indicates the inference latency of TensorRT-LLM, not including the pre/post-processing latency which is usually handled by a third-party library such as HuggingFace. - -benchmark_core_model can generate traffic from 2 sources. -1 - dataset (json file containning prompts and optional responses) -2 - token normal distribution (user specified input, output seqlen) - -By default, the test uses exponential distrution to control arrival rate of requests. It can be changed to constant arrival time. - -``` -cd tools/inflight_batcher_llm -``` -Example: Run dataset with 10 req/sec requested rate with provided tokenizer. -``` -python3 benchmark_core_model.py -i grpc --request_rate 10 dataset --dataset --tokenizer_dir <> --num_requests 5000 -``` -Example: Generate I/O seqlen tokens with input normal distribution with mean_seqlen=128, stdev=10. Output normal distribution with mean_seqlen=20, stdev=2. Set stdev=0 to get constant seqlens. -``` -python3 benchmark_core_model.py -i grpc --request_rate 10 token_norm_dist --input_mean 128 --input_stdev 5 --output_mean 20 --output_stdev 2 --num_requests 5000 -``` -Expected outputs -``` -[INFO] Warm up for benchmarking. -[INFO] Start benchmarking on 5000 prompts. -[INFO] Total Latency: 26585.349 ms -[INFO] Total request latencies: 11569672.000999955 ms -+----------------------------+----------+ -| Stat | Value | -+----------------------------+----------+ -| Requests/Sec | 188.09 | -| OP tokens/sec | 3857.66 | -| Avg. latency (ms) | 2313.93 | -| P99 latency (ms) | 3624.95 | -| P90 latency (ms) | 3127.75 | -| Avg. IP tokens per request | 128.53 | -| Avg. 
OP tokens per request | 20.51 | -| Total latency (ms) | 26582.72 | -| Total requests | 5000.00 | -+----------------------------+----------+ - -``` -*Please note that the expected outputs in that document are only for reference, specific performance numbers depend on the GPU you're using.* diff --git a/inflight_batcher_llm/client/README.md b/inflight_batcher_llm/client/README.md deleted file mode 100644 index 9b3bea05..00000000 --- a/inflight_batcher_llm/client/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# Sample TRT-LLM backend clients -Three sample TRT-LLM Triton clients are provided with the TRT-LLM Triton backend implementation. -* `e2e_grpc_speculative_decoding_client.py`: Demonstrates how to orchestrate between two independent TRT-LLM models - a draft model and a target model to achiever faster inferencing using speculative decoding. The high level design involves the client making a call to the draft model requesting a certain number of draft tokens, and then associating those draft tokens with a request to the target model. The target model returns some number of completion tokens internally leveraging the draft tokens to speed up inference. The client wraps these back-to-back calls to draft and target models in a loop to complete the full generation. -Example command: -``` -python3 e2e_grpc_speculative_decoding_client.py -p "The only thing we have to fear is" \ - --url-draft ${DRAFT_MODEL_URL} \ - --url-target ${TARGET_MODEL_URL} -``` -To get draft model draft tokens's logits, you need to enable `gather_generation_logits` when building then engine, and add `--return-draft-model-draft-logits` when running `e2e_grpc_speculative_decoding_client.py`. - -To get the target model accepted tokens's logits, you need to enable `gather_generation_logits` when building the engine, and add `--return-target-model-accepted-token-logits` when running `e2e_grpc_speculative_decoding_client.py`. - - -* `end_to_end_grpc_client.py`: Demonstrates sending a single request to a tritonserver running an ensemble including preprocessor (tokenizer), TRT-LLM model and postprocessor (detokenizer) and getting back a completion from it. -Example command: -``` -python3 end_to_end_grpc_client.py \ - --streaming --output-len 10 \ - --prompt "The only thing we have to fear is" - -``` -* `inflight_batcher_llm_client.py`: Isolates queries and responses to the TRT-LLM model alone. Invokes tokenizer and detokenizer in the client script i.e. outside the server running inference. 
-Example command: -``` -python3 inflight_batcher_llm_client.py \ - --tokenizer-dir ${TOKENIZER_PATH} \ - --tokenizer-type ${TOKENIZER_TYPE} \ - --input-tokens-csv=${LOGDIR}/prompts.csv \ - --output-tokens-csv=${LOGDIR}/completions.csv -``` diff --git a/inflight_batcher_llm/client/__init__.py b/inflight_batcher_llm/client/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/inflight_batcher_llm/client/e2e_grpc_speculative_decoding_client.py b/inflight_batcher_llm/client/e2e_grpc_speculative_decoding_client.py deleted file mode 100644 index ddcba288..00000000 --- a/inflight_batcher_llm/client/e2e_grpc_speculative_decoding_client.py +++ /dev/null @@ -1,564 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) - -import argparse -import queue -import sys - -import numpy as np -import tritonclient.grpc as grpcclient -from tritonclient.utils import InferenceServerException, np_to_triton_dtype - - -def prepare_tensor(name, input): - t = grpcclient.InferInput(name, input.shape, - np_to_triton_dtype(input.dtype)) - t.set_data_from_numpy(input) - return t - - -class UserData: - - def __init__(self): - self._completed_requests = queue.Queue() - - -def callback(user_data, result, error): - if error: - user_data._completed_requests.put(error) - else: - user_data._completed_requests.put(result) - output = result.as_numpy('text_output') - print(output, flush=True) - - -def get_preprocessor_inputs(prompt, output_len, bad_words, stop_words, end_id, - pad_id): - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.int32) * output_len - - preprocessor_inputs = [ - prepare_tensor("QUERY", input0_data), - prepare_tensor("REQUEST_OUTPUT_LEN", output0_len), - ] - - if bad_words: - bad_words_list = np.array([bad_words], dtype=object) - preprocessor_inputs += [ - prepare_tensor("BAD_WORDS_DICT", bad_words_list) - ] - - if stop_words: - stop_words_list = np.array([stop_words], dtype=object) - preprocessor_inputs += [ - prepare_tensor("STOP_WORDS_DICT", stop_words_list) - ] - - if end_id: - end_id_data = np.array([[end_id]], dtype=np.int32) - preprocessor_inputs += [prepare_tensor("END_ID", end_id_data)] - - if pad_id: - pad_id_data = np.array([[pad_id]], dtype=np.int32) - preprocessor_inputs += [prepare_tensor("PAD_ID", pad_id_data)] - - return preprocessor_inputs - - -def extract_preprocessor_outputs(result): - - input_ids = np.squeeze(result.as_numpy("INPUT_ID").astype(np.int32), - axis=0) - bad_words_ids = result.as_numpy("BAD_WORDS_IDS").astype(np.int32) - stop_words_ids = result.as_numpy("STOP_WORDS_IDS").astype(np.int32) - end_id = result.as_numpy("OUT_END_ID").astype(np.int32)[0][0] - pad_id = result.as_numpy("OUT_PAD_ID").astype(np.int32)[0][0] - - return input_ids, bad_words_ids, stop_words_ids, end_id, pad_id - - -def get_trtllm_inputs(input_ids, - input_length, - request_output_len, - draft_tokens, - beam_width, - temperature, - repetition_penalty, - presence_penalty, - frequency_penalty, - bad_words_ids, - stop_words_ids, - end_id, - pad_id, - return_draft_model_draft_logits=False, - return_target_model_accepted_token_logits=False): - - # These two flags correspond to the settings of draft model and target model respectively, - # and only one of them can be true at a time. 
- assert not (return_draft_model_draft_logits - and return_target_model_accepted_token_logits) - - # input_ids is expected to have shape [input_length] - # Add batch dimension of 1 - input_ids_data = np.expand_dims(input_ids, axis=0) - inputs = [ - prepare_tensor("input_ids", input_ids_data), - prepare_tensor("input_lengths", - np.array([[input_length]], dtype=np.int32)), - prepare_tensor("request_output_len", - np.array([[request_output_len]], dtype=np.int32)), - prepare_tensor("bad_words_list", bad_words_ids), - prepare_tensor("stop_words_list", stop_words_ids), - prepare_tensor("beam_width", np.array([[beam_width]], dtype=np.int32)), - prepare_tensor("temperature", - np.array([[temperature]], dtype=np.float32)), - ] - - if draft_tokens is not None: - draft_tokens_data = np.array([draft_tokens], dtype=np.int32) - inputs.append(prepare_tensor("draft_input_ids", draft_tokens_data)) - - if repetition_penalty is not None: - repetition_penalty_data = np.array([[repetition_penalty]], - dtype=np.float32) - inputs.append( - prepare_tensor("repetition_penalty", repetition_penalty_data)) - - if presence_penalty is not None: - presence_penalty_data = np.array([[presence_penalty]], - dtype=np.float32) - inputs.append(prepare_tensor("presence_penalty", - presence_penalty_data)) - - if frequency_penalty is not None: - frequency_penalty_data = np.array([[frequency_penalty]], - dtype=np.float32) - inputs.append( - prepare_tensor("frequency_penalty", frequency_penalty_data)) - - if end_id is not None: - end_id_data = np.array([[end_id]], dtype=np.int32) - inputs.append(prepare_tensor("end_id", end_id_data)) - - if pad_id is not None: - pad_id_data = np.array([[pad_id]], dtype=np.int32) - inputs.append(prepare_tensor("pad_id", pad_id_data)) - - if return_draft_model_draft_logits: - return_draft_model_draft_logits_data = np.array( - [[return_draft_model_draft_logits]], dtype=bool) - inputs.append( - prepare_tensor("return_generation_logits", - return_draft_model_draft_logits_data)) - - if return_target_model_accepted_token_logits: - return_target_model_accepted_token_logits_data = np.array( - [[return_target_model_accepted_token_logits]], dtype=bool) - inputs.append( - prepare_tensor("return_generation_logits", - return_target_model_accepted_token_logits_data)) - - return inputs - - -def check_result(result, model_name): - if type(result) == InferenceServerException: - print( - f"Received an error from server while calling {model_name}: {result}" - ) - - -def extract_trtllm_outputs(result): - # Get batch 0, beam 0 output_ids - output_ids = np.squeeze(result.as_numpy("output_ids").astype(np.int32), - axis=(0, 1)) - sequence_length_data = result.as_numpy("sequence_length").astype(np.int32) - assert sequence_length_data.shape[0] == 1 - assert sequence_length_data.shape[1] == 1 - sequence_length = sequence_length_data[0, 0] - cum_log_probs = result.as_numpy("cum_log_probs").astype(np.float32) - output_log_probs = result.as_numpy("output_log_probs").astype(np.float32) - context_logits = result.as_numpy("context_logits").astype(np.float32) - generation_logits = result.as_numpy("generation_logits").astype(np.float32) - return output_ids, sequence_length, cum_log_probs, output_log_probs, context_logits, generation_logits - - -def get_postprocessor_inputs(output_ids, cum_log_probs, output_log_probs, - context_logits, generation_logits): - output_ids_data = np.expand_dims(output_ids, axis=(0, 1)) - inputs = [ - prepare_tensor("TOKENS_BATCH", output_ids_data), - prepare_tensor("SEQUENCE_LENGTH", - 
np.array([[len(output_ids)]], dtype=np.int32)), - prepare_tensor("CUM_LOG_PROBS", cum_log_probs), - prepare_tensor("OUTPUT_LOG_PROBS", output_log_probs), - prepare_tensor("CONTEXT_LOGITS", context_logits), - prepare_tensor("GENERATION_LOGITS", generation_logits) - ] - - return inputs - - -def encountered_stop_words(input_ids, stop_words_ids): - for stop_word_ids in stop_words_ids: - if np.array_equal(input_ids[-len(stop_word_ids):], stop_word_ids): - return True - return False - - -def run_speculative_inference( - client_draft, client_target, prompt, output_len, in_num_draft_tokens, - request_id, repetition_penalty, presence_penalty, frequency_penalty, - temperature, stop_words, bad_words, end_id, pad_id, beam_width, - preprocessor_model_name, draft_tensorrt_llm_model_name, - target_tensorrt_llm_model_name, postprocessor_model_name, - return_draft_model_draft_logits, - return_target_model_accepted_token_logits, verbose): - - # Call the preprocessor - preprocessor_inputs = get_preprocessor_inputs(prompt, output_len, - bad_words, stop_words, - end_id, pad_id) - preprocessor_result = client_draft.infer(preprocessor_model_name, - preprocessor_inputs, - request_id=request_id) - check_result(preprocessor_result, preprocessor_model_name) - prompt_input_ids, bad_words_ids, stop_words_ids, end_id, pad_id = extract_preprocessor_outputs( - preprocessor_result) - - input_ids = prompt_input_ids - last_input_ids = None - draft_output_ids = None - - while True: - - num_draft_tokens = min( - in_num_draft_tokens, - len(prompt_input_ids) + output_len - len(input_ids) - 1) - - if num_draft_tokens > 0: - - if verbose: - print("Draft model input ids:") - print(input_ids.tolist()) - - #Generate up to num_draft_tokens with draft model - draft_inputs = get_trtllm_inputs( - input_ids, - len(input_ids), - num_draft_tokens, - None, - beam_width, - temperature, - repetition_penalty, - presence_penalty, - frequency_penalty, - bad_words_ids, - stop_words_ids, - end_id, - pad_id, - return_draft_model_draft_logits=return_draft_model_draft_logits - ) - - draft_result = client_draft.infer(draft_tensorrt_llm_model_name, - draft_inputs, - request_id=request_id) - check_result(draft_result, draft_tensorrt_llm_model_name) - draft_output_ids, draft_seq_len, cum_log_probs, output_log_probs, context_logits, generation_logits = extract_trtllm_outputs( - draft_result) - - if verbose: - print("Draft model output ids:") - print(draft_output_ids.tolist()) - print("draft_sequence_length") - print(draft_seq_len) - - # Set the draft token and call the target model to generate up to num_draft_tokens + 1 - draft_tokens = draft_output_ids[len(input_ids):draft_seq_len] - - if verbose: - print("draft_tokens") - print(draft_tokens.tolist()) - if return_draft_model_draft_logits: - draft_model_draft_token_logits = generation_logits.squeeze( - 0) # [beam_width, num_draft_tokens, vocab_size] - print( - f"draft model draft tokens' logits: shape: {draft_model_draft_token_logits.shape}, value: {draft_model_draft_token_logits}" - ) - - if verbose: - print("Target model input ids") - print(input_ids.tolist()) - - # Generate up to len(draft_tokens) + 1 with target model - target_inputs = get_trtllm_inputs( - input_ids, - len(input_ids), - len(draft_tokens) + 1 if num_draft_tokens > 0 else 1, - draft_tokens if num_draft_tokens > 0 else None, - beam_width, - temperature, - repetition_penalty, - presence_penalty, - frequency_penalty, - bad_words_ids, - stop_words_ids, - end_id, - pad_id, - return_target_model_accepted_token_logits= - 
return_target_model_accepted_token_logits) - - target_result = client_target.infer(target_tensorrt_llm_model_name, - target_inputs, - request_id=request_id) - check_result(target_result, target_tensorrt_llm_model_name) - target_output_ids, seq_length, cum_log_probs, output_log_probs, context_logits, generation_logits = extract_trtllm_outputs( - target_result) - - if verbose: - print("Target model output_ids") - print(target_output_ids.tolist()) - print("target seq_length") - print(seq_length) - if return_target_model_accepted_token_logits: - target_model_accept_token_logits = generation_logits.squeeze( - 0).squeeze(0) # [num_accepted_tokens, vocab_size] - print( - f"target model accepted tokens' logits: shape: {target_model_accept_token_logits.shape}, value: {target_model_accept_token_logits}" - ) - - # Store the last iteration input_ids to check if EOS was encountered - last_input_ids = input_ids - # Update the input ids with new output_ids - input_ids = target_output_ids - - # Evaluate criteria to stop generation loop. - # If we've hit or exceeded the max output length, should stop - length_stop = (len(input_ids) >= len(prompt_input_ids) + output_len) - # If draft and target have same outputs, should stop. Normally target should return 1 more token. - # If they are the same length, they should differ at the last token - target_draft_equal = draft_output_ids is not None and np.array_equal( - draft_output_ids, target_output_ids) - # If tokens no longer change, should stop, means we have hit early stopping - last_current_equal = np.array_equal(last_input_ids, input_ids) - # Need to check if stop words was encountered - hit_stop_words = encountered_stop_words(input_ids, stop_words_ids[0]) - - if verbose: - print("length_stop:", length_stop) - print("target_draft_equal:", target_draft_equal) - print("last_current_equal:", last_current_equal) - print("hit_stop_words:", hit_stop_words) - - if (length_stop or target_draft_equal or last_current_equal - or hit_stop_words): - break - - # Call the postprocessor - postprocessor_inputs = get_postprocessor_inputs(input_ids, cum_log_probs, - output_log_probs, - context_logits, - generation_logits) - postprocessor_result = client_target.infer(postprocessor_model_name, - postprocessor_inputs, - request_id=request_id) - check_result(postprocessor_result, postprocessor_model_name) - output = postprocessor_result.as_numpy("OUTPUT") - return output[0].decode("utf8") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - - parser.add_argument('--url-target', - type=str, - required=True, - help='Inference server URL for the target model') - - parser.add_argument('--url-draft', - type=str, - required=False, - help='Inference server URL for the draft model') - - parser.add_argument( - '--preprocessor-model-name', - type=str, - required=False, - default="preprocessing", - help='Name of the preprocessor model (should be hosted at url-draft)') - - parser.add_argument( - '--postprocessor-model-name', - type=str, - required=False, - default="postprocessing", - help='Name of the postprocessor model (should be hosted at url-target)' - ) - - parser.add_argument( - '--draft-tensorrt-llm-model-name', - type=str, - required=False, - default="tensorrt_llm", - help='Name of the tensorrt_llm draft model (hosted at url-draft)') - - parser.add_argument( - '--target-tensorrt-llm-model-name', - type=str, - 
required=False, - default="tensorrt_llm", - help='Name of the tensorrt_llm draft model (hosted at url-target)') - - parser.add_argument('-p', - '--prompt', - type=str, - required=True, - help='Input prompt.') - - parser.add_argument( - "-b", - "--beam-width", - required=False, - type=int, - default=1, - help="Beam width value", - ) - - parser.add_argument( - "--temperature", - type=float, - required=False, - default=1.0, - help="temperature value", - ) - - parser.add_argument( - "--repetition-penalty", - type=float, - required=False, - default=None, - help="The repetition penalty value", - ) - - parser.add_argument( - "--presence-penalty", - type=float, - required=False, - default=None, - help="The presence penalty value", - ) - - parser.add_argument( - "--frequency-penalty", - type=float, - required=False, - default=None, - help="The frequency penalty value", - ) - - parser.add_argument('-o', - '--output-len', - type=int, - default=100, - required=False, - help='Specify output length') - - parser.add_argument( - '--num-draft-tokens', - type=int, - default=5, - required=False, - help= - 'Specify the number of speculative tokens for the draft model to generate per lookahead.' - ) - - parser.add_argument('--end-id', - type=int, - default=None, - required=False, - help='The end if token') - - parser.add_argument('--pad-id', - type=int, - default=None, - required=False, - help='The pad if token') - - parser.add_argument('--request-id', - type=str, - default='1', - required=False, - help='The request_id for the stop request') - - parser.add_argument('--stop-words', - nargs='+', - default=[], - help='The stop words') - - parser.add_argument('--bad-words', - nargs='+', - default=[], - help='The bad words') - - parser.add_argument( - "--return-draft-model-draft-logits", - action="http://github.com/store_true", - required=False, - default=False, - help= - "Return draft model's draft tokens' logits, require to enable `gather_generation_logits` when build engine" - ) - - parser.add_argument( - "--return-target-model-accepted-token-logits", - action="http://github.com/store_true", - required=False, - default=False, - help= - "Return target model's accepted token logits, require to enable `gather_generation_logits` when build engine", - ) - - FLAGS = parser.parse_args() - if not FLAGS.url_target: - FLAGS.url_target = "localhost:8001" - - if not FLAGS.url_draft: - FLAGS.url_draft = FLAGS.url_target - - try: - client_target = grpcclient.InferenceServerClient(url=FLAGS.url_target) - client_draft = grpcclient.InferenceServerClient( - url=FLAGS.url_draft) if ( - FLAGS.url_target != FLAGS.url_draft) else client_target - except Exception as e: - print("client creation failed: " + str(e)) - sys.exit(1) - - if (FLAGS.beam_width > 1): - raise Exception( - 'Beam width > 1 is not yet supported with speculative decoding') - - output_text = run_speculative_inference( - client_draft, client_target, FLAGS.prompt, FLAGS.output_len, - FLAGS.num_draft_tokens, FLAGS.request_id, FLAGS.repetition_penalty, - FLAGS.presence_penalty, FLAGS.frequency_penalty, FLAGS.temperature, - FLAGS.stop_words, FLAGS.bad_words, FLAGS.end_id, FLAGS.pad_id, - FLAGS.beam_width, FLAGS.preprocessor_model_name, - FLAGS.draft_tensorrt_llm_model_name, - FLAGS.target_tensorrt_llm_model_name, FLAGS.postprocessor_model_name, - FLAGS.return_draft_model_draft_logits, - FLAGS.return_target_model_accepted_token_logits, FLAGS.verbose) - - # Print the final text - print("Final text:\n", output_text) diff --git 
a/inflight_batcher_llm/client/end_to_end_grpc_client.py b/inflight_batcher_llm/client/end_to_end_grpc_client.py deleted file mode 100644 index 4a0240cb..00000000 --- a/inflight_batcher_llm/client/end_to_end_grpc_client.py +++ /dev/null @@ -1,380 +0,0 @@ -#!/usr/bin/python - -import os -import sys -from functools import partial - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) - -import argparse -import queue -import sys - -import numpy as np -import tritonclient.grpc as grpcclient -from tritonclient.utils import InferenceServerException, np_to_triton_dtype - - -def prepare_tensor(name, input): - t = grpcclient.InferInput(name, input.shape, - np_to_triton_dtype(input.dtype)) - t.set_data_from_numpy(input) - return t - - -class UserData: - - def __init__(self): - self._completed_requests = queue.Queue() - - -def callback(user_data, result, error): - if error: - user_data._completed_requests.put(error) - else: - user_data._completed_requests.put(result) - - -def run_inference(triton_client, - prompt, - output_len, - request_id, - repetition_penalty, - presence_penalty, - frequency_penalty, - temperature, - stop_words, - bad_words, - embedding_bias_words, - embedding_bias_weights, - model_name, - streaming, - beam_width, - overwrite_output_text, - return_context_logits_data, - return_generation_logits_data, - end_id, - pad_id, - verbose, - num_draft_tokens=0, - use_draft_logits=None): - - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.int32) * output_len - streaming_data = np.array([[streaming]], dtype=bool) - beam_width_data = np.array([[beam_width]], dtype=np.int32) - temperature_data = np.array([[temperature]], dtype=np.float32) - - inputs = [ - prepare_tensor("text_input", input0_data), - prepare_tensor("max_tokens", output0_len), - prepare_tensor("stream", streaming_data), - prepare_tensor("beam_width", beam_width_data), - prepare_tensor("temperature", temperature_data), - ] - - if num_draft_tokens > 0: - inputs.append( - prepare_tensor("num_draft_tokens", - np.array([[num_draft_tokens]], dtype=np.int32))) - if use_draft_logits is not None: - inputs.append( - prepare_tensor("use_draft_logits", - np.array([[use_draft_logits]], dtype=bool))) - - if bad_words: - bad_words_list = np.array([bad_words], dtype=object) - inputs += [prepare_tensor("bad_words", bad_words_list)] - - if stop_words: - stop_words_list = np.array([stop_words], dtype=object) - inputs += [prepare_tensor("stop_words", stop_words_list)] - - if repetition_penalty is not None: - repetition_penalty = [[repetition_penalty]] - repetition_penalty_data = np.array(repetition_penalty, - dtype=np.float32) - inputs += [ - prepare_tensor("repetition_penalty", repetition_penalty_data) - ] - - if presence_penalty is not None: - presence_penalty = [[presence_penalty]] - presence_penalty_data = np.array(presence_penalty, dtype=np.float32) - inputs += [prepare_tensor("presence_penalty", presence_penalty_data)] - - if frequency_penalty is not None: - frequency_penalty = [[frequency_penalty]] - frequency_penalty_data = np.array(frequency_penalty, dtype=np.float32) - inputs += [prepare_tensor("frequency_penalty", frequency_penalty_data)] - - if return_context_logits_data is not None: - inputs += [ - prepare_tensor("return_context_logits", - return_context_logits_data), - ] - - if return_generation_logits_data is not None: - inputs += [ - prepare_tensor("return_generation_logits", - return_generation_logits_data), - ] - - if (embedding_bias_words 
is not None and embedding_bias_weights is None - ) or (embedding_bias_words is None - and embedding_bias_weights is not None): - assert 0, "Both embedding bias words and weights must be specified" - - if (embedding_bias_words is not None - and embedding_bias_weights is not None): - assert len(embedding_bias_words) == len( - embedding_bias_weights - ), "Embedding bias weights and words must have same length" - embedding_bias_words_data = np.array([embedding_bias_words], - dtype=object) - embedding_bias_weights_data = np.array([embedding_bias_weights], - dtype=np.float32) - inputs.append( - prepare_tensor("embedding_bias_words", embedding_bias_words_data)) - inputs.append( - prepare_tensor("embedding_bias_weights", - embedding_bias_weights_data)) - if end_id is not None: - end_id_data = np.array([[end_id]], dtype=np.int32) - inputs += [prepare_tensor("end_id", end_id_data)] - - if pad_id is not None: - pad_id_data = np.array([[pad_id]], dtype=np.int32) - inputs += [prepare_tensor("pad_id", pad_id_data)] - - user_data = UserData() - # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) - # Send request - triton_client.async_stream_infer(model_name, inputs, request_id=request_id) - - #Wait for server to close the stream - triton_client.stop_stream() - - # Parse the responses - output_text = "" - while True: - try: - result = user_data._completed_requests.get(block=False) - except Exception: - break - - if type(result) == InferenceServerException: - print("Received an error from server:") - print(result) - else: - output = result.as_numpy('text_output') - if streaming and beam_width == 1: - new_output = output[0].decode("utf-8") - if overwrite_output_text: - output_text = new_output - else: - output_text += new_output - else: - output_text = output[0].decode("utf-8") - if verbose: - print(output, flush=True) - - if return_context_logits_data is not None: - context_logits = result.as_numpy('context_logits') - if verbose: - print(f"context_logits.shape: {context_logits.shape}") - print(f"context_logits: {context_logits}") - if return_generation_logits_data is not None: - generation_logits = result.as_numpy('generation_logits') - if verbose: - print( - f"generation_logits.shape: {generation_logits.shape}") - print(f"generation_logits: {generation_logits}") - - if streaming and beam_width == 1: - if verbose: - print(output_text) - - return output_text - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - - parser.add_argument('-p', - '--prompt', - type=str, - required=True, - help='Input prompt.') - - parser.add_argument('--model-name', - type=str, - required=False, - default="ensemble", - choices=["ensemble", "tensorrt_llm_bls"], - help='Name of the Triton model to send request to') - - parser.add_argument( - "-S", - "--streaming", - action="http://github.com/store_true", - required=False, - default=False, - help="Enable streaming mode. 
Default is False.", - ) - - parser.add_argument( - "-b", - "--beam-width", - required=False, - type=int, - default=1, - help="Beam width value", - ) - - parser.add_argument( - "--temperature", - type=float, - required=False, - default=1.0, - help="temperature value", - ) - - parser.add_argument( - "--repetition-penalty", - type=float, - required=False, - default=None, - help="The repetition penalty value", - ) - - parser.add_argument( - "--presence-penalty", - type=float, - required=False, - default=None, - help="The presence penalty value", - ) - - parser.add_argument( - "--frequency-penalty", - type=float, - required=False, - default=None, - help="The frequency penalty value", - ) - - parser.add_argument('-o', - '--output-len', - type=int, - default=100, - required=False, - help='Specify output length') - - parser.add_argument('--request-id', - type=str, - default='', - required=False, - help='The request_id for the stop request') - - parser.add_argument('--stop-words', - nargs='+', - default=[], - help='The stop words') - - parser.add_argument('--bad-words', - nargs='+', - default=[], - help='The bad words') - - parser.add_argument('--embedding-bias-words', - nargs='+', - default=[], - help='The biased words') - - parser.add_argument('--embedding-bias-weights', - nargs='+', - default=[], - help='The biased words weights') - - parser.add_argument( - '--overwrite-output-text', - action="http://github.com/store_true", - required=False, - default=False, - help= - 'In streaming mode, overwrite previously received output text instead of appending to it' - ) - - parser.add_argument( - "--return-context-logits", - action="http://github.com/store_true", - required=False, - default=False, - help= - "Return context logits, the engine must be built with gather_context_logits or gather_all_token_logits", - ) - - parser.add_argument( - "--return-generation-logits", - action="http://github.com/store_true", - required=False, - default=False, - help= - "Return generation logits, the engine must be built with gather_ generation_logits or gather_all_token_logits", - ) - - parser.add_argument('--end-id', - type=int, - required=False, - help='The token id for end token.') - - parser.add_argument('--pad-id', - type=int, - required=False, - help='The token id for pad token.') - - FLAGS = parser.parse_args() - if FLAGS.url is None: - FLAGS.url = "localhost:8001" - - embedding_bias_words = FLAGS.embedding_bias_words if FLAGS.embedding_bias_words else None - embedding_bias_weights = FLAGS.embedding_bias_weights if FLAGS.embedding_bias_weights else None - - try: - client = grpcclient.InferenceServerClient(url=FLAGS.url) - except Exception as e: - print("client creation failed: " + str(e)) - sys.exit(1) - - return_context_logits_data = None - if FLAGS.return_context_logits: - return_context_logits_data = np.array([[FLAGS.return_context_logits]], - dtype=bool) - - return_generation_logits_data = None - if FLAGS.return_generation_logits: - return_generation_logits_data = np.array( - [[FLAGS.return_generation_logits]], dtype=bool) - - output_text = run_inference( - client, FLAGS.prompt, FLAGS.output_len, FLAGS.request_id, - FLAGS.repetition_penalty, FLAGS.presence_penalty, - FLAGS.frequency_penalty, FLAGS.temperature, FLAGS.stop_words, - FLAGS.bad_words, embedding_bias_words, embedding_bias_weights, - FLAGS.model_name, FLAGS.streaming, FLAGS.beam_width, - FLAGS.overwrite_output_text, return_context_logits_data, - return_generation_logits_data, FLAGS.end_id, FLAGS.pad_id, True) diff --git 
a/inflight_batcher_llm/client/inflight_batcher_llm_client.py b/inflight_batcher_llm/client/inflight_batcher_llm_client.py deleted file mode 100755 index f0e3a837..00000000 --- a/inflight_batcher_llm/client/inflight_batcher_llm_client.py +++ /dev/null @@ -1,874 +0,0 @@ -#!/usr/bin/env python -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
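
The client removed below talks to the tensorrt_llm model directly with token IDs rather than text, which requires the decoupled, inflight-batching configuration described in its header comment. The following is a compressed sketch of the tokenize, stream, and detokenize round trip it performs; the tokenizer name, server URL, and output length are assumptions, only a subset of the tensors the full client prepares is shown, and the per-response token layout is a simplification.

```python
import queue

import numpy as np
import tritonclient.grpc as grpcclient
from transformers import AutoTokenizer
from tritonclient.utils import np_to_triton_dtype

def prepare_tensor(name, array):
    t = grpcclient.InferInput(name, array.shape, np_to_triton_dtype(array.dtype))
    t.set_data_from_numpy(array)
    return t

responses = queue.Queue()
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # assumed tokenizer directory
input_ids = np.array(
    [tokenizer.encode("Born in north-east France, Soyer trained as a")], dtype=np.int32)

inputs = [
    prepare_tensor("input_ids", input_ids),
    prepare_tensor("input_lengths", np.array([[input_ids.shape[1]]], dtype=np.int32)),
    prepare_tensor("request_output_len", np.array([[16]], dtype=np.int32)),
    prepare_tensor("streaming", np.array([[True]], dtype=bool)),
]

with grpcclient.InferenceServerClient(url="localhost:8001") as client:
    client.start_stream(callback=lambda result, error: responses.put(error or result))
    client.async_stream_infer("tensorrt_llm", inputs, request_id="1")
    client.stop_stream()  # blocks until the server closes the stream

tokens = []
while not responses.empty():
    response = responses.get()
    if isinstance(response, Exception):
        raise response
    output_ids = response.as_numpy("output_ids")
    if output_ids is not None:
        # Each streamed response carries the token(s) produced for that step.
        tokens.extend(output_ids[0][0].tolist())
print(tokenizer.decode(tokens))
```
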
- -import argparse -import csv -import os -import queue -import sys -import time -from functools import partial - -import numpy as np -import tritonclient.grpc as grpcclient -from transformers import AutoTokenizer -from tritonclient.utils import InferenceServerException, np_to_triton_dtype - -# -# Simple streaming client for TRT-LLM inflight bacthing backend -# -# In order for this code to work properly, config.pbtxt must contain these values: -# -# model_transaction_policy { -# decoupled: True -# } -# -# parameters: { -# key: "gpt_model_type" -# value: { -# string_value: "inflight_batching" -# } -# } -# -# In order for gpt_model_type 'inflight_batching' to work, you must copy engine from -# -# tensorrt_llm/cpp/tests/resources/models/rt_engine/gpt2/fp16-inflight-batching-plugin/1-gpu/ -# - -np_bfloat16 = np.dtype('V2', metadata={"dtype": "bfloat16"}) - -_str_to_np_dict = dict( - float16=np.float16, - float32=np.float32, - int32=np.int32, - bfloat16=np_bfloat16, -) - - -def curate_log_output(token_sequence, - identifier="Input", - log_max_sequence_len=256): - if len(token_sequence) > log_max_sequence_len: - print(f"{identifier} sequence starts with: ", - token_sequence[:log_max_sequence_len]) - else: - print(f"{identifier} sequence: ", token_sequence) - - -def str_dtype_to_np(dtype): - ret = _str_to_np_dict.get(dtype) - assert ret is not None, f'Unsupported dtype: {dtype}' - return ret - - -def check_output_names(expected_outputs, infer_result): - if expected_outputs: - output_names = set([o.name for o in infer_result._result.outputs]) - if set(expected_outputs) != output_names: - raise Exception( - f"expected outputs do not match actual outputs {expected_outputs} != {output_names}" - ) - - -class UserData: - - def __init__(self): - self._completed_requests = queue.Queue() - - -def prepare_tensor(name, input): - t = grpcclient.InferInput(name, input.shape, - np_to_triton_dtype(input.dtype)) - t.set_data_from_numpy(input) - return t - - -def prepare_outputs(output_names): - - outputs = [] - for output_name in output_names: - outputs.append(grpcclient.InferRequestedOutput(output_name)) - return outputs - - -def prepare_inputs(input_ids_data, input_lengths_data, request_output_len_data, - beam_width_data, temperature_data, repetition_penalty_data, - presence_penalty_data, frequency_penalty_data, - streaming_data, end_id, pad_id, prompt_embedding_table_data, - prompt_vocab_size_data, lora_task_id_data, - lora_weights_data, lora_config_data, return_log_probs_data, - top_k_data, top_p_data, draft_ids_data, - return_context_logits_data, return_generation_logits_data, - decoder_input_ids_data): - inputs = [ - prepare_tensor("input_ids", input_ids_data), - prepare_tensor("input_lengths", input_lengths_data), - prepare_tensor("request_output_len", request_output_len_data), - prepare_tensor("beam_width", beam_width_data), - prepare_tensor("temperature", temperature_data), - prepare_tensor("streaming", streaming_data), - prepare_tensor("end_id", end_id), - prepare_tensor("pad_id", pad_id), - prepare_tensor("return_log_probs", return_log_probs_data), - prepare_tensor("runtime_top_k", top_k_data), - prepare_tensor("runtime_top_p", top_p_data), - ] - if prompt_embedding_table_data is not None: - inputs += [ - prepare_tensor("prompt_embedding_table", - prompt_embedding_table_data), - prepare_tensor("prompt_vocab_size", prompt_vocab_size_data) - ] - if lora_task_id_data is not None: - inputs += [prepare_tensor("lora_task_id", lora_task_id_data)] - if lora_weights_data is not None: - inputs += [ - 
prepare_tensor("lora_weights", lora_weights_data), - prepare_tensor("lora_config", lora_config_data), - ] - if repetition_penalty_data is not None: - inputs += [ - prepare_tensor("repetition_penalty", repetition_penalty_data), - ] - if presence_penalty_data is not None: - inputs += [ - prepare_tensor("presence_penalty", presence_penalty_data), - ] - if frequency_penalty_data is not None: - inputs += [ - prepare_tensor("frequency_penalty", frequency_penalty_data), - ] - if draft_ids_data is not None: - inputs += [ - prepare_tensor("draft_input_ids", draft_ids_data), - ] - if return_context_logits_data is not None: - inputs += [ - prepare_tensor("return_context_logits", - return_context_logits_data), - ] - if return_generation_logits_data is not None: - inputs += [ - prepare_tensor("return_generation_logits", - return_generation_logits_data), - ] - if decoder_input_ids_data is not None: - inputs += [ - prepare_tensor("decoder_input_ids", decoder_input_ids_data), - ] - return inputs - - -def prepare_stop_signals(): - - inputs = [ - grpcclient.InferInput('input_ids', [1, 1], "INT32"), - grpcclient.InferInput('input_lengths', [1, 1], "INT32"), - grpcclient.InferInput('request_output_len', [1, 1], "INT32"), - grpcclient.InferInput('stop', [1, 1], "BOOL"), - ] - - inputs[0].set_data_from_numpy(np.empty([1, 1], dtype=np.int32)) - inputs[1].set_data_from_numpy(np.zeros([1, 1], dtype=np.int32)) - inputs[2].set_data_from_numpy(np.array([[0]], dtype=np.int32)) - inputs[3].set_data_from_numpy(np.array([[True]], dtype='bool')) - - return inputs - - -# Define the callback function. Note the last two parameters should be -# result and error. InferenceServerClient would povide the results of an -# inference as grpcclient.InferResult in result. For successful -# inference, error will be None, otherwise it will be an object of -# tritonclientutils.InferenceServerException holding the error details -def callback(user_data, result, error): - if error: - user_data._completed_requests.put(error) - else: - user_data._completed_requests.put(result) - if (FLAGS.streaming): - if result.get_output('output_ids') is not None: - output_ids = result.as_numpy('output_ids') - seq_lens = result.as_numpy('sequence_length') - if seq_lens == None or seq_lens[0][0] > 0: - tokens = list(output_ids[0][0]) - print(tokens, flush=True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-v", - "--verbose", - action="http://github.com/store_true", - required=False, - default=False, - help="Enable verbose output", - ) - parser.add_argument( - "-u", - "--url", - type=str, - required=False, - default="localhost:8001", - help="Inference server URL. Default is localhost:8001.", - ) - parser.add_argument( - '--text', - type=str, - required=False, - default='Born in north-east France, Soyer trained as a', - help='Input text') - - parser.add_argument('--input-tokens-csv', - type=str, - required=False, - default='', - help='Path to csv file containing the input tokens') - - parser.add_argument('--draft-tokens-csv', - type=str, - required=False, - default='', - help='Path to csv file containing the draft tokens') - - parser.add_argument( - '--output-tokens-csv', - type=str, - required=False, - default='', - help='Path to csv file containing the expected output tokens') - - parser.add_argument( - '--end-id', - type=int, - required=False, - default=50256, - help='The token id for end token. Only needed if tokenizer is not used.' 
- ) - - parser.add_argument( - '--pad-id', - type=int, - required=False, - default=50256, - help='The token id for pad token. Only needed if tokenizer is not used.' - ) - - parser.add_argument( - "-s", - "--ssl", - action="http://github.com/store_true", - required=False, - default=False, - help="Enable SSL encrypted channel to the server", - ) - parser.add_argument( - "-t", - "--stream-timeout", - type=float, - required=False, - default=None, - help="Stream timeout in seconds. Default is None.", - ) - parser.add_argument( - "-r", - "--root-certificates", - type=str, - required=False, - default=None, - help="File holding PEM-encoded root certificates. Default is None.", - ) - parser.add_argument( - "-p", - "--private-key", - type=str, - required=False, - default=None, - help="File holding PEM-encoded private key. Default is None.", - ) - parser.add_argument( - "-x", - "--certificate-chain", - type=str, - required=False, - default=None, - help="File holding PEM-encoded certificate chain. Default is None.", - ) - parser.add_argument( - "-C", - "--grpc-compression-algorithm", - type=str, - required=False, - default=None, - help= - "The compression algorithm to be used when sending request to server. Default is None.", - ) - parser.add_argument( - "-S", - "--streaming", - action="http://github.com/store_true", - required=False, - default=False, - help="Enable streaming mode. Default is False.", - ) - parser.add_argument( - "-c", - "--check-output", - action="http://github.com/store_true", - required=False, - default=False, - help="Enable check of output ids for CI", - ) - - parser.add_argument( - "-b", - "--beam-width", - required=False, - type=int, - default=1, - help="Beam width value", - ) - parser.add_argument( - "--temperature", - type=float, - required=False, - default=1.0, - help="temperature value", - ) - parser.add_argument( - "--repetition-penalty", - type=float, - required=False, - default=None, - help="The repetition penalty value", - ) - parser.add_argument( - "--presence-penalty", - type=float, - required=False, - default=None, - help="The presence penalty value", - ) - parser.add_argument( - "--frequency-penalty", - type=float, - required=False, - default=None, - help="The frequency penalty value", - ) - - parser.add_argument( - "--request-output-len", - type=int, - required=False, - default=16, - help="Request output length", - ) - parser.add_argument( - '--stop-after-ms', - type=int, - required=False, - default=0, - help='Early stop the generation after a few milliseconds') - parser.add_argument( - "--stop-via-request-cancel", - action="http://github.com/store_true", - required=False, - default=False, - help="Early stop use request cancellation instead of stop request") - parser.add_argument('--tokenizer-dir', - type=str, - required=False, - default='', - help='Specify tokenizer directory') - parser.add_argument('--tokenizer-type', - type=str, - default='auto', - required=False, - choices=['auto', 't5', 'llama'], - help='Specify tokenizer type') - parser.add_argument('--request-id', - type=str, - default='', - required=False, - help='The request_id for the stop request') - - parser.add_argument('--prompt-embedding-table-path', - type=str, - default='', - required=False, - help='The prompt embedding table to use for ptuning') - parser.add_argument("--lora-path", - type=str, - default='', - required=False, - help="LoRA weights") - parser.add_argument("--lora-task-id", - type=int, - default=None, - required=False, - help="LoRA task id") - 
parser.add_argument( - "--exclude-input-in-output", - action="http://github.com/store_true", - required=False, - default=False, - help="Expect that output IDs do not contain input IDs", - ) - - parser.add_argument( - '--prompt-task-id', - type=int, - default=0, - required=False, - help='The prompt task id in the prompt embedding table') - - parser.add_argument('--dtype', - type=str, - default='float16', - choices=['float16', 'float32', 'bfloat16']) - - parser.add_argument( - "--return-log-probs", - action="http://github.com/store_true", - required=False, - default=False, - help="Enable computation of log probs", - ) - - parser.add_argument( - "--return-context-logits", - action="http://github.com/store_true", - required=False, - default=False, - help= - "Return context logits, the engine must be built with gather_context_logits or gather_all_token_logits", - ) - - parser.add_argument( - "--return-generation-logits", - action="http://github.com/store_true", - required=False, - default=False, - help= - "Return generation logits, the engine must be built with gather_ generation_logits or gather_all_token_logits", - ) - - parser.add_argument( - "--top-k", - type=int, - required=False, - default=1, - help="top k value", - ) - - parser.add_argument( - "--top-p", - type=float, - required=False, - default=0., - help="top p value", - ) - - parser.add_argument('--requested-outputs', - nargs='+', - default=[], - help='The requested output tensors') - - parser.add_argument('--model-name', - type=str, - required=False, - default='tensorrt_llm', - help='Specify model name') - - FLAGS = parser.parse_args() - - tokenizer = None - draft_ids = None - decoder_input_ids = None - if FLAGS.input_tokens_csv != "": - with open(FLAGS.input_tokens_csv) as csv_file: - csv_reader = csv.reader(csv_file, delimiter=",") - for row in csv_reader: - input_ids = [[int(val) for val in row]] - break - - curate_log_output(input_ids[0], "Input") - - if FLAGS.draft_tokens_csv != "": - with open(FLAGS.draft_tokens_csv) as csv_file: - csv_reader = csv.reader(csv_file, delimiter=",") - for row in csv_reader: - draft_ids = [[int(val) for val in row]] - break - - end_id = FLAGS.end_id - pad_id = FLAGS.pad_id - - else: - print('=========') - if (os.path.isdir(FLAGS.tokenizer_dir) - and not os.path.exists(FLAGS.tokenizer_dir)): - raise FileNotFoundError( - "Input tokens are not provided and tokenizer directory does" - f" not exist: {FLAGS.tokenizer_dir}", ) - - tokenizer = AutoTokenizer.from_pretrained(FLAGS.tokenizer_dir, - legacy=False, - padding_side='left', - trust_remote_code=True) - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - pad_id = tokenizer.encode(tokenizer.pad_token, - add_special_tokens=False)[0] - end_id = tokenizer.encode(tokenizer.eos_token, - add_special_tokens=False)[0] - print("Using pad_id: ", pad_id) - print("Using end_id: ", end_id) - - input_ids = [tokenizer.encode(FLAGS.text)] - curate_log_output(input_ids[0], "Input") - - end_id_data = np.array([[end_id]], dtype=np.int32) - pad_id_data = np.array([[pad_id]], dtype=np.int32) - - #Get the prompt embedding table for the task id - prompt_embedding_table_data = None - prompt_vocab_size_data = None - if (FLAGS.prompt_embedding_table_path != ""): - prompt_table = np.load(FLAGS.prompt_embedding_table_path) - prompt_table = prompt_table.astype(str_dtype_to_np(FLAGS.dtype)) - task_vocab_size = prompt_table.shape[1] - - # squeeze the first 2 dimensions - prompt_embedding_table_data = 
prompt_table[FLAGS.prompt_task_id] - prompt_embedding_table_data = np.expand_dims( - prompt_table[FLAGS.prompt_task_id], axis=0) - - prompt_vocab_size = [[task_vocab_size]] - prompt_vocab_size_data = np.array(prompt_vocab_size, dtype=np.int32) - - lora_weights_data = None - lora_config_data = None - if (FLAGS.lora_path != ""): - lora_weights_data = np.load( - os.path.join(FLAGS.lora_path, "model.lora_weights.npy")) - try: - lora_config_data = np.load( - os.path.join(FLAGS.lora_path, "model.lora_config.npy")) - except Exception: - lora_config_data = np.load( - os.path.join(FLAGS.lora_path, "model.lora_keys.npy")) - lora_task_id_data = None - if FLAGS.lora_task_id is not None and FLAGS.lora_task_id != 0: - lora_task_id_data = np.array([[FLAGS.lora_task_id]], dtype=np.uint64) - - input_ids_data = np.array(input_ids, dtype=np.int32) - input_lengths = [[len(ii)] for ii in input_ids] - input_lengths_data = np.array(input_lengths, dtype=np.int32) - request_output_len = [[FLAGS.request_output_len]] - request_output_len_data = np.array(request_output_len, dtype=np.int32) - beam_width = [[FLAGS.beam_width]] - beam_width_data = np.array(beam_width, dtype=np.int32) - top_k = [[FLAGS.top_k]] - top_k_data = np.array(top_k, dtype=np.int32) - top_p = [[FLAGS.top_p]] - top_p_data = np.array(top_p, dtype=np.float32) - temperature = [[FLAGS.temperature]] - temperature_data = np.array(temperature, dtype=np.float32) - return_log_probs = [[FLAGS.return_log_probs]] - return_log_probs_data = np.array(return_log_probs, dtype=bool) - - return_context_logits_data = None - if FLAGS.return_context_logits: - return_context_logits_data = np.array([[FLAGS.return_context_logits]], - dtype=bool) - - return_generation_logits_data = None - if FLAGS.return_generation_logits: - return_generation_logits_data = np.array( - [[FLAGS.return_generation_logits]], dtype=bool) - - repetition_penalty_data = None - if FLAGS.repetition_penalty is not None: - repetition_penalty = [[FLAGS.repetition_penalty]] - repetition_penalty_data = np.array(repetition_penalty, - dtype=np.float32) - presence_penalty_data = None - if FLAGS.presence_penalty is not None: - presence_penalty = [[FLAGS.presence_penalty]] - presence_penalty_data = np.array(presence_penalty, dtype=np.float32) - frequency_penalty_data = None - if FLAGS.frequency_penalty is not None: - frequency_penalty = [[FLAGS.frequency_penalty]] - frequency_penalty_data = np.array(frequency_penalty, dtype=np.float32) - streaming = [[FLAGS.streaming]] - streaming_data = np.array(streaming, dtype=bool) - - draft_ids_data = None - if draft_ids is not None: - draft_ids_data = np.array(draft_ids, dtype=np.int32) - - decoder_input_ids_data = None - if decoder_input_ids is not None: - decoder_input_ids_data = np.array(decoder_input_ids, dtype=np.int32) - - inputs = prepare_inputs( - input_ids_data, input_lengths_data, request_output_len_data, - beam_width_data, temperature_data, repetition_penalty_data, - presence_penalty_data, frequency_penalty_data, streaming_data, - end_id_data, pad_id_data, prompt_embedding_table_data, - prompt_vocab_size_data, lora_task_id_data, lora_weights_data, - lora_config_data, return_log_probs_data, top_k_data, top_p_data, - draft_ids_data, return_context_logits_data, - return_generation_logits_data, decoder_input_ids_data) - - if FLAGS.requested_outputs: - # Must have at least output_ids in requested outputs - if "output_ids" not in FLAGS.requested_outputs: - raise Exception( - "requested outputs must at least have \"output_ids\"") - outputs = 
prepare_outputs(FLAGS.requested_outputs) - else: - outputs = None - - stop_inputs = None - if FLAGS.stop_after_ms > 0 and not FLAGS.stop_via_request_cancel: - stop_inputs = prepare_stop_signals() - - request_id = FLAGS.request_id - - if FLAGS.output_tokens_csv != "": - with open(FLAGS.output_tokens_csv) as csv_file: - csv_reader = csv.reader(csv_file, delimiter=",") - for row in csv_reader: - expected_output_ids = [int(val) for val in row] - break - else: - expected_output_ids = ([] if FLAGS.exclude_input_in_output else - input_ids[0]) + [ - 21221, 290, 257, 4255, 379, 262, 1957, 7072, - 11, 4689, 347, 2852, 2564, 494, 13, 679 - ] - - if FLAGS.streaming: - actual_output_ids = [ - [] if FLAGS.exclude_input_in_output else input_ids[0] - ] - else: - actual_output_ids = [] - - sequence_lengths = [] - cum_log_probs = None - output_log_probs = None - context_logits = None - generation_logits = None - - user_data = UserData() - with grpcclient.InferenceServerClient( - url=FLAGS.url, - verbose=FLAGS.verbose, - ssl=FLAGS.ssl, - root_certificates=FLAGS.root_certificates, - private_key=FLAGS.private_key, - certificate_chain=FLAGS.certificate_chain, - ) as triton_client: - try: - - if FLAGS.streaming: - - # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), - stream_timeout=FLAGS.stream_timeout, - ) - # Send request - triton_client.async_stream_infer( - FLAGS.model_name, - inputs, - outputs=outputs, - request_id=request_id, - ) - - if FLAGS.stop_after_ms > 0: - time.sleep(FLAGS.stop_after_ms / 1000.0) - - if not FLAGS.stop_via_request_cancel: - triton_client.async_stream_infer( - FLAGS.model_name, - stop_inputs, - request_id=request_id, - parameters={'Streaming': FLAGS.streaming}) - - # Close the grpc stream - cancel_requests = FLAGS.stop_after_ms > 0 and FLAGS.stop_via_request_cancel - triton_client.stop_stream(cancel_requests=cancel_requests) - - # Parse the responses - while True: - try: - result = user_data._completed_requests.get(block=False) - except Exception: - break - - if type(result) == InferenceServerException: - if result.status() == "StatusCode.CANCELLED": - print("Request is cancelled") - else: - print("Received an error from server:") - print(result) - raise result - else: - check_output_names(FLAGS.requested_outputs, result) - output_ids = result.as_numpy('output_ids') - sequence_lengths = result.as_numpy('sequence_length') - if output_ids is not None: - # Only one beam is supported - if sequence_lengths == None or sequence_lengths[0][ - 0] > 0: - tokens = list(output_ids[0][0]) - actual_output_ids[ - 0] = actual_output_ids[0] + tokens - else: - print("Got cancellation response from server") - else: - # Send request - infer_future = triton_client.async_infer( - FLAGS.model_name, - inputs, - outputs=outputs, - request_id=request_id, - callback=partial(callback, user_data), - parameters={'Streaming': FLAGS.streaming}) - - expected_responses = 1 - - if FLAGS.stop_after_ms > 0: - - time.sleep(FLAGS.stop_after_ms / 1000.0) - - if FLAGS.stop_via_request_cancel: - infer_future.cancel() - else: - triton_client.async_infer( - FLAGS.model_name, - stop_inputs, - request_id=request_id, - callback=partial(callback, user_data), - parameters={'Streaming': FLAGS.streaming}) - expected_responses += 1 - - processed_count = 0 - while processed_count < expected_responses: - try: - result = user_data._completed_requests.get() - print("Got completed request", flush=True) - except Exception: - break - - if type(result) == InferenceServerException: - if result.status() == 
"StatusCode.CANCELLED": - print("Request is cancelled") - else: - print("Received an error from server:") - print(result) - raise result - else: - check_output_names(FLAGS.requested_outputs, result) - output_ids = result.as_numpy('output_ids') - if FLAGS.return_log_probs: - cum_log_probs = result.as_numpy('cum_log_probs') - output_log_probs = result.as_numpy( - 'output_log_probs') - if FLAGS.return_context_logits: - context_logits = result.as_numpy('context_logits') - if FLAGS.return_generation_logits: - generation_logits = result.as_numpy( - 'generation_logits') - if output_ids is not None: - sequence_lengths = result.as_numpy( - 'sequence_length') - for beam_output_ids in output_ids[0]: - tokens = list(beam_output_ids) - actual_output_ids.append(tokens) - else: - print("Got cancellation response from server") - - processed_count = processed_count + 1 - except Exception as e: - err = "Encountered error: " + str(e) - print(err) - sys.exit(err) - - passed = True - - for beam in range(FLAGS.beam_width): - seq_len = sequence_lengths[0][beam] if ( - not FLAGS.streaming and len(sequence_lengths) > 0) else len( - actual_output_ids[beam]) - # These should be equal when input IDs are excluded from output - output_ids_w_prompt = actual_output_ids[beam][:seq_len] - output_ids_wo_prompt = ( - output_ids_w_prompt if FLAGS.exclude_input_in_output else - output_ids_w_prompt[input_ids_data.shape[1]:]) - if tokenizer != None: - output_text = tokenizer.decode(output_ids_wo_prompt) - print(f'Input: {FLAGS.text}') - print(f'Output beam {beam}: {output_text}') - - # If cancelled, the number of output tokens should be less than request output length. - if FLAGS.stop_after_ms > 0 and len( - output_ids_wo_prompt) >= FLAGS.request_output_len: - raise AssertionError("expect less than " + - str(FLAGS.request_output_len) + - " output tokens, got " + - str(len(output_ids_wo_prompt))) - - curate_log_output(output_ids_w_prompt, "Output") - - if (FLAGS.check_output and beam == 0): - passed = (output_ids_w_prompt == expected_output_ids) - print("expected_output_ids = ", expected_output_ids) - print("\n=====") - print("PASS!" if passed else "FAIL!") - print("=====") - - if FLAGS.return_log_probs: - print(cum_log_probs) - print(output_log_probs) - - if FLAGS.return_context_logits: - print(f"context_logits.shape: {context_logits.shape}") - print(f"context_logits: {context_logits}") - - if FLAGS.return_generation_logits: - print(f"generation_logits.shape: {generation_logits.shape}") - print(f"generation_logits: {generation_logits}") - - sys.exit(not passed) diff --git a/inflight_batcher_llm/cmake/TritonTensorRTLLMBackendConfig.cmake.in b/inflight_batcher_llm/cmake/TritonTensorRTLLMBackendConfig.cmake.in deleted file mode 100644 index 84239071..00000000 --- a/inflight_batcher_llm/cmake/TritonTensorRTLLMBackendConfig.cmake.in +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. 
-# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -include(CMakeFindDependencyMacro) - -get_filename_component( - TRITONTRTLLMBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH -) - -list(APPEND CMAKE_MODULE_PATH ${TRITONTRTLLMBACKEND_CMAKE_DIR }) - -if(NOT TARGET TritonTRTLLMBackend::triton-trtllm-backend) - include("${TRITONTRTLLMBACKEND_CMAKE_DIR }/TritonTRTLLMBackendTargets.cmake") -endif() - -set(TRITONTRTLLMBACKEND_LIBRARIES TritonTRTLLMBackend::triton-trtllm-backend) diff --git a/inflight_batcher_llm/cmake/modules/set_ifndef.cmake b/inflight_batcher_llm/cmake/modules/set_ifndef.cmake deleted file mode 100644 index bd8f0a3e..00000000 --- a/inflight_batcher_llm/cmake/modules/set_ifndef.cmake +++ /dev/null @@ -1,24 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & -# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not -# use this file except in compliance with the License. You may obtain a copy of -# the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations under -# the License. -# - -function(set_ifndef variable value) - if(NOT DEFINED ${variable}) - set(${variable} - ${value} - PARENT_SCOPE) - endif() -endfunction() diff --git a/inflight_batcher_llm/scripts/build.sh b/inflight_batcher_llm/scripts/build.sh deleted file mode 100644 index 22ac8bb6..00000000 --- a/inflight_batcher_llm/scripts/build.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -Help() -{ - # Display Help - echo "Syntax: build.sh [h|-t |u]" - echo "options:" - echo "h Print this Help." - echo "t Location of tensorrt library" - echo "u Option to build unit tests" - echo -} - -TRT_ROOT='/usr/local/tensorrt' -BUILD_UNIT_TESTS='false' - -# Get the options -while getopts ":ht:u" option; do - case $option in - h) # display Help - Help - exit;; - t) # Location of tensorrt - TRT_ROOT=$OPTARG;; - u) # Option to build unit tests - BUILD_UNIT_TESTS='true';; - \?) 
# Invalid option - echo "Error: Invalid option" - echo "" - Help - exit;; - esac -done - -echo "Using TRT_ROOT=${TRT_ROOT}" -echo "Using BUILD_UNIT_TESTS=${BUILD_UNIT_TESTS}" - -set -x -apt-get update -apt-get install -y --no-install-recommends rapidjson-dev - -BUILD_DIR=$(dirname $0)/../build -mkdir $BUILD_DIR -BUILD_DIR=$(cd -- "$BUILD_DIR" && pwd) -cd $BUILD_DIR - -export LD_LIBRARY_PATH="/usr/local/cuda/compat/lib.real:${LD_LIBRARY_PATH}" - -BUILD_TESTS_ARG="" -if [[ "$BUILD_UNIT_TESTS" == "true" ]]; then - BUILD_TESTS_ARG="-DBUILD_TESTS=ON" -fi - -cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ${BUILD_TESTS_ARG} .. -make install diff --git a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc b/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc deleted file mode 100644 index b1791b99..00000000 --- a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc +++ /dev/null @@ -1,222 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
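
The custom metrics reporter removed below publishes TRT-LLM runtime statistics as Triton gauge families such as nv_trt_llm_request_metrics and nv_trt_llm_kv_cache_block_metrics. Those gauges can be read back from Triton's Prometheus endpoint; here is a small sketch, assuming the server exposes metrics on the default port 8002.

```python
import urllib.request

# Scrape the Prometheus text endpoint and keep only the backend's custom gauges.
with urllib.request.urlopen("http://localhost:8002/metrics") as response:
    body = response.read().decode("utf-8")

for line in body.splitlines():
    if line.startswith("nv_trt_llm_"):
        print(line)
```
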
-#include "custom_metrics_reporter.h" -#include "triton/backend/backend_common.h" -#include -#include -#include - -using namespace ::triton::common; // TritonJson - -namespace triton::backend::inflight_batcher_llm::custom_metrics_reporter -{ - -const std::vector CustomMetricsReporter::request_keys_{ - "Active Request Count", "Max Request Count", "Scheduled Requests", "Context Requests"}; -const std::vector CustomMetricsReporter::request_labels_{"active", "max", "scheduled", "context"}; - -const std::vector CustomMetricsReporter::runtime_memory_keys_{ - "Runtime CPU Memory Usage", "Runtime GPU Memory Usage", "Runtime Pinned Memory Usage"}; -const std::vector CustomMetricsReporter::runtime_memory_labels_{"cpu", "gpu", "pinned"}; - -const std::vector CustomMetricsReporter::kv_cache_keys_{ - "Max KV cache blocks", "Free KV cache blocks", "Used KV cache blocks", "Tokens per KV cache block"}; -const std::vector CustomMetricsReporter::kv_cache_labels_{"max", "free", "used", "tokens_per"}; - -const std::vector CustomMetricsReporter::v1_specific_keys_{ - "Total Context Tokens", "Total Generation Tokens", "Empty Generation Slots"}; -const std::vector CustomMetricsReporter::v1_specific_labels_{ - "total_context_tokens", "total_generation_tokens", "empty_generation_slots"}; - -const std::vector CustomMetricsReporter::IFB_specific_keys_{ - "Total Context Tokens", "Generation Requests", "MicroBatch ID", "Paused Requests"}; -const std::vector CustomMetricsReporter::IFB_specific_labels_{ - "total_context_tokens", "generation_requests", "micro_batch_id", "paused_requests"}; - -const std::vector CustomMetricsReporter::general_metric_keys_{"Timestamp", "Iteration Counter"}; -const std::vector CustomMetricsReporter::general_metric_labels_{"timestamp", "iteration_counter"}; - -uint64_t convertTimestampToSeconds(std::string const& ts) -{ - std::tm tm = {}; - std::stringstream ss(ts); - ss >> std::get_time(&tm, "%m-%d-%Y %H:%M:%S"); - auto timestamp = std::chrono::system_clock::from_time_t(std::mktime(&tm)); - auto epoch = std::chrono::time_point_cast(timestamp).time_since_epoch(); - uint64_t time_in_seconds = std::chrono::duration_cast(epoch).count(); - return time_in_seconds; -} - -TritonMetricGroup::TritonMetricGroup(std::string const& metric_family_label, - std::string const& metric_family_description, std::string const& category_label, - std::vector const& json_keys, std::vector const& sub_labels) - : metric_family_label_(metric_family_label) - , metric_family_description_(metric_family_description) - , category_label_(category_label) - , json_keys_(json_keys) - , sub_labels_(sub_labels) -{ -} - -TRITONSERVER_Error* TritonMetricGroup::CreateGroup(std::string const& model_name, const uint64_t version) -{ - TRITONSERVER_MetricFamily* metric_family = nullptr; - RETURN_IF_ERROR(TRITONSERVER_MetricFamilyNew(&metric_family, TRITONSERVER_METRIC_KIND_GAUGE, - metric_family_label_.c_str(), metric_family_description_.c_str())); - metric_family_.reset(metric_family); - - std::vector labels; - std::unique_ptr model_label( - TRITONSERVER_ParameterNew("model", TRITONSERVER_PARAMETER_STRING, model_name.c_str())); - std::unique_ptr model_version( - TRITONSERVER_ParameterNew("version", TRITONSERVER_PARAMETER_STRING, std::to_string(version).c_str())); - labels.emplace_back(model_label.get()); - labels.emplace_back(model_version.get()); - - for (size_t i = 0; i < sub_labels_.size(); ++i) - { - TRITONSERVER_Metric* metric; - std::unique_ptr sub_label( - TRITONSERVER_ParameterNew(category_label_.c_str(), 
TRITONSERVER_PARAMETER_STRING, sub_labels_[i].c_str())); - labels.emplace_back(sub_label.get()); - RETURN_IF_ERROR(TRITONSERVER_MetricNew(&metric, metric_family_.get(), labels.data(), labels.size())); - std::unique_ptr unique_metric(metric); - metrics_.push_back(std::move(unique_metric)); - labels.pop_back(); - } - - return nullptr; // success -} - -TRITONSERVER_Error* TritonMetricGroup::UpdateGroup(std::vector& values) -{ - for (size_t i = 0; i < values.size(); ++i) - { - RETURN_IF_ERROR(TRITONSERVER_MetricSet(metrics_[i].get(), values[i])); - } - return nullptr; // success -} - -std::vector const& TritonMetricGroup::JsonKeys() const -{ - return json_keys_; -} - -TRITONSERVER_Error* CustomMetricsReporter::InitializeReporter( - std::string const& model_name, const uint64_t version, bool const is_v1_model) -{ - /* REQUEST METRIC GROUP */ - request_metric_family_ = std::make_unique( - "nv_trt_llm_request_metrics", "TRT LLM request metrics", "request_type", request_keys_, request_labels_); - - RETURN_IF_ERROR(request_metric_family_->CreateGroup(model_name, version)); - metric_groups_.push_back(std::move(request_metric_family_)); - - /* RUNTIME MEMORY METRIC GROUP */ - runtime_memory_metric_family_ = std::make_unique("nv_trt_llm_runtime_memory_metrics", - "TRT LLM runtime memory metrics", "memory_type", runtime_memory_keys_, runtime_memory_labels_); - - RETURN_IF_ERROR(runtime_memory_metric_family_->CreateGroup(model_name, version)); - metric_groups_.push_back(std::move(runtime_memory_metric_family_)); - - /* KV CACHE METRIC GROUP */ - kv_cache_metric_family_ = std::make_unique("nv_trt_llm_kv_cache_block_metrics", - "TRT LLM KV cache block metrics", "kv_cache_block_type", kv_cache_keys_, kv_cache_labels_); - - RETURN_IF_ERROR(kv_cache_metric_family_->CreateGroup(model_name, version)); - metric_groups_.push_back(std::move(kv_cache_metric_family_)); - - /* MODEL-TYPE METRIC GROUP (V1 / IFB) */ - std::string model = (is_v1_model) ? 
"v1" : "inflight_batcher"; - std::string model_metric_family_label = "nv_trt_llm_" + model + "_metrics"; - std::string model_metric_family_description = "TRT LLM " + model + "-specific metrics"; - std::string model_metric_family_category = model + "_specific_metric"; - - if (is_v1_model) - { - model_type_metric_family_ = std::make_unique(model_metric_family_label, - model_metric_family_description, model_metric_family_category, v1_specific_keys_, v1_specific_labels_); - } - else - { - model_type_metric_family_ = std::make_unique(model_metric_family_label, - model_metric_family_description, model_metric_family_category, IFB_specific_keys_, IFB_specific_labels_); - } - - RETURN_IF_ERROR(model_type_metric_family_->CreateGroup(model_name, version)); - metric_groups_.push_back(std::move(model_type_metric_family_)); - - /* GENERAL METRIC GROUP */ - general_metric_family_ = std::make_unique("nv_trt_llm_general_metrics", - "General TRT LLM metrics", "general_type", general_metric_keys_, general_metric_labels_); - - RETURN_IF_ERROR(general_metric_family_->CreateGroup(model_name, version)); - metric_groups_.push_back(std::move(general_metric_family_)); - - return nullptr; // success -} - -TRITONSERVER_Error* CustomMetricsReporter::UpdateCustomMetrics(std::string const& custom_metrics) -{ - triton::common::TritonJson::Value metrics; - std::vector members; - metrics.Parse(custom_metrics); - metrics.Members(&members); - - for (auto const& metric_group : metric_groups_) - { - std::vector metric_group_keys = metric_group->JsonKeys(); - std::vector metric_group_values; - for (auto const& key : metric_group_keys) - { - triton::common::TritonJson::Value value_json; - uint64_t value; - if (!metrics.Find(key.c_str(), &value_json)) - { - std::string errStr = std::string("Failed to find " + key + " in metrics."); - return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, errStr.c_str()); - } - if (key == "Timestamp") - { - std::string timestamp; - value_json.AsString(×tamp); - value = convertTimestampToSeconds(timestamp); - } - else - { - value_json.AsUInt(&value); - } - - metric_group_values.push_back(value); - } - - RETURN_IF_ERROR(metric_group->UpdateGroup(metric_group_values)); - } - - return nullptr; -} - -} // namespace triton::backend::inflight_batcher_llm::custom_metrics_reporter diff --git a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.h b/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.h deleted file mode 100644 index d0960178..00000000 --- a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.h +++ /dev/null @@ -1,179 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#pragma once - -#include "triton/core/tritonbackend.h" -#include "triton/core/tritonserver.h" -#include -#include -#include -#include -#include - -namespace triton::backend::inflight_batcher_llm::custom_metrics_reporter -{ - -/// TritonMetricGroups are handled by the CustomMetricsReporter class -/// and encapsulate the creation/update functionality for a -/// group of TRT LLM statistics to be reported as custom Triton metrics. -/// The statistics (or custom metrics) handled by this class should -/// not be confused with Triton base metrics. -class TritonMetricGroup -{ -public: - TritonMetricGroup(std::string const& metric_family_label, std::string const& metric_family_description, - std::string const& category_label, std::vector const& json_keys, - std::vector const& labels); - ~TritonMetricGroup(){}; - - /// Create a new Triton metric family with corresponding metric - /// pointers and parameters. - /// - /// \param model_name The name of the model to provide a metrics - /// group for. - /// \param version The version of the model to provide a metrics - /// group for. - /// \return a TRITONSERVER_Error indicating success or failure. - TRITONSERVER_Error* CreateGroup(std::string const& model_name, const uint64_t version); - - /// Update the Triton metrics associated with this group using - /// the parsed TRT LLM backend statistics values. - /// - /// \param values Values parsed from the TRT LLM backend - /// statistics output, filtered by this group's JSON keys. - /// \return a TRITONSERVER_Error indicating success or failure. - TRITONSERVER_Error* UpdateGroup(std::vector& values); - - /// Return a list of JSON keys that correspond to the TRT LLM - /// statistics handled by this metric group. - /// - /// \return A const reference to vector of strings corresponding - /// to the JSON keys associated with this group. 
- std::vector const& JsonKeys() const; - - /// Custom deleter for a unique TRITONSERVER_MetricFamily pointer - struct MetricFamilyDeleter - { - void operator()(TRITONSERVER_MetricFamily* family) - { - if (family != nullptr) - { - TRITONSERVER_MetricFamilyDelete(family); - } - } - }; - - /// Custom deleter for a unique TRITONSERVER_Metric pointer - struct MetricDeleter - { - void operator()(TRITONSERVER_Metric* metric) - { - if (metric != nullptr) - { - TRITONSERVER_MetricDelete(metric); - } - } - }; - - /// Custom deleter for a unique TRITONSERVER_Parameter pointer - struct ParameterDeleter - { - void operator()(TRITONSERVER_Parameter* parameter) - { - if (parameter != nullptr) - { - TRITONSERVER_ParameterDelete(parameter); - } - } - }; - -private: - std::unique_ptr metric_family_; - std::vector> metrics_; - std::string metric_family_label_; - std::string metric_family_description_; - std::string category_label_; - std::vector json_keys_; - std::vector sub_labels_; -}; - -/// CustomMetricsReporter is an interface class meant to facilitate the -/// connection between TRT LLM backend statistics and Triton custom metrics. -/// It functions by passing BatchManager statistics data from -/// the TRT LLM backend to the multiple TritonMetricsGroup objects -/// it handles. -class CustomMetricsReporter -{ -public: - CustomMetricsReporter(){}; - ~CustomMetricsReporter(){}; - - /// Initialize the various TritonMetricGroups handled by - /// by this class using the static key/label members below. - /// - /// \param model The name of the model to provide metrics for. - /// \param version The version of the model to provide metrics for. - /// \param is_v1_model Whether the model type is v1 or an inflight - /// batching model. - /// \return a TRITONSERVER_Error indicating success or failure. - TRITONSERVER_Error* InitializeReporter(std::string const& model, const uint64_t version, bool const is_v1_model); - - /// Updates the vector of TritonMetricGroup objects with a - /// JSON-formatted statistics string. - /// - /// \param statistics A JSON-formatted string of TRT LLM backend - /// statistics. - /// \return a TRITONSERVER_Error indicating success or failure. - TRITONSERVER_Error* UpdateCustomMetrics(std::string const& custom_metrics); - - static const std::vector request_keys_; - static const std::vector request_labels_; - - static const std::vector runtime_memory_keys_; - static const std::vector runtime_memory_labels_; - - static const std::vector kv_cache_keys_; - static const std::vector kv_cache_labels_; - - static const std::vector v1_specific_keys_; - static const std::vector v1_specific_labels_; - - static const std::vector IFB_specific_keys_; - static const std::vector IFB_specific_labels_; - - static const std::vector general_metric_keys_; - static const std::vector general_metric_labels_; - -private: - std::vector> metric_groups_; - std::unique_ptr request_metric_family_; - std::unique_ptr runtime_memory_metric_family_; - std::unique_ptr kv_cache_metric_family_; - std::unique_ptr model_type_metric_family_; - std::unique_ptr general_metric_family_; -}; - -} // namespace triton::backend::inflight_batcher_llm::custom_metrics_reporter diff --git a/inflight_batcher_llm/src/libtensorrtllm.cc b/inflight_batcher_llm/src/libtensorrtllm.cc deleted file mode 100644 index c6266dff..00000000 --- a/inflight_batcher_llm/src/libtensorrtllm.cc +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include -#include -#include -#include -#include -#include - -// Triton headers -#include "triton/backend/backend_common.h" -#include "triton/core/tritonbackend.h" -#include "triton/core/tritonserver.h" - -// trtllm backend headers -#include "model_instance_state.h" -#include "model_state.h" - -namespace triton::backend::inflight_batcher_llm -{ - -extern "C" -{ - // Triton calls TRITONBACKEND_ModelInitialize when a model is loaded - // to allow the backend to create any state associated with the model, - // and to also examine the model configuration to determine if the - // configuration is suitable for the backend. Any errors reported by - // this function will prevent the model from loading. - // - TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) - { - // Create a ModelState object and associate it with the - // TRITONBACKEND_Model. If anything goes wrong with initialization - // of the model state then an error is returned and Triton will fail - // to load the model. - char const* cname; - RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname)); - const std::string name(cname); - - uint64_t version; - RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version)); - - ModelState* model_state; - RETURN_IF_ERROR(ModelState::Create(model, name, version, &model_state)); - RETURN_IF_ERROR(TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); - - return nullptr; // success - } - - // Triton calls TRITONBACKEND_ModelFinalize when a model is no longer - // needed. The backend should cleanup any state associated with the - // model. This function will not be called until all model instances - // of the model have been finalized. 
- // - TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) - { - void* vstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); - ModelState* model_state = reinterpret_cast(vstate); - delete model_state; - - return nullptr; // success - } - - // Triton calls TRITONBACKEND_ModelInstanceInitialize when a model - // instance is created to allow the backend to initialize any state - // associated with the instance. - // - TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) - { - // Get the model state associated with this instance's model. - TRITONBACKEND_Model* model; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); - - void* vmodelstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); - ModelState* model_state = reinterpret_cast(vmodelstate); - - // Create a ModelInstanceState object and associate it with the - // TRITONBACKEND_ModelInstance. - ModelInstanceState* instance_state; - RETURN_IF_ERROR(ModelInstanceState::Create(model_state, instance, &instance_state)); - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(instance, reinterpret_cast(instance_state))); - - return nullptr; // success - } - - // Triton calls TRITONBACKEND_ModelInstanceFinalize when a model - // instance is no longer needed. The backend should cleanup any state - // associated with the model instance. - // - TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) - { - TRITONBACKEND_Model* model; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); - - void* vstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); - ModelInstanceState* instance_state = reinterpret_cast(vstate); - delete instance_state; - - return nullptr; // success - } - - // When Triton calls TRITONBACKEND_ModelInstanceExecute it is required - // that a backend create a response for each request in the batch. A - // response may be the output tensors required for that request or may - // be an error that is returned in the response. - // - TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute( - TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, const uint32_t request_count) - { - ModelInstanceState* instance_state; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, reinterpret_cast(&instance_state))); - - instance_state->enqueue(requests, request_count); - - return nullptr; // success - } - -} // extern "C" - -} // namespace triton::backend::inflight_batcher_llm diff --git a/inflight_batcher_llm/src/model_instance_state.cc b/inflight_batcher_llm/src/model_instance_state.cc deleted file mode 100644 index e098ba66..00000000 --- a/inflight_batcher_llm/src/model_instance_state.cc +++ /dev/null @@ -1,998 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. 
-// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "model_instance_state.h" -#include "utils.h" - -#include - -using executor::SizeType32; - -namespace triton::backend::inflight_batcher_llm -{ - -TRITONSERVER_Error* ModelInstanceState::Create( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, ModelInstanceState** state) -{ - try - { - *state = new ModelInstanceState(model_state, triton_model_instance); - } - catch (std::exception const& ex) - { - std::string errStr = std::string("unexpected error when creating modelInstanceState: ") + ex.what(); - return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, errStr.c_str()); - } - - return nullptr; // success -} - -executor::BatchingType ModelInstanceState::getBatchingTypeFromParams() -{ - executor::BatchingType batchingType; - auto gpt_model_type = model_state_->GetParameter("gpt_model_type"); - - if (gpt_model_type == "V1" || gpt_model_type == "v1") - { - batchingType = executor::BatchingType::kSTATIC; - } - else if (gpt_model_type == "inflight_batching" || gpt_model_type == "inflight_fused_batching") - { - batchingType = executor::BatchingType::kINFLIGHT; - } - else - { - throw std::runtime_error( - "Invalid gpt_model_type. 
Must be " - "v1/inflight_batching/inflight_fused_batching."); - } - return batchingType; -} - -executor::KvCacheConfig ModelInstanceState::getKvCacheConfigFromParams() -{ - std::optional maxTokensInPagedKvCache = std::nullopt; - try - { - maxTokensInPagedKvCache = model_state_->GetParameter("max_tokens_in_paged_kv_cache"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING( - "max_tokens_in_paged_kv_cache is not specified, will " - "use default value"); - } - - std::optional kvCacheFreeGpuMemFraction = std::nullopt; - try - { - kvCacheFreeGpuMemFraction = model_state_->GetParameter("kv_cache_free_gpu_mem_fraction"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING( - "kv_cache_free_gpu_mem_fraction is not specified, will use default value of 0.9 or " - "max_tokens_in_paged_kv_cache"); - } - - std::optional kvCacheHostCacheSize = std::nullopt; - try - { - kvCacheHostCacheSize = model_state_->GetParameter("kv_cache_host_memory_bytes"); - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING("kv_cache_host_memory_bytes not set, defaulting to 0"); - } - - bool kvCacheOnboardBlocks = true; - try - { - kvCacheOnboardBlocks = model_state_->GetParameter("kv_cache_onboard_blocks"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("kv_cache_onboard_blocks not set, defaulting to true"); - } - - std::optional maxAttentionWindow = std::nullopt; - try - { - maxAttentionWindow = model_state_->GetParameter("max_attention_window_size"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING( - "max_attention_window_size is not specified, will " - "use default value (i.e. 
max_sequence_length)"); - } - - std::optional sinkTokenLength = std::nullopt; - try - { - sinkTokenLength = model_state_->GetParameter("sink_token_length"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING( - "sink_token_length is not specified, will " - "use default value"); - } - - bool enableKVCacheReuse = false; - try - { - enableKVCacheReuse = model_state_->GetParameter("enable_kv_cache_reuse"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("enable_kv_cache_reuse is not specified, will be set to false"); - } - - std::optional maxAttentionWindowSizeType = std::nullopt; - if (maxAttentionWindow.has_value()) - { - maxAttentionWindowSizeType = static_cast(maxAttentionWindow.value()); - } - - return executor::KvCacheConfig(enableKVCacheReuse, maxTokensInPagedKvCache, maxAttentionWindowSizeType, - sinkTokenLength, kvCacheFreeGpuMemFraction, kvCacheHostCacheSize, kvCacheOnboardBlocks); -} - -executor::ParallelConfig ModelInstanceState::getParallelConfigFromParams() -{ - executor::ParallelConfig parallelConfig; - auto const gpuDeviceIds = model_state_->GetDeviceIds(); - if (gpuDeviceIds.has_value()) - { - parallelConfig.setDeviceIds(gpuDeviceIds.value()); - } - - char const* str = std::getenv("TRTLLM_ORCHESTRATOR"); - if (str && std::atoi(str) != 0) - { - parallelConfig.setCommunicationMode(executor::CommunicationMode::kORCHESTRATOR); - auto workerExecutablePath = model_state_->GetExecutorWorkerPath(); - auto orchestratorConfig = executor::OrchestratorConfig(true, workerExecutablePath); - parallelConfig.setOrchestratorConfig(orchestratorConfig); - } - return parallelConfig; -} - -executor::PeftCacheConfig ModelInstanceState::getPeftCacheConfigFromParams() -{ - // parse LoRA / Peft cache parameters - // lora_cache_max_adapter_size - // lora_cache_optimal_adapter_size - // lora_cache_gpu_memory_fraction - // lora_cache_host_memory_bytes - - SizeType32 maxAdapterSize = 64; - SizeType32 optimalAdapterSize = 8; - std::optional hostCacheSize = std::nullopt; - std::optional deviceCachePercent = std::nullopt; - - std::string fieldName = "lora_cache_max_adapter_size"; - try - { - maxAdapterSize = model_state_->GetParameter(fieldName); - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING(fieldName + " not set, defaulting to 64"); - } - - fieldName = "lora_cache_optimal_adapter_size"; - try - { - optimalAdapterSize = model_state_->GetParameter(fieldName); - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING(fieldName + " not set, defaulting to 8"); - } - fieldName = "lora_cache_gpu_memory_fraction"; - try - { - deviceCachePercent = model_state_->GetParameter(fieldName); - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING(fieldName + " not set, defaulting to 0.05"); - } - fieldName = "lora_cache_host_memory_bytes"; - try - { - hostCacheSize = model_state_->GetParameter(fieldName); - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING(fieldName + " not set, defaulting to 1GB"); - } - - return executor::PeftCacheConfig(0, 0, optimalAdapterSize, maxAdapterSize, - ModelInstanceState::kPeftCacheNumPutWorkers, ModelInstanceState::kPeftCacheNumEnsureWorkers, - ModelInstanceState::kPeftCacheNumCopyStreams, 24, 8, deviceCachePercent, hostCacheSize); -} - -executor::SchedulerConfig ModelInstanceState::getSchedulerConfigFromParams(bool enableChunkedContext) -{ - using executor::CapacitySchedulerPolicy; - auto schedulerPolicy = 
CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT; - try - { - std::string schedulerPolicyStr = model_state_->GetParameter("batch_scheduler_policy"); - if (schedulerPolicyStr == "max_utilization") - { - schedulerPolicy = CapacitySchedulerPolicy::kMAX_UTILIZATION; - } - else if (schedulerPolicyStr == "guaranteed_no_evict") - { - schedulerPolicy = CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT; - } - else - { - throw std::runtime_error( - "batch_scheduler_policy parameter was not found or is invalid " - "(must be max_utilization or guaranteed_no_evict)"); - } - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING(e.what()); - } - - if (isDecoupled() && schedulerPolicy != CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT) - { - if (!enableChunkedContext) - { - TLLM_LOG_WARNING( - "Decoupled mode with a batch scheduler policy other than guaranteed_no_evict " - "requires building the model with use_paged_context_fmha and setting " - "enable_chunked_context to true. " - "The batch scheduler policy will be set to guaranteed_no_evict " - "since enable_chunked_context is false."); - schedulerPolicy = CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT; - } - } - return executor::SchedulerConfig(schedulerPolicy); -} - -executor::ExecutorConfig ModelInstanceState::getExecutorConfigFromParams() -{ - auto batchingType = getBatchingTypeFromParams(); - - int32_t maxBeamWidth = 1; - try - { - maxBeamWidth = model_state_->GetParameter("max_beam_width"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("max_beam_width is not specified, will use default value of 1"); - } - - int32_t iterStatsMaxIterations = executor::kDefaultIterStatsMaxIterations; - try - { - iterStatsMaxIterations = model_state_->GetParameter("iter_stats_max_iterations"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("iter_stats_max_iterations is not specified, will use default value of " - + std::to_string(iterStatsMaxIterations)); - } - - int32_t requestStatsMaxIterations = executor::kDefaultRequestStatsMaxIterations; - try - { - requestStatsMaxIterations = model_state_->GetParameter("request_stats_max_iterations"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("request_stats_max_iterations is not specified, will use default value of " - + std::to_string(requestStatsMaxIterations)); - } - - try - { - model_state_->GetParameter("enable_trt_overlap"); - TLLM_LOG_WARNING("enable_trt_overlap is deprecated and will be ignored"); - } - catch (std::exception const& e) - { - } - - bool normalizeLogProbs = true; - try - { - normalizeLogProbs = model_state_->GetParameter("normalize_log_probs"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("normalize_log_probs is not specified, will be set to true"); - } - - executor::ExecutorConfig executorConfig; - - auto kvCacheConfig = getKvCacheConfigFromParams(); - - bool enableChunkedContext = false; - try - { - enableChunkedContext = model_state_->GetParameter("enable_chunked_context"); - if (enableChunkedContext) - { - TLLM_LOG_WARNING( - "enable_chunked_context is set to true, will use context chunking " - "(requires building the model with use_paged_context_fmha)."); - } - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("enable_chunked_context is not specified, will be set to false."); - } - - auto 
schedulerConfig = getSchedulerConfigFromParams(enableChunkedContext); - - auto peftCacheConfig = getPeftCacheConfigFromParams(); - - auto parallelConfig = getParallelConfigFromParams(); - - std::optional decodingMode = std::nullopt; - try - { - std::string decodingModeStr = model_state_->GetParameter("decoding_mode"); - if (decodingModeStr == "top_k") - { - decodingMode = executor::DecodingMode::TopK(); - } - else if (decodingModeStr == "top_p") - { - decodingMode = executor::DecodingMode::TopP(); - } - else if (decodingModeStr == "top_k_top_p") - { - decodingMode = executor::DecodingMode::TopKTopP(); - } - else if (decodingModeStr == "beam_search") - { - decodingMode = executor::DecodingMode::BeamSearch(); - } - else if (decodingModeStr == "medusa") - { - decodingMode = executor::DecodingMode::Medusa(); - } - else - { - throw std::runtime_error(""); - } - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING( - "decoding_mode parameter is invalid or not specified" - "(must be one of the {top_k, top_p, top_k_top_p, beam_search, medusa})." - "Using default: top_k_top_p if max_beam_width == 1, beam_search otherwise"); - } - - executor::DecodingConfig decodingConfig(decodingMode); - - try - { - auto medusaChoices = model_state_->GetParameter("medusa_choices"); - decodingConfig.setMedusaChoices(medusaChoices); - } - catch (std::exception const& e) - { - if (decodingMode && decodingMode->isMedusa()) - { - TLLM_LOG_WARNING( - "medusa_choices parameter is not specified. " - "Will be using default mc_sim_7b_63 choices instead."); - } - } - - float gpuWeightsPercent = 1.0f; - try - { - gpuWeightsPercent = model_state_->GetParameter("gpu_weights_percent"); - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING("gpu_weights_percent parameter is not specified, will use default value of 1.0"); - } - - return executor::ExecutorConfig(maxBeamWidth, schedulerConfig, kvCacheConfig, enableChunkedContext, - normalizeLogProbs, iterStatsMaxIterations, requestStatsMaxIterations, batchingType, std::nullopt, - parallelConfig, peftCacheConfig, std::nullopt, std::nullopt, decodingConfig, gpuWeightsPercent); -} - -ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) - : model_state_(model_state) - , modelInstance_(triton_model_instance) -{ - - auto executorConfig = getExecutorConfigFromParams(); - -#ifdef TRITON_ENABLE_METRICS - custom_metrics_reporter_ = std::make_unique(); - custom_metrics_reporter_->InitializeReporter(model_state->GetModelName(), model_state->GetModelVersion(), - (executorConfig.getBatchingType() == executor::BatchingType::kSTATIC)); -#endif - - std::string decoderModelPath; - try - { - decoderModelPath = model_state_->GetParameter("gpt_model_path"); - TLLM_CHECK_WITH_INFO(std::filesystem::exists(decoderModelPath), - "Decoder (GPT) model path at %s does not exist.", decoderModelPath.c_str()); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("gpt_model_path is not specified, will be left empty"); - decoderModelPath = ""; - } - - std::string encoderModelPath; - try - { - encoderModelPath = model_state_->GetParameter("encoder_model_path"); - TLLM_CHECK_WITH_INFO(std::filesystem::exists(encoderModelPath), "Encoder model path at %s does not exist.", - encoderModelPath.c_str()); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("encoder_model_path is not specified, will be left empty"); - encoderModelPath = ""; - 
} - - TLLM_CHECK_WITH_INFO( - !decoderModelPath.empty() || !encoderModelPath.empty(), "Both encoder and decoder model paths are empty"); - - if (!decoderModelPath.empty()) - { - // Encoder-decoder model - if (!encoderModelPath.empty()) - { - mModelType = executor::ModelType::kENCODER_DECODER; - mExecutor - = std::make_unique(encoderModelPath, decoderModelPath, mModelType, executorConfig); - } - // Decoder only model - else - { - mModelType = executor::ModelType::kDECODER_ONLY; - mExecutor = std::make_unique(decoderModelPath, mModelType, executorConfig); - } - } - // Encoder only - else - { - mModelType = executor::ModelType::kENCODER_ONLY; - mExecutor = std::make_unique(encoderModelPath, mModelType, executorConfig); - } - - bool excludeInputInOutput = false; - try - { - excludeInputInOutput = model_state_->GetParameter("exclude_input_in_output"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("exclude_input_in_output is not specified, will be set to false"); - } - mInstanceSpecificConfig.excludeInputFromOutput = excludeInputInOutput; - - int cancellationCheckPeriodMs = 100; - try - { - cancellationCheckPeriodMs = model_state_->GetParameter("cancellation_check_period_ms"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("cancellation_check_period_ms is not specified, will be set to 100 (ms)"); - } - mInstanceSpecificConfig.cancellationCheckPeriodMs = cancellationCheckPeriodMs; - - int statsCheckPeriodMs = 100; - try - { - statsCheckPeriodMs = model_state_->GetParameter("stats_check_period_ms"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("stats_check_period_ms is not specified, will be set to 100 (ms)"); - } - mInstanceSpecificConfig.statsCheckPeriodMs = statsCheckPeriodMs; - - if (mExecutor->canEnqueueRequests()) - { - mStopWaitForResponse = false; - mWaitForResponseThread = std::thread(&ModelInstanceState::WaitForResponse, this); - - mStopWaitForStats = false; - mWaitForStatsThread = std::thread(&ModelInstanceState::WaitForStats, this); - - mStopWaitForCancel = false; - mWaitForCancelThread = std::thread(&ModelInstanceState::WaitForCancel, this); - } - else - { - // Shutdown the worker ranks which will cause them to wait for leader/orchestrator to terminate - mExecutor->shutdown(); - } -} - -void ModelInstanceState::sendEnqueueResponse(TRITONBACKEND_Request* request, TRITONSERVER_Error* error) -{ - TRITONBACKEND_ResponseFactory* factory; - LOG_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory, request), "failed to create triton response factory"); - TRITONBACKEND_Response* tritonResponse; - LOG_IF_ERROR(TRITONBACKEND_ResponseNewFromFactory(&tritonResponse, factory), "Failed to create response"); - LOG_IF_ERROR(TRITONBACKEND_ResponseSend(tritonResponse, TRITONSERVER_RESPONSE_COMPLETE_FINAL, error), - "Cannot send response"); - LOG_IF_ERROR(TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), "Cannot release request"); -} - -bool ModelInstanceState::handleStopRequest(TRITONBACKEND_Request* request, std::string const& tritonRequestId) -{ - bool stopRequest = utils::getRequestBooleanInputTensor(request, kStopInputTensorName); - if (!stopRequest) - { - return false; - } - - TRITONSERVER_Error* error = nullptr; - - try - { - if (tritonRequestId == "") - { - throw std::runtime_error("Trying to stop a request but request ID is not provided"); - } - std::lock_guard 
lock(mRequestIdToRequestDataMutex); - if (mTritonRequestIdToRequestId.count(tritonRequestId)) - { - auto requestId = mTritonRequestIdToRequestId[tritonRequestId]; - mExecutor->cancelRequest(requestId); - } - } - catch (std::exception const& e) - { - error = TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, e.what()); - } - // mTritonRequestIdToRequestId.count(tritonRequestId) == false doesn't necessary mean an error since the - // request to cancel may already be completed. - // Send an empty response to indicate the request has been successfully cancelled - sendEnqueueResponse(request, error); - return true; -} - -executor::Request ModelInstanceState::createExecutorRequest( - TRITONBACKEND_Request* request, bool excludeInputFromOutput, bool isDecoupled, executor::ModelType modelType) -{ - auto inputsTensors = utils::readInputsTensors(request); - bool streaming = utils::getRequestBooleanInputTensor(request, kStreamingInputTensorName); - return utils::createRequestFromInputTensors( - inputsTensors, excludeInputFromOutput, isDecoupled, streaming, modelType); -} - -void ModelInstanceState::enqueue(TRITONBACKEND_Request** requests, uint32_t const request_count) -{ - - uint64_t exec_start_ns{0}; - SET_TIMESTAMP(exec_start_ns); - - for (uint32_t i = 0; i < request_count; ++i) - { - TRITONBACKEND_Request* request = requests[i]; - - try - { - char const* charRequestId = nullptr; - TRITONBACKEND_RequestId(request, &charRequestId); - std::string tritonRequestId; - if (charRequestId != nullptr) - { - tritonRequestId = charRequestId; - } - - if (handleStopRequest(request, tritonRequestId)) - { - continue; - } - - auto executorRequest = createExecutorRequest( - request, mInstanceSpecificConfig.excludeInputFromOutput, isDecoupled(), mModelType); - - int64_t inputTokensSize = executorRequest.getInputTokenIds().size(); - executor::SizeType32 beamWidthCopy = executorRequest.getSamplingConfig().getBeamWidth(); - std::lock_guard lock(mRequestIdToRequestDataMutex); - uint64_t compute_start_ns{0}; - SET_TIMESTAMP(compute_start_ns); - auto requestId = mExecutor->enqueueRequest(executorRequest); - if (mRequestIdToRequestData.count(requestId)) - { - TLLM_LOG_ERROR( - "Executor returns a request ID that already exists. This shouldn't happen unless there is " - "something " - "wrong in TRT-LLM runtime."); - } - TRITONBACKEND_ResponseFactory* factory; - LOG_IF_ERROR( - TRITONBACKEND_ResponseFactoryNew(&factory, request), "failed to create triton response factory"); - - auto requestOutputNames = utils::getRequestOutputNames(request); - mRequestIdToRequestData.emplace(requestId, - RequestData{factory, request, tritonRequestId, inputTokensSize, beamWidthCopy, - std::move(requestOutputNames), {exec_start_ns, compute_start_ns, 0, 0}}); - if (tritonRequestId != "") - { - mTritonRequestIdToRequestId[tritonRequestId] = requestId; - } - } - catch (std::exception const& e) - { - sendEnqueueResponse(request, TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, e.what())); - } - } - return; -} - -TRITONSERVER_Error* ModelInstanceState::reportBaseMetrics(RequestData& requestData, TRITONSERVER_Error* error) -{ - auto& timestamps = requestData.timestamps; - SET_TIMESTAMP(timestamps.exec_end_ns); - - RETURN_IF_ERROR( - TRITONBACKEND_ModelInstanceReportStatistics(modelInstance_, requestData.tritonRequest, (error == nullptr), - timestamps.exec_start_ns, timestamps.compute_start_ns, timestamps.compute_end_ns, timestamps.exec_end_ns)); - - // For now we will assume a batch size of 1 for each request. 
This may change in the future but for - // now it seems that even when requests are dynamically batched together each workItem is associated - // with its own request object and is handled independently due to the nature of IFB. - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceReportBatchStatistics(modelInstance_, 1 /* batch size */, - timestamps.exec_start_ns, timestamps.compute_start_ns, timestamps.compute_end_ns, timestamps.exec_end_ns)); - - return nullptr; // success -} - -std::tuple ModelInstanceState::fillTritonResponse( - TRITONBACKEND_ResponseFactory* factory, executor::Response const& response, RequestData const& requestData) -{ - TRITONBACKEND_Response* tritonResponse; - LOG_IF_ERROR(TRITONBACKEND_ResponseNewFromFactory(&tritonResponse, factory), "Failed to create response"); - - TRITONSERVER_Error* error = nullptr; - bool isFinal = false; - try - { - if (!response.hasError()) - { - auto const& result = response.getResult(); - isFinal = result.isFinal; - error = nullptr; - auto outputIds = result.outputTokenIds; - std::vector beamLength(outputIds.size()); - int32_t maxBeamLength = -1; - for (size_t i = 0; i < outputIds.size(); ++i) - { - beamLength[i] = outputIds[i].size(); - maxBeamLength = std::max(beamLength[i], maxBeamLength); - } - if (maxBeamLength == -1) - { - TLLM_LOG_ERROR("Output ids is empty"); - maxBeamLength = 0; - } - for (auto& vec : outputIds) - { - vec.resize(maxBeamLength, -1); - } - - if (requestData.outputNames.count(OutputFieldsNames::outputIds) > 0) - { - std::vector outputIdsShape{1, static_cast(outputIds.size()), maxBeamLength}; - auto outputIdsType = TRITONSERVER_TYPE_INT32; - auto outputIdsBuffer = utils::getResponseBuffer( - tritonResponse, outputIdsShape, outputIdsType, OutputFieldsNames::outputIds); - utils::flatten(outputIds, outputIdsBuffer, outputIdsShape); - } - else - { - TLLM_THROW("%s tensor must be present in list of output tensors", OutputFieldsNames::outputIds); - } - - if (requestData.outputNames.count(OutputFieldsNames::sequenceLength) > 0) - { - std::vector sequenceLengthShape{1, static_cast(outputIds.size())}; - auto sequenceLengthType = TRITONSERVER_TYPE_INT32; - auto sequenceLengthBuffer = utils::getResponseBuffer( - tritonResponse, sequenceLengthShape, sequenceLengthType, OutputFieldsNames::sequenceLength); - utils::flatten(beamLength, sequenceLengthBuffer, sequenceLengthShape); - } - else - { - TLLM_THROW("%s tensor must be present in list of output tensors", OutputFieldsNames::sequenceLength); - } - - if (requestData.outputNames.count(OutputFieldsNames::contextLogits) > 0) - { - if (result.contextLogits.has_value()) - { - auto contextLogitsShapeOriginal = result.contextLogits.value().getShape(); - std::vector contextLogitsShape{ - 1, contextLogitsShapeOriginal[0], contextLogitsShapeOriginal[1]}; - auto contextLogitsType = TRITONSERVER_TYPE_FP32; - auto contextLogitsBuffer = utils::getResponseBuffer( - tritonResponse, contextLogitsShape, contextLogitsType, OutputFieldsNames::contextLogits); - utils::flatten(result.contextLogits.value(), contextLogitsBuffer, contextLogitsShape); - } - else - { - std::vector contextLogitsShape{1, 1, 1}; - auto contextLogitsType = TRITONSERVER_TYPE_FP32; - auto contextLogitsBuffer = utils::getResponseBuffer( - tritonResponse, contextLogitsShape, contextLogitsType, OutputFieldsNames::contextLogits); - utils::flatten(std::vector{0}, contextLogitsBuffer, contextLogitsShape); - } - } - - if (requestData.outputNames.count(OutputFieldsNames::generationLogits) > 0) - { - if (result.generationLogits.has_value()) - 
{ - auto generationLogitsShapeOriginal = result.generationLogits.value().getShape(); - std::vector generationLogitsShape{1, generationLogitsShapeOriginal[0], - generationLogitsShapeOriginal[1], generationLogitsShapeOriginal[2]}; - auto generationLogitsType = TRITONSERVER_TYPE_FP32; - auto generationLogitsBuffer = utils::getResponseBuffer(tritonResponse, generationLogitsShape, - generationLogitsType, OutputFieldsNames::generationLogits); - utils::flatten( - result.generationLogits.value(), generationLogitsBuffer, generationLogitsShape); - } - else - { - std::vector generationLogitsShape{1, 1, 1, 1}; - auto generationLogitsType = TRITONSERVER_TYPE_FP32; - auto generationLogitsBuffer = utils::getResponseBuffer(tritonResponse, generationLogitsShape, - generationLogitsType, OutputFieldsNames::generationLogits); - utils::flatten(std::vector{0}, generationLogitsBuffer, generationLogitsShape); - } - } - - if (requestData.outputNames.count(OutputFieldsNames::outputLogProbs) > 0) - { - if (result.logProbs.has_value()) - { - std::vector outputLogProbsShape{1, static_cast(result.logProbs.value().size()), - static_cast(result.logProbs.value()[0].size())}; - auto outputLogProbsType = TRITONSERVER_TYPE_FP32; - auto outputLogProbsBuffer = utils::getResponseBuffer( - tritonResponse, outputLogProbsShape, outputLogProbsType, OutputFieldsNames::outputLogProbs); - utils::flatten(result.logProbs.value(), outputLogProbsBuffer, outputLogProbsShape); - } - else - { - std::vector outputLogProbsShape{1, 1, requestData.inputTokensSize}; - auto outputLogProbsType = TRITONSERVER_TYPE_FP32; - auto outputLogProbsBuffer = utils::getResponseBuffer( - tritonResponse, outputLogProbsShape, outputLogProbsType, OutputFieldsNames::outputLogProbs); - utils::flatten( - std::vector(requestData.inputTokensSize), outputLogProbsBuffer, outputLogProbsShape); - } - } - - if (requestData.outputNames.count(OutputFieldsNames::cumLogProbs) > 0) - { - if (result.cumLogProbs.has_value()) - { - std::vector cumLogProbsShape{1, static_cast(result.cumLogProbs.value().size())}; - auto cumLogProbsType = TRITONSERVER_TYPE_FP32; - auto cumLogProbsBuffer = utils::getResponseBuffer( - tritonResponse, cumLogProbsShape, cumLogProbsType, OutputFieldsNames::cumLogProbs); - utils::flatten(result.cumLogProbs.value(), cumLogProbsBuffer, cumLogProbsShape); - } - else - { - std::vector cumLogProbsShape{1, 1}; - auto cumLogProbsType = TRITONSERVER_TYPE_FP32; - auto cumLogProbsBuffer = utils::getResponseBuffer( - tritonResponse, cumLogProbsShape, cumLogProbsType, OutputFieldsNames::cumLogProbs); - utils::flatten(std::vector{0}, cumLogProbsBuffer, cumLogProbsShape); - } - } - } - else - { - isFinal = true; - std::string errMsg = "Executor failed process requestId " + std::to_string(response.getRequestId()) - + " due to the following error: " + response.getErrorMsg(); - error = TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, errMsg.c_str()); - } - } - catch (std::exception const& e) - { - // In case of error while processing response, return response with error - isFinal = true; - std::string errMsg = "Error encountered while populating response: " + std::string(e.what()); - error = TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, errMsg.c_str()); - } - - return {tritonResponse, isFinal, error}; -} - -void ModelInstanceState::WaitForResponse() -{ - while (!mStopWaitForResponse) - { - std::chrono::milliseconds waitTime(1); - auto responses = mExecutor->awaitResponses(waitTime); - uint64_t compute_end_ns{0}; - SET_TIMESTAMP(compute_end_ns); - - for (auto const& 
response : responses) - { - auto requestId = response.getRequestId(); - RequestData requestData; - { - std::lock_guard lock(mRequestIdToRequestDataMutex); - if (!mRequestIdToRequestData.count(requestId)) - { - TLLM_LOG_ERROR("Unexpected response for a request ID that is not active"); - continue; - } - requestData = mRequestIdToRequestData[requestId]; - } - - auto factory = requestData.factory; - - auto [tritonResponse, isFinal, error] = fillTritonResponse(factory, response, requestData); - - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend(tritonResponse, isFinal ? TRITONSERVER_RESPONSE_COMPLETE_FINAL : 0, error), - "Cannot send response"); - - if (isFinal) - { - std::lock_guard lock(mRequestIdToRequestDataMutex); - if (requestData.tritonRequestId != "") - { - mTritonRequestIdToRequestId.erase(requestData.tritonRequestId); - } - - requestData.timestamps.compute_end_ns = compute_end_ns; - LOG_IF_ERROR(reportBaseMetrics(requestData, error), "Error reporting metrics"); - - LOG_IF_ERROR(TRITONBACKEND_RequestRelease(requestData.tritonRequest, TRITONSERVER_REQUEST_RELEASE_ALL), - "Cannot release request"); - LOG_IF_ERROR(TRITONBACKEND_ResponseFactoryDelete(factory), "Cannot delete response factory"); - mRequestIdToRequestData.erase(requestId); - } - } - } -} - -void ModelInstanceState::WaitForStats() -{ - while (!mStopWaitForStats) - { - std::this_thread::sleep_for(std::chrono::milliseconds(mInstanceSpecificConfig.statsCheckPeriodMs)); - auto stats = mExecutor->getLatestIterationStats(); - for (auto const& stat : stats) - { - std::string statJson = "{"; - statJson.append("\"Active Request Count\":" + std::to_string(stat.numActiveRequests) + ","); - statJson.append("\"Iteration Counter\":" + std::to_string(stat.iter) + ","); - statJson.append("\"Max Request Count\":" + std::to_string(stat.maxNumActiveRequests) + ","); - statJson.append("\"Runtime CPU Memory Usage\":" + std::to_string(stat.cpuMemUsage) + ","); - statJson.append("\"Runtime GPU Memory Usage\":" + std::to_string(stat.gpuMemUsage) + ","); - statJson.append("\"Runtime Pinned Memory Usage\":" + std::to_string(stat.pinnedMemUsage) + ","); - statJson.append("\"Timestamp\":" + ("\"" + stat.timestamp + "\"") + ","); - - if (stat.inflightBatchingStats.has_value()) - { - auto const& modelStats = stat.inflightBatchingStats.value(); - statJson.append("\"Context Requests\":" + std::to_string(modelStats.numContextRequests) + ","); - statJson.append("\"Generation Requests\":" + std::to_string(modelStats.numGenRequests) + ","); - statJson.append("\"MicroBatch ID\":" + std::to_string(modelStats.microBatchId) + ","); - statJson.append("\"Paused Requests\":" + std::to_string(modelStats.numPausedRequests) + ","); - statJson.append("\"Scheduled Requests\":" + std::to_string(modelStats.numScheduledRequests) + ","); - statJson.append("\"Total Context Tokens\":" + std::to_string(modelStats.numCtxTokens) + ","); - } - else if (stat.staticBatchingStats.has_value()) - { - auto const& modelStats = stat.staticBatchingStats.value(); - statJson.append("\"Context Requests\":" + std::to_string(modelStats.numContextRequests) + ","); - statJson.append("\"Scheduled Requests\":" + std::to_string(modelStats.numScheduledRequests) + ","); - statJson.append("\"Total Context Tokens\":" + std::to_string(modelStats.numCtxTokens) + ","); - statJson.append("\"Total Generation Tokens\":" + std::to_string(modelStats.numGenTokens) + ","); - statJson.append("\"Empty Generation Slots\":" + std::to_string(modelStats.emptyGenSlots) + ","); - } - else - { - TLLM_LOG_ERROR("Missing stats"); - 
continue; - } - - if (stat.kvCacheStats.has_value()) - { - auto const& kvStats = stat.kvCacheStats.value(); - statJson.append("\"Free KV cache blocks\":" + std::to_string(kvStats.freeNumBlocks) + ","); - statJson.append("\"Max KV cache blocks\":" + std::to_string(kvStats.maxNumBlocks) + ","); - statJson.append("\"Tokens per KV cache block\":" + std::to_string(kvStats.tokensPerBlock) + ","); - statJson.append("\"Used KV cache blocks\":" + std::to_string(kvStats.usedNumBlocks) + ","); - } - - statJson.back() = '}'; - - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, statJson.c_str()); -#ifdef TRITON_ENABLE_METRICS - LOG_IF_ERROR(custom_metrics_reporter_->UpdateCustomMetrics(statJson), "Failed updating TRT LLM statistics"); -#endif - } - } -} - -void ModelInstanceState::WaitForCancel() -{ - while (!mStopWaitForCancel) - { - std::this_thread::sleep_for(std::chrono::milliseconds(mInstanceSpecificConfig.cancellationCheckPeriodMs)); - std::lock_guard lock(mRequestIdToRequestDataMutex); - for (auto const& pair : mRequestIdToRequestData) - { - auto const& requestId = pair.first; - auto const& requestData = pair.second; - bool isCancelled = false; - LOG_IF_ERROR(TRITONBACKEND_ResponseFactoryIsCancelled(requestData.factory, &isCancelled), - "Failed to query factory status"); - if (isCancelled) - { - mExecutor->cancelRequest(requestId); - } - } - } -} - -} // namespace triton::backend::inflight_batcher_llm diff --git a/inflight_batcher_llm/src/model_instance_state.h b/inflight_batcher_llm/src/model_instance_state.h deleted file mode 100644 index cc630a61..00000000 --- a/inflight_batcher_llm/src/model_instance_state.h +++ /dev/null @@ -1,220 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
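The deleted model_instance_state.cc above drives three background polling loops (WaitForResponse, WaitForStats, WaitForCancel), each gated by a stop flag that the destructor flips before joining the thread. A minimal standalone sketch of that pattern, assuming only the C++ standard library and hypothetical names (PollingWorker, poll_fn); it is not the backend's actual class:

#include <atomic>
#include <chrono>
#include <cstdio>
#include <functional>
#include <thread>

class PollingWorker
{
public:
    // Start a thread that invokes poll_fn once per `period` until stopped.
    PollingWorker(std::function<void()> poll_fn, std::chrono::milliseconds period)
        : mStop(false)
        , mThread(
              [this, poll_fn = std::move(poll_fn), period]
              {
                  while (!mStop)
                  {
                      std::this_thread::sleep_for(period);
                      poll_fn();
                  }
              })
    {
    }

    // Mirrors the deleted destructor: flip the stop flag, then join the thread.
    ~PollingWorker()
    {
        mStop = true;
        mThread.join();
    }

private:
    std::atomic<bool> mStop;
    std::thread mThread;
};

int main()
{
    // Stand-in for WaitForStats(): poll every 100 ms until the worker is destroyed.
    PollingWorker statsWorker([] { std::puts("polling stats"); }, std::chrono::milliseconds(100));
    std::this_thread::sleep_for(std::chrono::milliseconds(350));
    return 0; // ~PollingWorker stops and joins the thread
}

The deleted code checks plain bool flags once per period; the sketch uses std::atomic<bool> so the flag flip is well-defined across threads, but the shutdown behavior is otherwise the same.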
- -#pragma once - -#include "triton/backend/backend_common.h" -#include "triton/core/tritonbackend.h" -#include "triton/core/tritonserver.h" - -#include "tensorrt_llm/batch_manager/callbacks.h" -#include "tensorrt_llm/batch_manager/kvCacheConfig.h" -#include "tensorrt_llm/batch_manager/namedTensor.h" -#include "tensorrt_llm/executor/types.h" - -#include "model_state.h" - -#ifdef TRITON_ENABLE_METRICS -#include "custom_metrics_reporter/custom_metrics_reporter.h" -#endif - -#include -#include -#include - -using namespace tensorrt_llm; -using namespace tensorrt_llm::batch_manager; - -namespace triton::backend::inflight_batcher_llm -{ - -/// @brief Struct to hold configs that is will be used later when creating the executor requests -struct InstanceSpecificConfig -{ - bool excludeInputFromOutput; - int cancellationCheckPeriodMs; - int statsCheckPeriodMs; -}; - -/// @brief Timestamps for each request, used to report Triton metrics -struct Timestamps -{ - uint64_t exec_start_ns = 0; - uint64_t compute_start_ns = 0; - uint64_t compute_end_ns = 0; - uint64_t exec_end_ns = 0; - - void Reset() - { - exec_start_ns = 0; - compute_start_ns = 0; - compute_end_ns = 0; - exec_end_ns = 0; - } -}; - -/// @brief Per-request data stored for handling requests -struct RequestData -{ - TRITONBACKEND_ResponseFactory* factory; - TRITONBACKEND_Request* tritonRequest; - std::string tritonRequestId; - int64_t inputTokensSize; - executor::SizeType32 beamWidth; - std::unordered_set outputNames; - Timestamps timestamps; -}; - -// -// ModelInstanceState -// State associated with a model instance. An object of this class is -// created and associated with each -// TRITONBACKEND_ModelInstance. ModelInstanceState is derived from -// -class ModelInstanceState -{ - using InferenceRequest = tensorrt_llm::batch_manager::InferenceRequest; - using NamedTensor = tensorrt_llm::batch_manager::NamedTensor; - -public: - // number of cpu workers used to move weights host cache to gpu cache - static constexpr executor::SizeType32 kPeftCacheNumEnsureWorkers = 4; - // number of cuda streams used for H2D copies of peft cache pages - static constexpr executor::SizeType32 kPeftCacheNumCopyStreams = 4; - // number of cpu workers used to load weight into host cache - static constexpr executor::SizeType32 kPeftCacheNumPutWorkers = 4; - - /// @brief Create a ModelInstanceObject - static TRITONSERVER_Error* Create( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, ModelInstanceState** state); - - virtual ~ModelInstanceState() - { - mStopWaitForResponse = true; - mWaitForResponseThread.join(); - - mStopWaitForStats = true; - mWaitForStatsThread.join(); - - mStopWaitForCancel = true; - mWaitForCancelThread.join(); - } - - // Get the state of the model that corresponds to this instance. 
- ModelState* StateForModel() const - { - return model_state_; - } - - bool isDecoupled() const - { - return model_state_->IsDecoupled(); - } - - /// @brief Add the request to the executor - void enqueue(TRITONBACKEND_Request** requests, uint32_t const request_count); - -private: - /// @brief Get batching type - executor::BatchingType getBatchingTypeFromParams(); - - /// @brief Get kv cache config - executor::KvCacheConfig getKvCacheConfigFromParams(); - - /// @brief Get scheduler config - executor::SchedulerConfig getSchedulerConfigFromParams(bool enableChunkedContext); - - /// @brief Get peft config - executor::PeftCacheConfig getPeftCacheConfigFromParams(); - - /// @brief Get parallel config - executor::ParallelConfig getParallelConfigFromParams(); - - /// @brief Get executor config - executor::ExecutorConfig getExecutorConfigFromParams(); - - /// @brief Constructor - ModelInstanceState(ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance); - - ModelState* model_state_; - TRITONBACKEND_ModelInstance* modelInstance_; - - /// @brief Send a response during enqueue - void sendEnqueueResponse(TRITONBACKEND_Request* request, TRITONSERVER_Error* error); - - /// @brief Cancel a request - bool handleStopRequest(TRITONBACKEND_Request* request, std::string const& tritonRequestId); - - /// @brief Create an executor::Request from input tensors - static executor::Request createExecutorRequest( - TRITONBACKEND_Request* request, bool excludeInputFromOutput, bool isDecoupled, executor::ModelType modelType); - - /// @brief Fill in a triton response based on executor response - std::tuple fillTritonResponse( - TRITONBACKEND_ResponseFactory* factory, executor::Response const& response, RequestData const& requestData); - - /// @brief TRT-LLM Executor that handles requests - std::unique_ptr mExecutor; - /// @brief Config to be used when sending requests to executor - InstanceSpecificConfig mInstanceSpecificConfig; - - /// @brief Report Triton base metrics for a given request - TRITONSERVER_Error* reportBaseMetrics(RequestData& requestData, TRITONSERVER_Error* error); - - /// @brief Retrieve responses from the executor - void WaitForResponse(); - /// @brief The thread for WaitForResponse() to run - std::thread mWaitForResponseThread; - /// @brief Flag to stop the WaitForResponse thread when the model instance is being destroyed - bool mStopWaitForResponse; - - /// @brief Retrieve stats from the executor - void WaitForStats(); - /// @brief The thread for WaitForStats() to run - std::thread mWaitForStatsThread; - /// @brief Flag to stop the WaitForStats thread when the model instance is being destroyed - bool mStopWaitForStats; - - /// @brief Cancel a request for executor if it is marked as cancelled by Triton backend - void WaitForCancel(); - /// @brief The thread for WaitForCancel() to run - std::thread mWaitForCancelThread; - /// @brief Flag to stop the WaitForCancel thread when the model instance is being destroyed - bool mStopWaitForCancel; - - std::unordered_map mRequestIdToRequestData; - std::unordered_map mTritonRequestIdToRequestId; - std::mutex mRequestIdToRequestDataMutex; - - // The type of model (encoder-only, decoder-only, encoder-decoder) - executor::ModelType mModelType; - -#ifdef TRITON_ENABLE_METRICS - std::unique_ptr custom_metrics_reporter_; -#endif -}; - -} // namespace triton::backend::inflight_batcher_llm diff --git a/inflight_batcher_llm/src/model_state.cc b/inflight_batcher_llm/src/model_state.cc deleted file mode 100644 index d0539311..00000000 --- 
a/inflight_batcher_llm/src/model_state.cc +++ /dev/null @@ -1,283 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "model_state.h" - -#include "utils.h" - -#include - -namespace triton::backend::inflight_batcher_llm -{ - -TRITONSERVER_Error* ModelState::Create( - TRITONBACKEND_Model* triton_model, std::string const& name, uint64_t const version, ModelState** state) -{ - TRITONSERVER_Message* config_message; - RETURN_IF_ERROR(TRITONBACKEND_ModelConfig(triton_model, 1 /* config_version */, &config_message)); - // We can get the model configuration as a json string from - // config_message, parse it with our favorite json parser to create - // DOM that we can access when we need to example the - // configuration. We use TritonJson, which is a wrapper that returns - // nice errors (currently the underlying implementation is - // rapidjson... but others could be added). You can use any json - // parser you prefer. 
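Throughout the deleted model_state.cc and model_instance_state.cc, options are read via GetParameter<T>() inside a try/catch that logs a warning and falls back to a default when the option is absent. A simplified, hypothetical sketch of that idiom, with a plain string map standing in for the parsed model-config parameters (getParameterOr and ParameterMap are illustrative names, not the backend's API):

#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

using ParameterMap = std::map<std::string, std::string>;

std::string getRawParameter(ParameterMap const& params, std::string const& name)
{
    auto it = params.find(name);
    if (it == params.end())
    {
        throw std::runtime_error("Cannot find parameter with name: " + name);
    }
    return it->second;
}

// Typed accessor with a default: mirrors the try/catch + warning idiom above.
template <typename T, typename Convert>
T getParameterOr(ParameterMap const& params, std::string const& name, T fallback, Convert convert)
{
    try
    {
        return convert(getRawParameter(params, name));
    }
    catch (std::exception const& e)
    {
        std::fprintf(stderr, "%s is not specified, using default value\n", name.c_str());
        return fallback;
    }
}

int main()
{
    ParameterMap params{{"max_beam_width", "4"}};

    int maxBeamWidth = getParameterOr<int>(
        params, "max_beam_width", 1, [](std::string const& s) { return std::stoi(s); });
    float kvCacheFraction = getParameterOr<float>(
        params, "kv_cache_free_gpu_mem_fraction", 0.9f, [](std::string const& s) { return std::stof(s); });

    std::printf("max_beam_width=%d kv_cache_free_gpu_mem_fraction=%.2f\n", maxBeamWidth, kvCacheFraction);
    return 0;
}

The parameter names and defaults shown (max_beam_width defaulting to 1, kv_cache_free_gpu_mem_fraction defaulting to 0.9) come from the deleted sources; everything else in the sketch is illustrative.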
- char const* buffer; - size_t byte_size; - RETURN_IF_ERROR(TRITONSERVER_MessageSerializeToJson(config_message, &buffer, &byte_size)); - - common::TritonJson::Value model_config; - TRITONSERVER_Error* err = model_config.Parse(buffer, byte_size); - RETURN_IF_ERROR(TRITONSERVER_MessageDelete(config_message)); - RETURN_IF_ERROR(err); - - try - { - *state = new ModelState(triton_model, name, version, std::move(model_config)); - } - catch (std::exception const& ex) - { - std::string errStr = std::string("unexpected error when creating modelState: ") + ex.what(); - return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, errStr.c_str()); - } - - return nullptr; // success -} - -void ModelState::LoadParameters() -{ - // Check if model is in decoupled mode: - triton::common::TritonJson::Value transaction_policy; - model_config_.MemberAsObject("model_transaction_policy", &transaction_policy); - transaction_policy.MemberAsBool("decoupled", &is_decoupled_); - - try - { - gpu_device_ids_ = GetParameter>("gpu_device_ids"); - - if (gpu_device_ids_) - { - std::string deviceIdInfo("Using GPU device ids: "); - for (auto const& deviceId : gpu_device_ids_.value()) - { - deviceIdInfo += std::to_string(deviceId) + " "; - } - TLLM_LOG_INFO(deviceIdInfo); - } - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("gpu_device_ids is not specified, will be automatically set"); - } -} - -common::TritonJson::Value& ModelState::GetModelConfig() -{ - return model_config_; -} - -std::string const& ModelState::GetModelName() const -{ - return model_name_; -} - -uint64_t ModelState::GetModelVersion() const -{ - return model_version_; -} - -std::string const ModelState::GetExecutorWorkerPath() -{ - - // Check if worker_path is specified, if so throw an error - try - { - auto workerPath = GetParameter("worker_path"); - TLLM_THROW( - "worker_path parameter is specified, but this is no longer supported. 
Please specify executor_worker_path " - "instead to specify the location of the trtllmExecutorWorker executable."); - } - catch (std::exception const& e) - { - } - - std::string executorWorkerPath = "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"; - try - { - executorWorkerPath = GetParameter("executor_worker_path"); - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING("executor_worker_path is not specified, will use default value"); - } - - return executorWorkerPath; -} - -std::vector ModelState::serialize() const -{ - // model name - // model version - // model config - size_t totalSize = 3; - - int nameSize = (model_name_.size() + sizeof(int64_t)) / sizeof(int64_t); - totalSize += nameSize; - - TritonJson::WriteBuffer buffer; - model_config_.Write(&buffer); - - totalSize += buffer.Size(); - - std::vector packed(totalSize); - int64_t* ptr = packed.data(); - - *ptr++ = model_name_.size(); - std::memcpy(ptr, model_name_.c_str(), model_name_.size()); - ptr += nameSize; - - *ptr++ = model_version_; - *ptr++ = buffer.Size(); - std::memcpy(ptr, buffer.Base(), buffer.Size()); - - return packed; -} - -ModelState ModelState::deserialize(int64_t const* packed_ptr) -{ - auto const nameSize = *packed_ptr++; - char const* cname = reinterpret_cast(packed_ptr); - packed_ptr += (nameSize + sizeof(int64_t)) / sizeof(int64_t); - - uint64_t const version = *packed_ptr++; - - auto const jsonSize = *packed_ptr++; - char const* jsonBuffer = reinterpret_cast(packed_ptr); - common::TritonJson::Value model_config; - TRITONSERVER_Error* err = model_config.Parse(jsonBuffer, jsonSize); - if (err) - { - throw std::runtime_error("Failed to parse model config"); - } - - return ModelState{nullptr, cname, version, std::move(model_config)}; -} - -ModelState ModelState::deserialize(std::vector const& packed) -{ - return ModelState::deserialize(packed.data()); -} - -template <> -std::string ModelState::GetParameter(std::string const& name) -{ - TritonJson::Value parameters; - TRITONSERVER_Error* err = model_config_.MemberAsObject("parameters", ¶meters); - if (err != nullptr) - { - throw std::runtime_error("Model config doesn't have a parameters section"); - TRITONSERVER_ErrorDelete(err); - } - TritonJson::Value value; - std::string str_value; - err = parameters.MemberAsObject(name.c_str(), &value); - if (err != nullptr) - { - std::string errStr = "Cannot find parameter with name: " + name; - throw std::runtime_error(errStr); - TRITONSERVER_ErrorDelete(err); - } - value.MemberAsString("string_value", &str_value); - return str_value; -} - -template <> -int32_t ModelState::GetParameter(std::string const& name) -{ - return std::stoi(GetParameter(name)); -} - -template <> -std::vector ModelState::GetParameter>(std::string const& name) -{ - auto deviceIdsStr = GetParameter(name); - // Parse as comma delimited string - return utils::csvStrToVecInt(deviceIdsStr); -} - -template <> -uint32_t ModelState::GetParameter(std::string const& name) -{ - return (uint32_t) std::stoul(GetParameter(name)); -} - -template <> -int64_t ModelState::GetParameter(std::string const& name) -{ - return std::stoll(GetParameter(name)); -} - -template <> -uint64_t ModelState::GetParameter(std::string const& name) -{ - return std::stoull(GetParameter(name)); -} - -template <> -float ModelState::GetParameter(std::string const& name) -{ - return std::stof(GetParameter(name)); -} - -template <> -bool ModelState::GetParameter(std::string const& name) -{ - auto val = GetParameter(name); - if (val == "True" || val == "true" || val == "TRUE" 
|| val == "1") - { - return true; - } - else if (val == "False" || val == "false" || val == "FALSE" || val == "0") - { - return false; - } - else - { - std::string err = "Cannot convert " + val + " to a boolean."; - throw std::runtime_error(err); - } -} - -template <> -std::vector> ModelState::GetParameter>>(std::string const& name) -{ - auto str = GetParameter(name); - // Parse as comma delimited string and {} as array bounders - return utils::csvStrToVecVecInt(str); -} - -} // namespace triton::backend::inflight_batcher_llm diff --git a/inflight_batcher_llm/src/model_state.h b/inflight_batcher_llm/src/model_state.h deleted file mode 100644 index fdd68de9..00000000 --- a/inflight_batcher_llm/src/model_state.h +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#pragma once - -#include "tensorrt_llm/common/logger.h" -#include "tensorrt_llm/plugins/api/tllmPlugin.h" -#include "tensorrt_llm/runtime/tllmLogger.h" - -#include "triton/backend/backend_common.h" -#include "triton/core/tritonbackend.h" -#include "triton/core/tritonserver.h" - -#include - -using namespace ::triton::common; // TritonJson - -namespace triton::backend::inflight_batcher_llm -{ - -// ModelState -// -// State associated with a model that is using this backend. An object -// of this class is created and associated with each -// TRITONBACKEND_Model. 
- -class ModelState -{ -public: - static TRITONSERVER_Error* Create( - TRITONBACKEND_Model* triton_model, std::string const& name, uint64_t const version, ModelState** state); - - template - T GetParameter(std::string const& name) - { - assert(false); - auto dummy = T(); - return dummy; - } - - virtual ~ModelState() = default; - - common::TritonJson::Value& GetModelConfig(); - std::string const& GetModelName() const; - uint64_t GetModelVersion() const; - std::string const GetExecutorWorkerPath(); - - std::optional> GetDeviceIds() - { - return gpu_device_ids_; - } - - bool IsDecoupled() const - { - return is_decoupled_; - } - - [[nodiscard]] std::vector serialize() const; - - static ModelState deserialize(int64_t const* packed_ptr); - - static ModelState deserialize(std::vector const& packed); - -private: - std::string const model_name_; - uint64_t model_version_; - common::TritonJson::Value model_config_; - std::shared_ptr mTrtLogger{}; - - // model parameters - std::optional> gpu_device_ids_; - bool is_decoupled_ = false; - - void LoadParameters(); - -public: - ModelState( - TRITONBACKEND_Model* triton_model, std::string const& name, uint64_t version, TritonJson::Value&& model_config) - : model_name_(name) - , model_version_(version) - , model_config_(std::move(model_config)) - { - mTrtLogger = std::make_shared(); - initTrtLlmPlugins(mTrtLogger.get()); - - LoadParameters(); - } -}; - -template <> -std::string ModelState::GetParameter(std::string const& name); - -template <> -int32_t ModelState::GetParameter(std::string const& name); - -template <> -uint32_t ModelState::GetParameter(std::string const& name); - -template <> -int64_t ModelState::GetParameter(std::string const& name); - -template <> -uint64_t ModelState::GetParameter(std::string const& name); - -template <> -float ModelState::GetParameter(std::string const& name); - -template <> -bool ModelState::GetParameter(std::string const& name); - -template <> -std::vector ModelState::GetParameter>(std::string const& name); - -template <> -std::vector> ModelState::GetParameter>>(std::string const& name); - -} // namespace triton::backend::inflight_batcher_llm diff --git a/inflight_batcher_llm/src/utils.cc b/inflight_batcher_llm/src/utils.cc deleted file mode 100644 index bb611c5a..00000000 --- a/inflight_batcher_llm/src/utils.cc +++ /dev/null @@ -1,620 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "utils.h" - -using namespace tensorrt_llm::batch_manager; - -namespace triton::backend::inflight_batcher_llm::utils -{ - -nvinfer1::DataType to_trt_datatype(TRITONSERVER_DataType data_type) -{ - if (data_type == TRITONSERVER_TYPE_INVALID) - { - assert(false); - } - else if (data_type == TRITONSERVER_TYPE_BOOL) - { - return nvinfer1::DataType::kBOOL; - } - else if (data_type == TRITONSERVER_TYPE_UINT8) - { - return nvinfer1::DataType::kUINT8; - } - else if (data_type == TRITONSERVER_TYPE_UINT16) - { - assert(false); - } - else if (data_type == TRITONSERVER_TYPE_UINT32) - { - return nvinfer1::DataType::kINT32; - } - else if (data_type == TRITONSERVER_TYPE_UINT64) - { - return nvinfer1::DataType::kINT64; - } - else if (data_type == TRITONSERVER_TYPE_INT8) - { - return nvinfer1::DataType::kINT8; - } - else if (data_type == TRITONSERVER_TYPE_INT16) - { - assert(false); - } - else if (data_type == TRITONSERVER_TYPE_INT32) - { - return nvinfer1::DataType::kINT32; - } - else if (data_type == TRITONSERVER_TYPE_INT64) - { - return nvinfer1::DataType::kINT64; - } - else if (data_type == TRITONSERVER_TYPE_FP16) - { - return nvinfer1::DataType::kHALF; - } - else if (data_type == TRITONSERVER_TYPE_FP32) - { - return nvinfer1::DataType::kFLOAT; - } - else if (data_type == TRITONSERVER_TYPE_FP64) - { - assert(false); - } - else if (data_type == TRITONSERVER_TYPE_BYTES) - { - return nvinfer1::DataType::kINT8; - } - else if (data_type == TRITONSERVER_TYPE_BF16) - { - return nvinfer1::DataType::kBF16; - } - else - { - assert(false); - } - return nvinfer1::DataType(0); -} - -std::unordered_map readInputsTensors(TRITONBACKEND_Request* request) -{ - std::unordered_map inputsTensors; - uint32_t num_inputs; - LOG_IF_ERROR(TRITONBACKEND_RequestInputCount(request, &num_inputs), "Error getting input count"); - for (uint32_t idx = 0; idx < num_inputs; ++idx) - { - TRITONBACKEND_Input* input = nullptr; - LOG_IF_ERROR(TRITONBACKEND_RequestInputByIndex(request, idx, &input), "Error getting input index"); - - char const* input_name = nullptr; - TRITONSERVER_DataType data_type = TRITONSERVER_TYPE_INVALID; - int64_t const* shape = nullptr; - uint32_t dims_count = 0; - uint64_t byte_size = 0; - uint32_t buffer_count = 0; - LOG_IF_ERROR(TRITONBACKEND_InputProperties( - input, &input_name, &data_type, &shape, &dims_count, &byte_size, &buffer_count), - "Error getting input properties"); - - if (std::string(input_name) == "START" || std::string(input_name) == "CORRID" - || std::string(input_name) == "END" || std::string(input_name) == kStopInputTensorName - || std::string(input_name) == kStreamingInputTensorName) - { - continue; - } - - std::vector shapev; - for (uint32_t i = 0; i < dims_count; ++i) - { - shapev.push_back(shape[i]); - } - - NamedTensor t(utils::to_trt_datatype(data_type), shapev, input_name); - uint64_t buffer_offset = 0; - for (int64_t buffer_id = 0; buffer_id < buffer_count; ++buffer_id) - { - void const* buffer = nullptr; - uint64_t buffer_byte_size = 0; - 
TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU; - int64_t memory_type_id = 0; - LOG_IF_ERROR( - TRITONBACKEND_InputBuffer(input, buffer_id, &buffer, &buffer_byte_size, &memory_type, &memory_type_id), - "failed to get input buffer"); - assert((memory_type == TRITONSERVER_MEMORY_CPU) || (memory_type == TRITONSERVER_MEMORY_CPU_PINNED)); - std::memcpy(static_cast(t.tensor->data()) + buffer_offset, buffer, buffer_byte_size); - buffer_offset += buffer_byte_size; - } - - inputsTensors.insert(make_pair(t.name, std::move(t))); - } - return inputsTensors; -} - -uint64_t getRequestId(TRITONBACKEND_Request* request, std::unordered_map& requestIdStrMap) -{ - char const* charRequestId; - TRITONBACKEND_RequestId(request, &charRequestId); - uint64_t requestId = 0; - if (charRequestId != nullptr) - { - std::string strRequestId(charRequestId); - if (!strRequestId.empty()) - { - try - { - requestId = stoul(strRequestId); - } - catch (std::exception const& e) - { - std::hash hasher; - requestId = hasher(strRequestId); - - // Check for hash collisions - // If requestID already exists in the map with the same string, increment the ID and check again - for (auto it = requestIdStrMap.find(requestId); - it != requestIdStrMap.end() && it->second != strRequestId;) - { - requestId++; - } - } - requestIdStrMap.insert({requestId, strRequestId}); - } - } - - return requestId; -} - -std::unordered_set getRequestOutputNames(TRITONBACKEND_Request* request) -{ - std::unordered_set outputNames; - uint32_t outputCount; - LOG_IF_ERROR(TRITONBACKEND_RequestOutputCount(request, &outputCount), "Error getting request output count"); - for (size_t i = 0; i < outputCount; ++i) - { - char const* name; - LOG_IF_ERROR(TRITONBACKEND_RequestOutputName(request, i, &name), "Error getting request output name"); - std::string name_s(name); - outputNames.insert(std::move(name_s)); - } - return outputNames; -} - -bool getRequestBooleanInputTensor(TRITONBACKEND_Request* request, std::string const& inputTensorName) -{ - // Get stop signal from the request - TRITONBACKEND_Input* input; - TRITONSERVER_Error* error = TRITONBACKEND_RequestInput(request, inputTensorName.c_str(), &input); - if (error) - { - // If the user does not provide input "stop", then regard the request as - // unstopped - std::string msg - = "ModelInstanceState::getRequestBooleanInputTensor: user " - "did not not provide " - + inputTensorName + " input for the request"; - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, msg.c_str()); - TRITONSERVER_ErrorDelete(error); - return false; - } - - uint64_t input_byte_size = 0; - uint32_t buffer_count = 0; - TRITONBACKEND_InputProperties(input, nullptr, nullptr, nullptr, nullptr, &input_byte_size, &buffer_count); - - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, - ("ModelInstanceState::getRequestStopSignal: buffer_count = " + std::to_string(buffer_count)).c_str()); - - void const* buffer = 0L; - uint64_t buffer_byte_size = 0; - TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU; - int64_t memory_type_id = 0; - TRITONBACKEND_InputBuffer(input, 0, &buffer, &buffer_byte_size, &memory_type, &memory_type_id); - - assert((memory_type == TRITONSERVER_MEMORY_CPU) || (memory_type == TRITONSERVER_MEMORY_CPU_PINNED)); - - bool boolean = *reinterpret_cast(buffer); - - return boolean; -} - -std::string sparseListToStr(executor::VecTokens const& sparseList) -{ - std::string buffer; - for (auto v : sparseList) - { - buffer.append(std::to_string(v) + " "); - } - return buffer; -} - -std::list convertWordList(executor::VecTokens const& 
sparseList) -{ - std::list convertedList; - int32_t n = sparseList.size(); - TLLM_CHECK_WITH_INFO(n % 2 == 0, "Sparse list must not have odd length: " + sparseListToStr(sparseList)); - int32_t numTokens = n / 2; - int32_t currentIndex = 0; - for (auto i = numTokens; i < n; ++i) - { - if (sparseList[i] == -1) - { - for (auto j = i + 1; j < n; ++j) - { - TLLM_CHECK_WITH_INFO( - sparseList[j] == -1, "Sparse list must not have additional -1s: " + sparseListToStr(sparseList)); - } - break; - } - TLLM_CHECK_WITH_INFO(sparseList[i] <= numTokens, - "Sparse list must not have out-of-bound offsets: " + sparseListToStr(sparseList)); - if (i != numTokens) - { - TLLM_CHECK_WITH_INFO(sparseList[i] > sparseList[i - 1], - "Sparse list must not have non-increasing offsets: " + sparseListToStr(sparseList)); - } - executor::VecTokens currentWords; - while (currentIndex < sparseList[i]) - { - currentWords.push_back(sparseList[currentIndex]); - ++currentIndex; - } - convertedList.push_back(currentWords); - } - return convertedList; -} - -void squeezeTensor(std::shared_ptr const& tensor, int32_t expectedNumDims) -{ - auto shape = tensor->getShape(); - if (shape.nbDims == expectedNumDims) - { - return; - } - if (shape.nbDims == expectedNumDims + 1 && shape.d[0] == 1) - { - --shape.nbDims; - for (int32_t i = 0; i < expectedNumDims; ++i) - { - shape.d[i] = shape.d[i + 1]; - } - tensor->reshape(shape); - } - else - { - TLLM_LOG_ERROR("Unexpected prompt tensor shape"); - } -} - -std::vector csvStrToVecInt(std::string const& str) -{ - TLLM_CHECK_WITH_INFO(!str.empty(), "Cannot convert empty string to vector of vector of ints"); - - std::vector output; - std::stringstream ss(str); - while (ss.good()) - { - std::string substr; - ss >> std::ws; - getline(ss, substr, ','); - if (substr.empty()) - { - break; - } - output.push_back(std::stoi(substr)); - } - TLLM_CHECK_WITH_INFO(!output.empty(), "Empty vector"); - return output; -} - -std::vector> csvStrToVecVecInt(std::string const& str) -{ - TLLM_CHECK_WITH_INFO(!str.empty(), "Cannot convert empty string to vector of vector of ints"); - - std::vector> output; - std::stringstream ss(str); - - while (true) - { - std::string substr; - getline(ss, substr, '}'); - if (substr.empty() || ss.eof()) - { - break; - } - if (substr[0] == '{') - { - // Remove the opening bracket from the content - substr = substr.substr(1); - } - output.push_back(csvStrToVecInt(substr)); - // Ignore the comma and any whitespace - ss >> std::ws; - ss.ignore(std::numeric_limits::max(), ','); - ss >> std::ws; - } - TLLM_CHECK_WITH_INFO(!output.empty(), "Empty vector of vector"); - return output; -} - -int64_t numElements(std::vector const& shape) -{ - int64_t n = 1; - for (auto d : shape) - { - n *= d; - } - return n; -} - -executor::SamplingConfig getSamplingConfigFromTensors(InputTensors const& inputsTensors) -{ - int32_t beamWidth = 1; - // If beam_width is specified, set it from config.pbtxt - extractSingleton(inputsTensors, InputFieldsNames::beamWidth, beamWidth); - - std::optional topK{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::topK, topK); - - std::optional topP{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::topP, topP); - if (topP.has_value() && topP.value() <= 0.F) - { - topP.reset(); - } - - std::optional topPMin{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::topPMin, topPMin); - - std::optional topPDecay{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::topPDecay, topPDecay); - - 
std::optional topPResetIds{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::topPResetIds, topPResetIds); - - std::optional temperature{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::temperature, temperature); - - std::optional lengthPenalty{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::lengthPenalty, lengthPenalty); - - std::optional earlyStopping{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::earlyStopping, earlyStopping); - - std::optional repetitionPenalty{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::repetitionPenalty, repetitionPenalty); - - std::optional minLength{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::minLength, minLength); - - std::optional beamSearchDiversityRate{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::beamSearchDiversityRate, beamSearchDiversityRate); - - std::optional presencePenalty{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::presencePenalty, presencePenalty); - - std::optional frequencyPenalty{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::frequencyPenalty, frequencyPenalty); - - std::optional randomSeed{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::randomSeed, randomSeed); - - return executor::SamplingConfig(beamWidth, topK, topP, topPMin, topPResetIds, topPDecay, randomSeed, temperature, - minLength, beamSearchDiversityRate, repetitionPenalty, presencePenalty, frequencyPenalty, lengthPenalty, - earlyStopping); -} - -executor::OutputConfig getOutputConfigFromTensors(InputTensors const& inputsTensors) -{ - bool returnLogProbs{false}; - extractSingleton(inputsTensors, InputFieldsNames::returnLogProbs, returnLogProbs); - - bool returnGenerationLogits{false}; - extractSingleton(inputsTensors, InputFieldsNames::returnGenerationLogits, returnGenerationLogits); - - bool returnContextLogits{false}; - extractSingleton(inputsTensors, InputFieldsNames::returnContextLogits, returnContextLogits); - - // Note that currently excludeInputFromOutput is set from the backend parameters. 
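The bad_words_list / stop_words_list tensors decoded by convertWordList above use a sparse layout: the first half of the flattened tensor is the concatenation of all token ids, and the second half is the list of strictly increasing end offsets, padded with -1 (commonly shipped as a [2, num_tokens] tensor). A small Python sketch of that round trip, assuming well-formed word lists with at least one token per word; the helper names are illustrative, not part of the backend.

import numpy as np

def encode_word_list(words):
    # words: list of token-id lists, e.g. [[5, 6], [7]].
    flat = [token for word in words for token in word]
    offsets = np.cumsum([len(word) for word in words]).tolist()
    offsets += [-1] * (len(flat) - len(offsets))  # pad so both halves match
    return np.array([flat, offsets], dtype=np.int32)

def decode_word_list(sparse):
    # Inverse direction; follows the checks in utils::convertWordList.
    flat, offsets = sparse[0].tolist(), sparse[1].tolist()
    words, start = [], 0
    for end in offsets:
        if end == -1:
            break
        words.append(flat[start:end])
        start = end
    return words

assert decode_word_list(encode_word_list([[5, 6], [7]])) == [[5, 6], [7]]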
- return executor::OutputConfig(returnLogProbs, returnContextLogits, returnGenerationLogits); -} - -std::optional getExternalDraftTokensConfigFromTensors( - InputTensors const& inputsTensors) -{ - std::optional externalDraftTokensConfig = std::nullopt; - - if (inputsTensors.count(InputFieldsNames::draftInputs)) - { - executor::VecTokens draftInputs; - extractVector(inputsTensors, InputFieldsNames::draftInputs, draftInputs); - - std::optional draftLogits = std::nullopt; - if (inputsTensors.count(InputFieldsNames::draftLogits)) - { - std::shared_ptr originaldraftLogitsTensor - = inputsTensors.at(InputFieldsNames::draftLogits).tensor; - utils::squeezeTensor(originaldraftLogitsTensor, 2); - draftLogits = executor::detail::ofITensor(originaldraftLogitsTensor); - } - - std::optional draftAcceptanceThreshold{std::nullopt}; - utils::extractOptionalSingleton( - inputsTensors, InputFieldsNames::draftAcceptanceThreshold, draftAcceptanceThreshold); - - externalDraftTokensConfig - = executor::ExternalDraftTokensConfig(draftInputs, draftLogits, draftAcceptanceThreshold); - } - return externalDraftTokensConfig; -} - -std::optional getPromptTuningConfigFromTensors(InputTensors const& inputsTensors) -{ - std::optional pTuningConfig = std::nullopt; - if (inputsTensors.count(InputFieldsNames::promptEmbeddingTable)) - { - std::shared_ptr originalTensor - = inputsTensors.at(InputFieldsNames::promptEmbeddingTable).tensor; - utils::squeezeTensor(originalTensor, 2); - auto const& executorTensor = executor::detail::ofITensor(originalTensor); - pTuningConfig = executor::PromptTuningConfig(executorTensor); - } - return pTuningConfig; -} - -std::optional getLoraConfigFromTensors(InputTensors const& inputsTensors) -{ - std::optional loraConfig = std::nullopt; - if (inputsTensors.count(InputFieldsNames::loraTaskId)) - { - uint64_t taskId; - if (!utils::extractSingleton(inputsTensors, InputFieldsNames::loraTaskId, taskId)) - { - throw std::runtime_error("failed to extract lora task id"); - } - - std::optional loraConfigTensor{std::nullopt}; - if (inputsTensors.count(InputFieldsNames::loraConfig)) - { - std::shared_ptr originalLoraConfigTensor - = inputsTensors.at(InputFieldsNames::loraConfig).tensor; - utils::squeezeTensor(originalLoraConfigTensor, 2); - loraConfigTensor = executor::detail::ofITensor(originalLoraConfigTensor); - } - - std::optional loraWeightsTensor{std::nullopt}; - if (inputsTensors.count(InputFieldsNames::loraWeights)) - { - std::shared_ptr originalLoraWeightsTensor - = inputsTensors.at(InputFieldsNames::loraWeights).tensor; - utils::squeezeTensor(originalLoraWeightsTensor, 2); - loraWeightsTensor = executor::detail::ofITensor(originalLoraWeightsTensor); - } - - loraConfig = executor::LoraConfig(taskId, loraWeightsTensor, loraConfigTensor); - } - return loraConfig; -} - -executor::Request createRequestFromInputTensors(std::unordered_map const& inputsTensors, - bool excludeInputFromOutput, bool isDecoupled, bool streaming, executor::ModelType modelType) -{ - executor::OutputConfig outConfig = utils::getOutputConfigFromTensors(inputsTensors); - outConfig.excludeInputFromOutput = excludeInputFromOutput; - - executor::VecTokens inputTokens; - if (!utils::extractVector(inputsTensors, InputFieldsNames::inputTokens, inputTokens)) - { - TLLM_THROW("%s is not present in the request.", InputFieldsNames::inputTokens); - } - executor::SizeType32 maxNewTokens; - if (!utils::extractSingleton(inputsTensors, InputFieldsNames::maxNewTokens, maxNewTokens)) - { - throw std::runtime_error("request_output_len is not 
present in the request"); - } - - std::optional endId{std::nullopt}; - utils::extractOptionalSingleton(inputsTensors, InputFieldsNames::endId, endId); - - std::optional padId{std::nullopt}; - utils::extractOptionalSingleton(inputsTensors, InputFieldsNames::padId, padId); - - std::optional encoderInputTokens{std::nullopt}; - if (modelType == executor::ModelType::kENCODER_ONLY || modelType == executor::ModelType::kENCODER_DECODER) - { - encoderInputTokens = inputTokens; - - // If encoder-decoder, check if decoder tokens are specified - if (modelType == executor::ModelType::kENCODER_DECODER) - { - if (!utils::extractVector(inputsTensors, InputFieldsNames::decoderInputTokens, inputTokens)) - { - if (padId) - { - TLLM_LOG_WARNING( - "%s is not present in the request for encoder-decoder model. The decoder input tokens will be " - "set to " - "[padId]", - InputFieldsNames::decoderInputTokens); - inputTokens = {padId.value()}; - } - else - { - TLLM_LOG_WARNING("%s is not present in the request for encoder-decoder model", - InputFieldsNames::decoderInputTokens); - inputTokens.clear(); - } - } - } - } - - if (streaming && !isDecoupled) - { - throw std::runtime_error( - "Streaming is only supported if model is " - "deployed using decoupled mode."); - } - - auto samplingConfig = utils::getSamplingConfigFromTensors(inputsTensors); - - std::optional> badWords = std::nullopt; - executor::VecTokens badWordsRaw; - if (utils::extractVector(inputsTensors, InputFieldsNames::badWords, badWordsRaw)) - { - badWords = utils::convertWordList(badWordsRaw); - } - - std::optional> stopWords = std::nullopt; - executor::VecTokens stopWordsRaw; - if (utils::extractVector(inputsTensors, InputFieldsNames::stopWords, stopWordsRaw)) - { - stopWords = utils::convertWordList(stopWordsRaw); - } - - std::optional embeddingBias{std::nullopt}; - if (inputsTensors.count(InputFieldsNames::embeddingBias)) - { - std::shared_ptr originalTensor = inputsTensors.at(InputFieldsNames::embeddingBias).tensor; - utils::squeezeTensor(originalTensor, 1); - auto newShape = originalTensor->getShape(); - if (!(newShape.nbDims == 1 && newShape.d[0] == 0)) - { - embeddingBias = executor::detail::ofITensor(originalTensor); - } - } - - auto pTuningConfig = utils::getPromptTuningConfigFromTensors(inputsTensors); - - auto loraConfig = utils::getLoraConfigFromTensors(inputsTensors); - - auto externalDraftTokensConfig = utils::getExternalDraftTokensConfigFromTensors(inputsTensors); - - return executor::Request(inputTokens, maxNewTokens, streaming, samplingConfig, outConfig, endId, padId, badWords, - stopWords, embeddingBias, externalDraftTokensConfig, pTuningConfig, loraConfig, std::nullopt, - encoderInputTokens); -} - -} // namespace triton::backend::inflight_batcher_llm::utils diff --git a/inflight_batcher_llm/src/utils.h b/inflight_batcher_llm/src/utils.h deleted file mode 100644 index 6d7ea384..00000000 --- a/inflight_batcher_llm/src/utils.h +++ /dev/null @@ -1,287 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. 
-// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#pragma once - -#include "NvInfer.h" -#include "tensorrt_llm/batch_manager/inferenceRequest.h" -#include "tensorrt_llm/common/logger.h" -#include "tensorrt_llm/runtime/tllmLogger.h" -#include "triton/backend/backend_common.h" -#include "triton/core/tritonbackend.h" -#include "triton/core/tritonserver.h" -#include -#include -#include - -using namespace tensorrt_llm; - -namespace triton::backend::inflight_batcher_llm -{ - -/// @brief Names of input fields -struct InputFieldsNames -{ - static constexpr char const* inputTokens = "input_ids"; - static constexpr char const* decoderInputTokens = "decoder_input_ids"; - static constexpr char const* maxNewTokens = "request_output_len"; - static constexpr char const* endId = "end_id"; - static constexpr char const* padId = "pad_id"; - static constexpr char const* badWords = "bad_words_list"; - static constexpr char const* stopWords = "stop_words_list"; - static constexpr char const* embeddingBias = "embedding_bias"; - - // OutputConfig - static constexpr char const* returnLogProbs = "return_log_probs"; - static constexpr char const* returnGenerationLogits = "return_generation_logits"; - static constexpr char const* returnContextLogits = "return_context_logits"; - - // SamplingConfig - static constexpr char const* beamWidth = "beam_width"; - static constexpr char const* topK = "runtime_top_k"; - static constexpr char const* topP = "runtime_top_p"; - static constexpr char const* topPMin = "runtime_top_k_min"; - static constexpr char const* topPDecay = "runtime_top_p_decay"; - static constexpr char const* topPResetIds = "runtime_top_p_reset_ids"; - static constexpr char const* temperature = "temperature"; - static constexpr char const* lengthPenalty = "len_penalty"; - static constexpr char const* earlyStopping = "early_stopping"; - static constexpr char const* repetitionPenalty = "repetition_penalty"; - static constexpr char const* minLength = "min_length"; - static constexpr char const* beamSearchDiversityRate = "beam_search_diversity_rate"; - static constexpr char const* presencePenalty = "presence_penalty"; - static constexpr char const* frequencyPenalty = "frequency_penalty"; - static constexpr char const* randomSeed = "random_seed"; - - // PromptTuningConfig - static constexpr char const* promptEmbeddingTable = "prompt_embedding_table"; - - // LoraConfig - static constexpr char const* loraTaskId = "lora_task_id"; - static constexpr char const* loraWeights = "lora_weights"; - static constexpr char const* loraConfig = "lora_config"; - - // ExternalDraftTokensConfig - static 
constexpr char const* draftInputs = "draft_input_ids"; - static constexpr char const* draftLogits = "draft_logits"; - static constexpr char const* draftAcceptanceThreshold = "draft_acceptance_threshold"; -}; - -/// @brief Names of output fields -struct OutputFieldsNames -{ - static constexpr char const* outputIds = "output_ids"; - static constexpr char const* sequenceLength = "sequence_length"; - static constexpr char const* contextLogits = "context_logits"; - static constexpr char const* generationLogits = "generation_logits"; - static constexpr char const* outputLogProbs = "output_log_probs"; - static constexpr char const* cumLogProbs = "cum_log_probs"; -}; - -inline static std::string const kStopInputTensorName = "stop"; -inline static std::string const kStreamingInputTensorName = "streaming"; - -namespace utils -{ - -/// @brief Convert Triton datatype to TRT datatype -nvinfer1::DataType to_trt_datatype(TRITONSERVER_DataType data_type); - -using InputTensors = std::unordered_map; - -/// @brief Gather input tenors in a Triton request -/// @return An unordered map with key being input name and value being input tensor -InputTensors readInputsTensors(TRITONBACKEND_Request* request); - -/// @brief Construct executor::SampleConfig from input tensors -executor::SamplingConfig getSamplingConfigFromTensors(InputTensors const& inputsTensors); - -/// @brief Construct executor::OutputConfig from input tensors -executor::OutputConfig getOutputConfigFromTensors(InputTensors const& inputsTensors); - -/// @brief Construct executor::ExternalDraftTokensConfig from input tensors -std::optional getExternalDraftTokensConfigFromTensors( - InputTensors const& inputsTensors); - -/// @brief Construct executor::PromptTuningConfig from input tensors -std::optional getPromptTuningConfigFromTensors(InputTensors const& inputsTensors); - -/// @brief Construct executor::LoraConfig from input tensors -std::optional getLoraConfigFromTensors(InputTensors const& inputsTensors); - -/// @brief Construct executor::Request from input tensors -executor::Request createRequestFromInputTensors( - std::unordered_map const& inputsTensors, - bool excludeInputFromOutput, bool isDecoupled, bool streaming, executor::ModelType modelType); - -/// @brief get the requestId of the request and update requestIdStrMap -/// @return Returns 0 if not specified. Throws an error if request_id cannot be convert to uint64_t -uint64_t getRequestId(TRITONBACKEND_Request* request, std::unordered_map& requestIdStrMap); - -/// @brief Get the requested output names -std::unordered_set getRequestOutputNames(TRITONBACKEND_Request* request); - -/// @brief Get the value of a boolean tensor -bool getRequestBooleanInputTensor(TRITONBACKEND_Request* request, std::string const& inputTensorName); - -/// @brief Get a single value tensor from the input tensors -/// @return true if the value is found else false -template -bool extractSingleton(std::unordered_map const& params, - std::string const& name, Value& value) -{ - if (!params.count(name)) - { - return false; - } - auto const& tensor = params.at(name); - TLLM_CHECK_WITH_INFO(tensor.tensor->getSize() == 1, "Invalid size for tensor " + name); - value = *(static_cast(tensor.tensor->data())); - return true; -} - -/// @brief Get a single value tensor from the input tensors and put it into an optional. Set to std::nullopt if it's not -/// found. 
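The extractSingleton / extractVector helpers defined below encode the per-request convention used throughout utils.cc: each optional field listed in InputFieldsNames arrives as its own small input tensor, a missing tensor leaves the corresponding executor option unset, and a present scalar tensor must hold exactly one element. A minimal Python sketch of the same convention, assuming numpy arrays keyed by the tensor names above; the function names here are illustrative, not part of the backend.

import numpy as np

def extract_singleton(inputs, name, default=None):
    # Counterpart of utils::extractSingleton / extractOptionalSingleton:
    # the tensor must hold exactly one element; a missing name means "unset".
    if name not in inputs:
        return default
    tensor = inputs[name]
    assert tensor.size == 1, "Invalid size for tensor " + name
    return tensor.reshape(-1)[0].item()

def extract_vector(inputs, name):
    # Counterpart of utils::extractVector: flatten to a 1-D Python list.
    return inputs[name].reshape(-1).tolist() if name in inputs else None

request = {
    "input_ids": np.array([[1, 2, 3, 4]], dtype=np.int32),
    "request_output_len": np.array([[64]], dtype=np.int32),
    "temperature": np.array([[0.8]], dtype=np.float32),
}
assert extract_vector(request, "input_ids") == [1, 2, 3, 4]
assert extract_singleton(request, "request_output_len") == 64
assert extract_singleton(request, "runtime_top_k") is None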
-template -void extractOptionalSingleton(std::unordered_map const& params, - std::string const& name, std::optional& optionalValue) -{ - Value value; - if (extractSingleton(params, name, value)) - { - optionalValue = value; - } - else - { - optionalValue = std::nullopt; - } -} - -/// @brief Get a 1d tensor from the input tensors -/// @return true if the tensor is found else false -template -bool extractVector(std::unordered_map const& params, - std::string const& name, std::vector& value) -{ - if (!params.count(name)) - { - return false; - } - auto const& tensor = params.at(name); - int64_t n = tensor.tensor->getSize(); - value.resize(n); - for (int64_t i = 0; i < n; ++i) - { - value[i] = static_cast(tensor.tensor->data())[i]; - } - return true; -} - -int64_t numElements(std::vector const& shape); - -/// @brief Flatten the vector and copy into the buffer -template -void flatten(std::vector const& vec, void* buffer, std::vector const& expectedShape) -{ - TLLM_CHECK_WITH_INFO(static_cast(vec.size()) == numElements(expectedShape), - "Trying to flatten a tensor with unexpected size"); - T* typedBuffer = static_cast(buffer); - std::copy(vec.begin(), vec.end(), typedBuffer); -} - -/// @brief Flatten the vector of vector and copy into the buffer -template -void flatten(std::vector> const& vec, void* buffer, std::vector const& expectedShape) -{ - T* typedBuffer = static_cast(buffer); - int64_t copiedSize = 0; - for (auto const& innerVec : vec) - { - TLLM_CHECK_WITH_INFO(innerVec.size() == vec.at(0).size(), - "The vector of vector to be flattened has mismatched sizes in its inner vectors"); - copiedSize += innerVec.size(); - typedBuffer = std::copy(innerVec.begin(), innerVec.end(), typedBuffer); - } - TLLM_CHECK_WITH_INFO(copiedSize == numElements(expectedShape), "Trying to flatten a tensor with unexpected size"); -} - -/// @brief Flatten the tensor and copy into the buffer -template -void flatten(tensorrt_llm::executor::Tensor const& tensor, void* buffer, std::vector const& expectedShape) -{ - TLLM_CHECK_WITH_INFO(static_cast(tensor.getSize()) == numElements(expectedShape), - "Trying to flatten a tensor with unexpected size"); - Value* typedBuffer = static_cast(buffer); - Value const* ptr = static_cast(tensor.getData()); - std::copy(ptr, ptr + tensor.getSize(), typedBuffer); -} - -/// @brief Query Triton for a buffer that can be used to pass the output tensors -template -void* getResponseBuffer(TRITONBACKEND_Response* tritonResponse, std::vector const& shape, - TRITONSERVER_DataType dtype, std::string const& name) -{ - TRITONBACKEND_Output* output; - TRITONSERVER_Error* err{nullptr}; - err = TRITONBACKEND_ResponseOutput(tritonResponse, &output, name.c_str(), dtype, shape.data(), shape.size()); - if (err != nullptr) - { - auto errMsg = TRITONSERVER_ErrorMessage(err); - TLLM_THROW("Could not get response output for output tensor %s: %s", name.c_str(), errMsg); - } - - TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU; - int64_t memory_type_id = 0; - uint64_t size = 1; - for (auto s : shape) - { - size *= s; - } - auto buffersize = size * sizeof(T); - void* tritonBuffer = 0L; - err = TRITONBACKEND_OutputBuffer(output, &tritonBuffer, buffersize, &memory_type, &memory_type_id); - if (err != nullptr) - { - auto errMsg = TRITONSERVER_ErrorMessage(err); - TLLM_THROW("Could not get output buffer for output tensor %s: %s", name.c_str(), errMsg); - } - return tritonBuffer; -} - -/// @brief Convert a sparse tensor to a list of VecTokens -std::list convertWordList(executor::VecTokens const& 
sparseList); - -/// @brief Remove the additional size 1 dimension for tensor -void squeezeTensor(std::shared_ptr const& tensor, int32_t expectedNumDims); - -/// Helper functions to parse a csv delimited string to a vector ints -std::vector csvStrToVecInt(std::string const& str); - -/// Helper functions to parse a csv delimited string to a vector of vector ints -std::vector> csvStrToVecVecInt(std::string const& str); - -} // namespace utils -} // namespace triton::backend::inflight_batcher_llm diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index de1735f5..00000000 --- a/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -regex -fire -tritonclient[all] -transformers==4.36.1 -pandas -tabulate diff --git a/scripts/launch_triton_server.py b/scripts/launch_triton_server.py deleted file mode 100644 index e0dcc2ef..00000000 --- a/scripts/launch_triton_server.py +++ /dev/null @@ -1,114 +0,0 @@ -import argparse -import os -import subprocess -import sys -from pathlib import Path - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument('--world_size', - type=int, - default=1, - help='world size, only support tensor parallelism now') - parser.add_argument( - '--tritonserver', - type=str, - help='path to the tritonserver exe', - default='/opt/tritonserver/bin/tritonserver', - ) - parser.add_argument( - '--grpc_port', - type=str, - help='tritonserver grpc port', - default='8001', - ) - parser.add_argument( - '--http_port', - type=str, - help='tritonserver http port', - default='8000', - ) - parser.add_argument( - '--metrics_port', - type=str, - help='tritonserver metrics port', - default='8002', - ) - parser.add_argument( - '--force', - '-f', - action='http://github.com/store_true', - help='launch tritonserver regardless of other instances running') - parser.add_argument('--log', - action='http://github.com/store_true', - help='log triton server stats into log_file') - parser.add_argument( - '--log-file', - type=str, - help='path to triton log gile', - default='triton_log.txt', - ) - - path = str(Path(__file__).parent.absolute()) + '/../all_models/gpt' - parser.add_argument('--model_repo', type=str, default=path) - - parser.add_argument( - '--tensorrt_llm_model_name', - type=str, - help= - 'Name(s) of the tensorrt_llm Triton model in the repo. 
Use comma to separate if multiple model names', - default='tensorrt_llm', - ) - - parser.add_argument( - '--multi-model', - action='http://github.com/store_true', - help= - 'Enable support for multiple TRT-LLM models in the Triton model repository' - ) - - return parser.parse_args() - - -def get_cmd(world_size, tritonserver, grpc_port, http_port, metrics_port, - model_repo, log, log_file, tensorrt_llm_model_name): - cmd = ['mpirun', '--allow-run-as-root'] - for i in range(world_size): - cmd += ['-n', '1', tritonserver, f'--model-repository={model_repo}'] - if log and (i == 0): - cmd += ['--log-verbose=3', f'--log-file={log_file}'] - # If rank is not 0, skip loading of models other than `tensorrt_llm_model_name` - if (i != 0): - cmd += ['--model-control-mode=explicit'] - model_names = tensorrt_llm_model_name.split(',') - for name in model_names: - cmd += [f'--load-model={name}'] - cmd += [ - f'--grpc-port={grpc_port}', f'--http-port={http_port}', - f'--metrics-port={metrics_port}', '--disable-auto-complete-config', - f'--backend-config=python,shm-region-prefix-name=prefix{i}_', ':' - ] - return cmd - - -if __name__ == '__main__': - args = parse_arguments() - res = subprocess.run(['pgrep', '-r', 'R', 'tritonserver'], - capture_output=True, - encoding='utf-8') - if res.stdout: - pids = res.stdout.replace('\n', ' ').rstrip() - msg = f'tritonserver process(es) already found with PID(s): {pids}.\n\tUse `kill {pids}` to stop them.' - if args.force: - print(msg, file=sys.stderr) - else: - raise RuntimeError(msg + ' Or use --force.') - cmd = get_cmd(int(args.world_size), args.tritonserver, args.grpc_port, - args.http_port, args.metrics_port, args.model_repo, args.log, - args.log_file, args.tensorrt_llm_model_name) - env = os.environ.copy() - if args.multi_model: - assert args.world_size == 1, 'World size must be 1 when using multi-model. Processes will be spawned automatically to run the multi-GPU models' - env['TRTLLM_ORCHESTRATOR'] = '1' - subprocess.Popen(cmd, env=env) diff --git a/tensorrt_llm b/tensorrt_llm index 9691e12b..31116825 160000 --- a/tensorrt_llm +++ b/tensorrt_llm @@ -1 +1 @@ -Subproject commit 9691e12bce7ae1c126c435a049eb516eb119486c +Subproject commit 31116825b39f4e6a6a1e127001f5204b73d1dc32 diff --git a/tools/__init__.py b/tools/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tools/fill_template.py b/tools/fill_template.py deleted file mode 100644 index 0524f9ef..00000000 --- a/tools/fill_template.py +++ /dev/null @@ -1,38 +0,0 @@ -#! /usr/bin/env python3 -from argparse import ArgumentParser -from string import Template - - -def main(file_path, substitutions, in_place): - with open(file_path) as f: - pbtxt = Template(f.read()) - - sub_dict = {} - for sub in substitutions.split(","): - key, value = sub.split(":") - sub_dict[key] = value - - pbtxt = pbtxt.safe_substitute(sub_dict) - - if in_place: - with open(file_path, "w") as f: - f.write(pbtxt) - else: - print(pbtxt) - - -if __name__ == "__main__": - parser = ArgumentParser() - parser.add_argument("file_path", help="path of the .pbtxt to modify") - parser.add_argument( - "substitutions", - help= - "substitutions to perform, in the format variable_name_1:value_1,variable_name_2:value_2..." 
- ) - parser.add_argument("--in_place", - "-i", - action="http://github.com/store_true", - help="do the operation in-place") - args = parser.parse_args() - - main(**vars(args)) diff --git a/tools/gpt/benchmark_core_model.py b/tools/gpt/benchmark_core_model.py deleted file mode 100644 index fdb4e93a..00000000 --- a/tools/gpt/benchmark_core_model.py +++ /dev/null @@ -1,178 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import argparse -import statistics as s -from builtins import range -from datetime import datetime - -import numpy as np -from utils import utils - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') - parser.add_argument('-w', - '--warm_up', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable warm_up before benchmark') - parser.add_argument('-c', - '--concurrency', - type=int, - default=1, - required=False, - help='Specify concurrency') - parser.add_argument('-p', - '--request_parallelism', - type=int, - default=10, - required=False, - help='Specify request parallelism') - parser.add_argument('-m', - '--mode', - type=str, - required=False, - default='sync', - help='Mode ("sync"/"async").') - parser.add_argument('-b', - '--batch_size', - type=int, - default=8, - required=False, - help='Specify batch size') - parser.add_argument('-beam', - '--beam_width', - type=int, - default=1, - required=False, - help='Specify beam width') - parser.add_argument('-topk', - '--topk', - type=int, - default=1, - required=False, - help='topk for sampling') - parser.add_argument('-topp', - '--topp', - type=float, - default=0.0, - required=False, - help='topp for sampling') - parser.add_argument('-s', - '--start_len', - type=int, - default=8, - required=False, - help='Specify input length') - parser.add_argument('-o', - '--output_len', - type=int, - default=10, - required=False, - help='Specify output length') - parser.add_argument( - '-n', - '--num_runs', - type=int, - default=1, - required=False, - help="Spedifty number of runs to get the average latency") - - FLAGS = parser.parse_args() - if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print( - "unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) - exit(1) - - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - input_start_ids = np.random.randint(0, - 50255, - size=(FLAGS.batch_size, - FLAGS.start_len), - dtype=np.int32) - input_len = np.array([[input_start_ids.shape[1]] - for _ in range(input_start_ids.shape[0])], np.int32) - inputs = utils.prepare_inputs(input_start_ids, - input_len, - pad_id=0, - end_id=2, - flags=FLAGS) - - # warm up - if FLAGS.warm_up: - print("[INFO] sending requests to warm up") - with utils.create_inference_server_client( - FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - utils.send_requests('tensorrt_llm', - inputs, - client, - request_parallelism=2) - - latencies 
= [] - for i in range(FLAGS.num_runs): - start_time = datetime.now() - - with utils.create_inference_server_client( - FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - if FLAGS.mode == 'sync': - utils.send_requests('tensorrt_llm', inputs, client, - FLAGS.request_parallelism) - else: - if FLAGS.protocol == "http": - async_requests = utils.send_requests_async( - 'tensorrt_llm', inputs, client, FLAGS, - FLAGS.request_parallelism) - results = utils.get_http_results(async_requests) - else: - user_data = utils.send_requests_async( - 'tensorrt_llm', inputs, client, FLAGS, - FLAGS.request_parallelism) - results = utils.get_grpc_results(user_data, - FLAGS.request_parallelism) - - stop_time = datetime.now() - latencies.append((stop_time - start_time).total_seconds() * 1000.0 / - FLAGS.request_parallelism) - - if FLAGS.num_runs > 1: - latency = s.mean(latencies) - else: - latency = latencies[0] - latency = round(latency, 3) - throughput = round(1000 / latency * FLAGS.batch_size, 3) - print( - f"[INFO] Batch size: {FLAGS.batch_size}, Start len: {FLAGS.start_len}, Output len: {FLAGS.output_len}" - ) - print(f"[INFO] Latency: {latency} ms") - print(f"[INFO] Throughput: {throughput} sentences / sec") diff --git a/tools/gpt/client.py b/tools/gpt/client.py deleted file mode 100644 index 4c7973a7..00000000 --- a/tools/gpt/client.py +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import argparse -from datetime import datetime - -import numpy as np -from transformers import AutoTokenizer -from utils import utils - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. 
Default is "http".') - parser.add_argument( - '-t', - '--text', - type=str, - required=False, - default='Born in north-east France, Soyer trained as a', - help='Input text') - parser.add_argument('-c', - '--concurrency', - type=int, - default=1, - required=False, - help='Specify concurrency') - parser.add_argument('-beam', - '--beam_width', - type=int, - default=1, - required=False, - help='Specify beam width') - parser.add_argument('-topk', - '--topk', - type=int, - default=1, - required=False, - help='topk for sampling') - parser.add_argument('-topp', - '--topp', - type=float, - default=0.0, - required=False, - help='topp for sampling') - parser.add_argument('-o', - '--output_len', - type=int, - default=10, - required=False, - help='Specify output length') - parser.add_argument('--tokenizer_dir', - type=str, - required=True, - help='Specify tokenizer directory') - - FLAGS = parser.parse_args() - if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print( - "unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) - exit(1) - - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - - tokenizer = AutoTokenizer.from_pretrained(FLAGS.tokenizer_dir, - legacy=False, - padding_side='left') - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - pad_id = tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0] - end_id = tokenizer.encode(tokenizer.eos_token, add_special_tokens=False)[0] - - line = tokenizer.encode(FLAGS.text) - input_start_ids = np.array([line], np.int32) - input_len = np.array([[len(line)]], np.int32) - inputs = utils.prepare_inputs(input_start_ids, input_len, pad_id, end_id, - FLAGS) - - start_time = datetime.now() - - with utils.create_inference_server_client(FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - results = utils.send_requests('tensorrt_llm', - inputs, - client, - request_parallelism=1) - output_ids = results[0].as_numpy("output_ids") - - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - print(f"[INFO] Latency: {latency} ms") - - output_ids = output_ids.reshape( - (output_ids.size, )).tolist()[input_start_ids.shape[1]:] - output_text = tokenizer.decode(output_ids) - print(f'Input: {FLAGS.text}') - print(f'Output: {output_text}') diff --git a/tools/gpt/client_async.py b/tools/gpt/client_async.py deleted file mode 100644 index b2530f3a..00000000 --- a/tools/gpt/client_async.py +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import argparse -from datetime import datetime - -import numpy as np -import tritonclient.grpc as grpcclient -import tritonclient.http as httpclient -from transformers import AutoTokenizer -from utils import utils - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. 
Default is "http".') - parser.add_argument( - '-t', - '--text', - type=str, - required=False, - default='Born in north-east France, Soyer trained as a', - help='Input text') - parser.add_argument('-c', - '--concurrency', - type=int, - default=1, - required=False, - help='Specify concurrency') - parser.add_argument('-beam', - '--beam_width', - type=int, - default=1, - required=False, - help='Specify beam width') - parser.add_argument('-topk', - '--topk', - type=int, - default=1, - required=False, - help='topk for sampling') - parser.add_argument('-topp', - '--topp', - type=float, - default=0.0, - required=False, - help='topp for sampling') - parser.add_argument('-o', - '--output_len', - type=int, - default=10, - required=False, - help='Specify output length') - parser.add_argument('--tokenizer_dir', - type=str, - required=True, - help='Specify tokenizer directory') - - FLAGS = parser.parse_args() - if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print( - "unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) - exit(1) - - client_util = httpclient if FLAGS.protocol == "http" else grpcclient - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - - tokenizer = AutoTokenizer.from_pretrained(FLAGS.tokenizer_dir, - legacy=False, - padding_side='left') - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - pad_id = tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0] - end_id = tokenizer.encode(tokenizer.eos_token, add_special_tokens=False)[0] - - line = tokenizer.encode(FLAGS.text) - input_start_ids = np.array([line], np.int32) - input_len = np.array([[len(line)]], np.int32) - inputs = utils.prepare_inputs(input_start_ids, input_len, pad_id, end_id, - FLAGS) - - start_time = datetime.now() - - with utils.create_inference_server_client(FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - if FLAGS.protocol == "http": - async_requests = utils.send_requests_async('tensorrt_llm', - inputs, - client, - FLAGS, - request_parallelism=1) - results = utils.get_http_results(async_requests) - else: - user_data = utils.send_requests_async('tensorrt_llm', - inputs, - client, - FLAGS, - request_parallelism=1) - results = utils.get_grpc_results(user_data, request_parallelism=1) - output_ids = results[0].as_numpy("output_ids") - - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - print(f"[INFO] Latency: {latency} ms") - - output_ids = output_ids.reshape( - (output_ids.size, )).tolist()[input_start_ids.shape[1]:] - output_text = tokenizer.decode(output_ids) - print(f'Input: {FLAGS.text}') - print(f'Output: {output_text}') diff --git a/tools/gpt/end_to_end_test.py b/tools/gpt/end_to_end_test.py deleted file mode 100644 index c2e411bd..00000000 --- a/tools/gpt/end_to_end_test.py +++ /dev/null @@ -1,254 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import argparse - -import numpy as np -from transformers import AutoTokenizer -from utils import utils - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - 
parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') - parser.add_argument('-c', - '--concurrency', - type=int, - default=1, - required=False, - help='Specify concurrency') - parser.add_argument('-beam', - '--beam_width', - type=int, - default=1, - required=False, - help='Specify beam width') - parser.add_argument('-topk', - '--topk', - type=int, - default=1, - required=False, - help='topk for sampling') - parser.add_argument('-topp', - '--topp', - type=float, - default=0.0, - required=False, - help='topp for sampling') - parser.add_argument('-o', - '--output_len', - type=int, - default=10, - required=False, - help='Specify output length') - parser.add_argument('--tokenizer_dir', - type=str, - required=True, - help='Specify tokenizer directory') - - FLAGS = parser.parse_args() - if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print( - "unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) - exit(1) - - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - - tokenizer = AutoTokenizer.from_pretrained(FLAGS.tokenizer_dir, - legacy=False, - padding_side='left') - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - pad_id = tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0] - end_id = tokenizer.encode(tokenizer.eos_token, add_special_tokens=False)[0] - - model_name = 'preprocessing' - with utils.create_inference_server_client(FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - input0 = [["Blackhawks\n The 2015 Hilltoppers"], - ["Data sources you can use to make a decision:"], - ["\n if(angle = 0) { if(angle"], - ["GMs typically get 78% female enrollment, but the "], - ["Previous Chapter | Index | Next Chapter"], - ["Michael, an American Jew, called Jews"], - ["Born in north-east France, Soyer trained as a"], - ["Data sources you can use to make a comparison:"]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.int32) * FLAGS.output_len - bad_words_list = np.array( - [["Hawks, Hawks"], [""], [""], [""], [""], [""], [""], [""]], - dtype=object) - stop_words_list = np.array( - [[""], [""], [""], [""], [""], [""], [""], ["month, month"]], - dtype=object) - inputs = [ - utils.prepare_tensor("QUERY", input0_data, FLAGS.protocol), - utils.prepare_tensor("BAD_WORDS_DICT", bad_words_list, - FLAGS.protocol), - utils.prepare_tensor("STOP_WORDS_DICT", stop_words_list, - FLAGS.protocol), - utils.prepare_tensor("REQUEST_OUTPUT_LEN", output0_len, - FLAGS.protocol), - ] - - try: - result = client.infer(model_name, inputs) - output0 = result.as_numpy("INPUT_ID") - output1 = result.as_numpy("REQUEST_INPUT_LEN") - output2 = result.as_numpy("REQUEST_OUTPUT_LEN") - output3 = result.as_numpy("BAD_WORDS_IDS") - output4 = result.as_numpy("STOP_WORDS_IDS") - except Exception as e: - print(e) - - model_name = "tensorrt_llm" - with utils.create_inference_server_client(FLAGS.protocol, - FLAGS.url, - concurrency=1, - verbose=FLAGS.verbose) as client: - inputs = utils.prepare_inputs(output0, output1, pad_id, end_id, FLAGS) - - try: - result = client.infer(model_name, inputs) - output0 = result.as_numpy("output_ids") - except Exception as e: - print(e) - - model_name = "postprocessing" - with utils.create_inference_server_client(FLAGS.protocol, - 
FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - inputs = [ - utils.prepare_tensor("TOKENS_BATCH", output0, FLAGS.protocol) - ] - inputs[0].set_data_from_numpy(output0) - - try: - result = client.infer(model_name, inputs) - output0 = result.as_numpy("OUTPUT") - print("============After postprocessing============") - batch_size = len(input0) - output0 = output0.reshape([-1, batch_size]).T.tolist() - output0 = [[char.decode('UTF-8') for char in line] - for line in output0] - output0 = [''.join(line) for line in output0] - for line in output0: - print(f"{line}") - print("===========================================\n\n\n") - except Exception as e: - print(e) - - model_name = "ensemble" - with utils.create_inference_server_client(FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - input0 = [["Blackhawks\n The 2015 Hilltoppers"], - ["Data sources you can use to make a decision:"], - ["\n if(angle = 0) { if(angle"], - ["GMs typically get 78% female enrollment, but the "], - ["Previous Chapter | Index | Next Chapter"], - ["Michael, an American Jew, called Jews"], - ["Born in north-east France, Soyer trained as a"], - ["Data sources you can use to make a comparison:"]] - bad_words_list = np.array( - [["Hawks, Hawks"], [""], [""], [""], [""], [""], [""], [""]], - dtype=object) - stop_words_list = np.array( - [[""], [""], [""], [""], [""], [""], [""], ["month, month"]], - dtype=object) - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.int32) * FLAGS.output_len - runtime_top_k = (FLAGS.topk * - np.ones([input0_data.shape[0], 1])).astype(np.int32) - runtime_top_p = FLAGS.topp * np.ones([input0_data.shape[0], 1]).astype( - np.float32) - temperature = 1.0 * np.ones([input0_data.shape[0], 1]).astype( - np.float32) - len_penalty = 1.0 * np.ones([input0_data.shape[0], 1]).astype( - np.float32) - repetition_penalty = 1.0 * np.ones([input0_data.shape[0], 1]).astype( - np.float32) - random_seed = 0 * np.ones([input0_data.shape[0], 1]).astype(np.uint64) - output_log_probs = True * np.ones([input0_data.shape[0], 1 - ]).astype(bool) - beam_width = (FLAGS.beam_width * - np.ones([input0_data.shape[0], 1])).astype(np.int32) - pad_ids = pad_id * \ - np.ones([input0_data.shape[0], 1]).astype(np.int32) - end_ids = end_id * \ - np.ones([input0_data.shape[0], 1]).astype(np.int32) - min_length = 1 * \ - np.ones([input0_data.shape[0], 1]).astype(np.int32) - presence_penalty = 0.0 * \ - np.ones([input0_data.shape[0], 1]).astype(np.float32) - frequency_penalty = 0.0 * \ - np.ones([input0_data.shape[0], 1]).astype(np.float32) - inputs = [ - utils.prepare_tensor("text_input", input0_data, FLAGS.protocol), - utils.prepare_tensor("max_tokens", output0_len, FLAGS.protocol), - utils.prepare_tensor("bad_words", bad_words_list, FLAGS.protocol), - utils.prepare_tensor("stop_words", stop_words_list, - FLAGS.protocol), - utils.prepare_tensor("pad_id", pad_ids, FLAGS.protocol), - utils.prepare_tensor("end_id", end_ids, FLAGS.protocol), - utils.prepare_tensor("beam_width", beam_width, FLAGS.protocol), - utils.prepare_tensor("top_k", runtime_top_k, FLAGS.protocol), - utils.prepare_tensor("top_p", runtime_top_p, FLAGS.protocol), - utils.prepare_tensor("temperature", temperature, FLAGS.protocol), - utils.prepare_tensor("length_penalty", len_penalty, - FLAGS.protocol), - utils.prepare_tensor("repetition_penalty", repetition_penalty, - FLAGS.protocol), - utils.prepare_tensor("min_length", min_length, FLAGS.protocol), - 
utils.prepare_tensor("presence_penalty", presence_penalty, - FLAGS.protocol), - utils.prepare_tensor("frequency_penalty", frequency_penalty, - FLAGS.protocol), - utils.prepare_tensor("random_seed", random_seed, FLAGS.protocol), - utils.prepare_tensor("output_log_probs", output_log_probs, - FLAGS.protocol), - ] - - try: - result = client.infer(model_name, inputs) - ensemble_output0 = result.as_numpy("text_output") - print("============After ensemble============") - batch_size = len(input0) - ensemble_output0 = ensemble_output0.reshape([-1, batch_size - ]).T.tolist() - ensemble_output0 = [[char.decode('UTF-8') for char in line] - for line in ensemble_output0] - ensemble_output0 = [''.join(line) for line in ensemble_output0] - for line in ensemble_output0: - print(f"{line}") - except Exception as e: - print(e) - - assert output0 == ensemble_output0 diff --git a/tools/gpt/gen_input_data.py b/tools/gpt/gen_input_data.py deleted file mode 100644 index 809771a0..00000000 --- a/tools/gpt/gen_input_data.py +++ /dev/null @@ -1,108 +0,0 @@ -import argparse -import json - -import numpy as np - - -def add_sample(sample, name, array): - sample[name] = {'content': array.flatten().tolist(), 'shape': array.shape} - - -def main(args): - data = {'data': []} - input_start_ids = np.random.randint(0, - 50255, - size=(args.start_len), - dtype=np.int32) - input_len = np.array([input_start_ids.shape[0]], np.int32) - output_len = np.ones([1]).astype(np.int32) * args.output_len - runtime_top_k = (args.topk * np.ones([1])).astype(np.int32) - runtime_top_p = args.topp * np.ones([1]).astype(np.float32) - beam_search_diversity_rate = 0.0 * np.ones([1]).astype(np.float32) - temperature = 1.0 * np.ones([1]).astype(np.float32) - len_penalty = 1.0 * np.ones([1]).astype(np.float32) - repetition_penalty = 1.0 * np.ones([1]).astype(np.float32) - random_seed = 0 * np.ones([1]).astype(np.uint64) - # is_return_log_probs = True * np.ones([1]).astype(bool) - beam_width = (args.beam_width * np.ones([1])).astype(np.int32) - # start_ids = 50256 * np.ones([1]).astype(np.int32) - # end_ids = 50256 * np.ones([1]).astype(np.int32) - # bad_words_list = np.concatenate([ - # np.zeros([1, 1]).astype(np.int32), - # (-1 * np.ones([1, 1])).astype(np.int32) - # ], - # axis=1) - # stop_word_list = np.concatenate([ - # np.zeros([1, 1]).astype(np.int32), - # (-1 * np.ones([1, 1])).astype(np.int32) - # ], - # axis=1) - - for _ in range(args.num_samples): - sample = {} - add_sample(sample, 'input_ids', input_start_ids) - add_sample(sample, 'input_lengths', input_len) - add_sample(sample, 'request_output_len', output_len) - add_sample(sample, 'runtime_top_k', runtime_top_k) - add_sample(sample, 'runtime_top_p', runtime_top_p) - add_sample(sample, 'beam_search_diversity_rate', - beam_search_diversity_rate) - add_sample(sample, 'temperature', temperature) - add_sample(sample, 'len_penalty', len_penalty) - add_sample(sample, 'repetition_penalty', repetition_penalty) - add_sample(sample, 'random_seed', random_seed) - add_sample(sample, 'beam_width', beam_width) - # add_sample(sample, 'top_p_decay', top_p_decay) - # add_sample(sample, 'top_p_min', top_p_min) - # add_sample(sample, 'top_p_reset_ids', top_p_reset_ids) - data['data'].append(sample) - - with open('input_data.json', 'w') as f: - json.dump(data, f, indent=4) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-b', - '--batch_size', - type=int, - default=8, - required=False, - help='Specify batch size') - parser.add_argument('-beam', - '--beam_width', - 
type=int, - default=1, - required=False, - help='Specify beam width') - parser.add_argument('-topk', - '--topk', - type=int, - default=1, - required=False, - help='topk for sampling') - parser.add_argument('-topp', - '--topp', - type=float, - default=0.0, - required=False, - help='topp for sampling') - parser.add_argument('-s', - '--start_len', - type=int, - default=8, - required=False, - help='Specify input length') - parser.add_argument('-o', - '--output_len', - type=int, - default=10, - required=False, - help='Specify output length') - parser.add_argument('--num_samples', - type=int, - default=10000, - required=False, - help='Specify number of samples to generate') - args = parser.parse_args() - main(args) diff --git a/tools/inflight_batcher_llm/benchmark_core_model.py b/tools/inflight_batcher_llm/benchmark_core_model.py deleted file mode 100644 index 3aa53372..00000000 --- a/tools/inflight_batcher_llm/benchmark_core_model.py +++ /dev/null @@ -1,487 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) - -import argparse -import json -import sys -import time -from datetime import datetime -from functools import partial - -import numpy as np -from transformers import AutoTokenizer -from utils import utils - - -def callback(user_data, result, error): - user_data._completed_requests.put((result, error)) - if result is None: - # There was an error. - return - try: - # GRPC - req_id = result.get_response().id - except: - # HTTP - req_id = result.get_response()["id"] - start_time = user_data._start_time_dict[req_id] - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - user_data._latencies.append(latency) - user_data._latency_dict[req_id] = latency - user_data._stop_time_dict[req_id] = stop_time - - -def append_pad_id_to_tensors(pad_id, inputs): - if pad_id is not None: - pad_id_data = np.array([[pad_id]], dtype=np.int32) - else: - pad_id_data = np.ones_like([[1]]).astype(np.int32) * 0 - - inputs += [utils.prepare_tensor("pad_id", pad_id_data, FLAGS.protocol)] - - -def append_end_id_to_tensors(end_id, inputs): - if end_id is not None: - end_id_data = np.array([[end_id]], dtype=np.int32) - else: - end_id_data = np.ones_like([[1]]).astype(np.int32) * 1 - - inputs += [utils.prepare_tensor("end_id", end_id_data, FLAGS.protocol)] - - -def test_performance(client, - input_start_ids, - input_lens, - output_lens, - delays, - FLAGS, - pad_id=None, - end_id=None): - model_name = "tensorrt_llm" - - print(f"[INFO] Warm up for benchmarking.") - if FLAGS.decoupled: - client.start_stream(callback=lambda result, error: None, - stream_timeout=FLAGS.stream_timeout) - for i in range(10): - model_name = FLAGS.tensorrt_llm_model_name[i % len( - FLAGS.tensorrt_llm_model_name)] - output0_len = np.ones_like([[1]]).astype(np.int32) * 100 - inputs = [ - utils.prepare_tensor("input_ids", input_start_ids[0], - FLAGS.protocol), - utils.prepare_tensor("input_lengths", input_lens[0], - FLAGS.protocol), - utils.prepare_tensor("request_output_len", output0_len, - FLAGS.protocol), - ] - - append_pad_id_to_tensors(pad_id, inputs) - append_end_id_to_tensors(end_id, inputs) - if FLAGS.decoupled: - client.async_stream_infer(model_name, inputs, request_id=str(i)) - else: - client.infer(model_name, inputs, request_id=str(i)) - if FLAGS.decoupled: - client.stop_stream() - - print(f"[INFO] Start benchmarking on {len(input_start_ids)} prompts.") - latency = 0 - async_requests = [] - 
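# NOTE (illustrative, not part of the removed file): the benchmark loop below
# sleeps for delays[i] before issuing request i. A minimal, hedged sketch of how
# such inter-arrival delays can be drawn for a target request rate, mirroring
# utils.get_list_of_delays / get_exponential_dist_delays from tools/utils/utils.py
# further down in this diff; the function name here is hypothetical.
import numpy as np

def sketch_delays(request_rate, num_reqs, dist="exponential_dist"):
    """Return per-request sleep times in seconds for a given request rate."""
    if request_rate <= 0:          # -1 means offline/SOL: no pacing between requests
        return [0.0] * num_reqs
    mean_gap = 1.0 / request_rate  # mean time between consecutive requests
    if dist == "constant":
        return [mean_gap] * num_reqs
    np.random.seed(420)            # fixed seed for reproducible benchmark runs
    return np.random.exponential(mean_gap, num_reqs).tolist()

# Example: pace ~8 requests/sec with Poisson-like (exponential) gaps.
# delays = sketch_delays(8.0, num_reqs=100)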
start_time = datetime.now() - user_data = utils.UserData() - - if FLAGS.decoupled: - client.start_stream(callback=partial(callback, user_data), - stream_timeout=FLAGS.stream_timeout) - for i, ids in enumerate(input_start_ids): - model_name = FLAGS.tensorrt_llm_model_name[i % len( - FLAGS.tensorrt_llm_model_name)] - output0_len = np.ones_like([[1]]).astype(np.int32) * output_lens[i] - inputs = [ - utils.prepare_tensor("input_ids", ids, FLAGS.protocol), - utils.prepare_tensor("input_lengths", input_lens[i], - FLAGS.protocol), - utils.prepare_tensor("request_output_len", output0_len, - FLAGS.protocol), - ] - - append_pad_id_to_tensors(pad_id, inputs) - append_end_id_to_tensors(end_id, inputs) - - time.sleep(delays[i]) - - user_data._start_time_dict[str(i)] = datetime.now() - if FLAGS.protocol == "http": - async_requests.append( - client.async_infer(model_name, inputs, request_id=str(i))) - elif FLAGS.protocol == "grpc": - if FLAGS.decoupled: - client.async_stream_infer(model_name, - inputs, - request_id=str(i)) - else: - async_requests.append( - client.async_infer(model_name, - inputs, - callback=partial(callback, user_data), - request_id=str(i))) - if FLAGS.decoupled: - client.stop_stream() - try: - if FLAGS.protocol == "http": - utils.get_http_results(async_requests) - elif FLAGS.protocol == "grpc": - responses = utils.get_grpc_results(user_data, len(input_start_ids)) - else: - raise RuntimeError("Invalid protocol") - - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - print(f"[INFO] Total Latency: {latency} ms") - - # TODO(kaiyu): support `extract_print_stats` for http - data_dict = None - if FLAGS.protocol == "grpc": - request_latencies = 0.0 - for latency in user_data._latencies: - request_latencies += latency - print(f"[INFO] Total request latencies: {request_latencies} ms") - - ip_token_len_list = [] - for ip in input_lens: - ip_token_len_list.append( - ip[0][0]) #for some reason, two level nesting - - data_dict = utils.extract_print_stats(ip_token_len_list, responses, - user_data, FLAGS) - - if FLAGS.check_perf_json: - check_performance(data_dict, FLAGS) - - except Exception as e: - print("Failed receiving responses: " + str(e)) - sys.exit(1) - - -def check_performance(data_dict, FLAGS): - if not data_dict: - print( - "[ERROR] --check-perf-json was used, but no data was collected. Please use grpc protocol." - ) - ref = json.load(open(FLAGS.check_perf_json, "r")) - if FLAGS.check_perf_key not in ref or len(ref[FLAGS.check_perf_key]) == 0: - print( - f"[ERROR] There are no reference numbers for {FLAGS.check_perf_key}, so the performance is not checked. Please add an entry to {FLAGS.check_perf_json}." 
- ) - sys.exit(1) - for metric in ref[FLAGS.check_perf_key]: - if metric not in data_dict: - print(f"[ERROR] Data for '{metric}' was not found.") - np.testing.assert_allclose( - data_dict[metric], - ref[FLAGS.check_perf_key][metric], - rtol=FLAGS.check_perf_rtol, - atol=FLAGS.check_perf_atol, - err_msg= - f"'{metric}' check failed - did not match reference in '{FLAGS.check_perf_json}' for '{FLAGS.check_perf_key}'" - ) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - subparsers = parser.add_subparsers(dest='workload') - - parser_dataset = subparsers.add_parser('dataset') - parser_dataset.add_argument('--dataset', - type=str, - required=True, - help='Dataset path used for the test.') - parser_dataset.add_argument('--tokenizer-dir', - type=str, - required=True, - help='Specify tokenizer directory') - parser_dataset.add_argument('--tokenizer-type', - type=str, - default='auto', - required=False, - choices=['auto', 't5', 'llama'], - help='Specify tokenizer type') - parser_dataset.add_argument( - '--op-tokens-per-word', - type=float, - default=1.3, - required=False, - help= - 'Specify op tokens/word ratio. Useful to have model generate exactly as many tokens as needed by the dataset' - ) - - parser_token_norm_dist = subparsers.add_parser('token-norm-dist') - parser_token_norm_dist.add_argument( - '--input-mean', - type=int, - required=True, - help='normal dist mean for input tokens') - parser_token_norm_dist.add_argument( - '--input-stdev', - type=int, - required=True, - help='normal dist stdev for input tokens') - parser_token_norm_dist.add_argument( - '--output-mean', - type=int, - required=True, - help='normal dist mean for output tokens') - parser_token_norm_dist.add_argument( - '--output-stdev', - type=int, - required=True, - help='normal dist stdev for output tokens') - - parser_token_from_hist = subparsers.add_parser('token-from-histogram') - parser_token_from_hist.add_argument( - '--histogram-key', - type=str, - required=True, - help='key to retrieve histogram buckets,freqs defined in utils') - - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='http', - choices=['http', 'grpc'], - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') - parser.add_argument( - '--decoupled', - action="http://github.com/store_true", - required=False, - default=False, - help= - 'Uses async_stream_infer which allows decoupled backends (must use grpc protocol)' - ), - parser.add_argument( - "-t", - "--stream-timeout", - type=float, - required=False, - default=None, - help="Stream timeout in seconds. Default is None.", - ) - parser.add_argument( - "--tensorrt-llm-model-name", - type=str, - required=False, - default=["tensorrt_llm"], - action="http://github.com/append", - help= - "Specify the name of the TensorRT-LLM model. Can be specified multiple times to use multiple models." - ) - parser.add_argument('-c', - '--concurrency', - type=int, - default=128, - required=False, - help='Specify concurrency') - parser.add_argument('--max-input-len', - type=int, - required=True, - help='Specify max input length') - parser.add_argument('--request-rate', - type=float, - required=False, - help="# of reqs/sec. 
-1 indicates SOL/Offline", - default=-1.0) - parser.add_argument('--time-delay-dist', - type=str, - required=False, - choices=["constant", "exponential_dist"], - default="exponential_dist", - help="# of reqs/sec. -1 indicates SOL/Offline") - parser.add_argument( - '--dump-perfetto-trace', - action="http://github.com/store_true", - required=False, - default=False, - help= - 'Dumps trace of requests in a json (perfetto.json) to be visualized in perfetto' - ), - parser.add_argument('--op-stats-csv', - type=str, - default=None, - help='csv filename to dump stats'), - parser.add_argument( - "--exclude-input-in-output", - action="http://github.com/store_true", - required=False, - default=False, - help="Expect that output IDs do not contain input IDs", - ) - parser.add_argument( - '--num-requests', - type=int, - required=False, - default=30000, - help= - 'For dataset, requests = min(dataset, num_requests). number of requests to be generated by the client' - ) - parser.add_argument( - '--check-perf-json', - type=str, - required=False, - help= - 'If set, this will compare the latency to the value in this file under the key from --check-perf-key' - ) - parser.add_argument( - '--check-perf-key', - type=str, - required=False, - help= - 'Used with --check-perf-json to specify which entry in the file to compare with' - ) - parser.add_argument('--check-perf-atol', - type=float, - required=False, - help="Absolute tolerance for performance check", - default=50) - parser.add_argument('--check-perf-rtol', - type=float, - required=False, - help="Relative tolerance for performance check", - default=0.05) - - FLAGS = parser.parse_args() - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - if FLAGS.decoupled and FLAGS.protocol != 'grpc': - print("Protocol must be set to 'grpc' when using '--decoupled'.") - sys.exit(1) - - try: - client = utils.create_inference_server_client( - FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) - except Exception as e: - print("channel creation failed: " + str(e)) - sys.exit(1) - - if FLAGS.request_rate == -1: - mean_time_bet_reqs = 0 - else: - mean_time_bet_reqs = 1.0 / FLAGS.request_rate - - input_start_ids = [] - input_lens = [] - output_lens = [] - ratio = [] - - print(FLAGS.workload) - if FLAGS.workload == "dataset": - tokenizer = AutoTokenizer.from_pretrained(FLAGS.tokenizer_dir, - legacy=False, - padding_side='left') - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - pad_id = tokenizer.encode(tokenizer.pad_token, - add_special_tokens=False)[0] - end_id = tokenizer.encode(tokenizer.eos_token, - add_special_tokens=False)[0] - - prompt_cnt = 0 - - with open(FLAGS.dataset, 'r') as f: - data_dict = json.load(f) - for req in data_dict: - prompt = req['input'] + ' ' + req['instruction'] - output = req['output'] - line = tokenizer.encode(prompt) - if len(line) > FLAGS.max_input_len: - continue - - prompt_cnt += 1 - if prompt_cnt > FLAGS.num_requests: - break - - input_start_ids.append(np.array([line], np.int32)) - input_lens.append(np.array([[len(line)]], np.int32)) - output_lens.append( - int(len(output.split(' ')) * FLAGS.op_tokens_per_word)) - prompt_tokens = len(line) - prompt_words = len(prompt.split()) - ratio.append(prompt_tokens / prompt_words) - - print("Tokenizer: Tokens per word = ", round(np.mean(ratio), 3)) - num_reqs = len(input_lens) - delays = utils.get_list_of_delays(FLAGS.time_delay_dist, - mean_time_bet_reqs, num_reqs) - 
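# NOTE (illustrative, not part of the removed file): a hedged sketch of the
# dataset preprocessing performed above. Prompts whose estimated token count
# exceeds --max-input-len are skipped, and the requested output length is
# estimated from the reference output with the ~1.3 tokens-per-word heuristic
# used throughout these tools. A real run uses the HF tokenizer for the input
# length check; plain word splitting stands in here. Field names follow the
# dataset layout assumed by this script ("input", "instruction", "output").
import json

def sketch_load_dataset(path, max_input_len, tokens_per_word=1.3):
    prompts, output_lens = [], []
    with open(path) as f:
        for req in json.load(f):
            prompt = req["input"] + " " + req["instruction"]
            # Rough word -> token conversion; skip prompts that would overflow the engine.
            if int(len(prompt.split()) * tokens_per_word) > max_input_len:
                continue
            prompts.append(prompt)
            output_lens.append(int(len(req["output"].split()) * tokens_per_word))
    return prompts, output_lens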
test_performance(client, input_start_ids, input_lens, output_lens, - delays, FLAGS, pad_id, end_id) - - elif FLAGS.workload == "token-norm-dist": - input_lens = utils.get_norm_dist_tokens(FLAGS.input_mean, - FLAGS.input_stdev, - FLAGS.num_requests) - pruned_ip_list = [ - ip_len for ip_len in input_lens if ip_len <= FLAGS.max_input_len - ] - num_reqs = len(pruned_ip_list) - ip_lens_2d_array = [ - np.array([[ip_len]], np.int32) for ip_len in pruned_ip_list - ] - output_lens = utils.get_norm_dist_tokens(FLAGS.output_mean, - FLAGS.output_stdev, num_reqs) - delays = utils.get_list_of_delays(FLAGS.time_delay_dist, - mean_time_bet_reqs, num_reqs) - - input_start_ids = utils.gen_random_start_ids(pruned_ip_list) - test_performance(client, input_start_ids, ip_lens_2d_array, - output_lens, delays, FLAGS) - - elif FLAGS.workload == "token-from-histogram": - input_lens_orig = utils.get_token_list_from_histogram( - FLAGS.histogram_key + "_ip") - output_lens_orig = utils.get_token_list_from_histogram( - FLAGS.histogram_key + "_op") - - final_lens = min(len(input_lens_orig), len(output_lens_orig)) - input_lens = input_lens_orig[:final_lens] - output_lens = output_lens_orig[:final_lens] - - num_reqs = len(input_lens) - ip_lens_2d_array = [ - np.array([[ip_len]], np.int32) for ip_len in input_lens - ] - output_lens = utils.get_token_list_from_histogram(FLAGS.histogram_key + - "_op") - print(len(input_lens), len(output_lens)) - assert (len(input_lens) == len(output_lens)) - - delays = utils.get_list_of_delays(FLAGS.time_delay_dist, - mean_time_bet_reqs, num_reqs) - - input_start_ids = utils.gen_random_start_ids(input_lens) - test_performance(client, input_start_ids, ip_lens_2d_array, - output_lens, delays, FLAGS) diff --git a/tools/inflight_batcher_llm/end_to_end_test.py b/tools/inflight_batcher_llm/end_to_end_test.py deleted file mode 100644 index 9361de46..00000000 --- a/tools/inflight_batcher_llm/end_to_end_test.py +++ /dev/null @@ -1,453 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) - -import argparse -import json -import sys -from datetime import datetime -from functools import partial - -import numpy as np -from utils import utils - - -def callback(user_data, start_time, result, error): - user_data._completed_requests.put((result, error)) - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - user_data._latencies.append(latency) - - -def test_functionality(client, - prompts, - output_lens, - vocabSizePadded=50257, - return_context_logits=False, - return_generation_logits=False, - test_bls=False): - print(f"[INFO] Start testing on {len(prompts)} prompts.") - for i, prompt in enumerate(prompts): - - # 1. 
Ensemble models manually: preprocessing -> tensorrt_llm -> postprocessing - model_name = 'preprocessing' - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.int32) * output_lens[i] - bad_words_list = np.array([[""]], dtype=object) - stop_words_list = np.array([[""]], dtype=object) - - inputs = [ - utils.prepare_tensor("QUERY", input0_data, FLAGS.protocol), - utils.prepare_tensor("BAD_WORDS_DICT", bad_words_list, - FLAGS.protocol), - utils.prepare_tensor("STOP_WORDS_DICT", stop_words_list, - FLAGS.protocol), - utils.prepare_tensor("REQUEST_OUTPUT_LEN", output0_len, - FLAGS.protocol), - ] - result = client.infer(model_name, inputs, request_id=str(i)) - output0 = result.as_numpy("INPUT_ID") - output1 = result.as_numpy("REQUEST_INPUT_LEN") - output2 = result.as_numpy("REQUEST_OUTPUT_LEN") - decoder_input_id = result.as_numpy("DECODER_INPUT_ID") - output_end_id = result.as_numpy("OUT_END_ID") - output_pad_id = result.as_numpy("OUT_PAD_ID") - inputIds = output0 # Use to check context logits shape - - model_name = "tensorrt_llm" - inputs = [ - utils.prepare_tensor("input_ids", output0, FLAGS.protocol), - utils.prepare_tensor("decoder_input_ids", decoder_input_id, - FLAGS.protocol), - utils.prepare_tensor("input_lengths", output1, FLAGS.protocol), - utils.prepare_tensor("request_output_len", output2, - FLAGS.protocol), - utils.prepare_tensor("end_id", output_end_id, FLAGS.protocol), - utils.prepare_tensor("pad_id", output_pad_id, FLAGS.protocol), - ] - if return_context_logits: - return_context_logits_flag = np.array([[True]], dtype=bool) - inputs += [ - utils.prepare_tensor("return_context_logits", - return_context_logits_flag, - FLAGS.protocol), - ] - if return_generation_logits: - return_generation_logits_flag = np.array([[True]], dtype=bool) - inputs += [ - utils.prepare_tensor("return_generation_logits", - return_generation_logits_flag, - FLAGS.protocol), - ] - - result = client.infer(model_name, inputs, request_id=str(i)) - output0 = result.as_numpy("output_ids").astype(np.int32) - seq_lengths = result.as_numpy("sequence_length") - cum_log_probs = result.as_numpy("cum_log_probs").astype(np.float32) - output_log_probs = result.as_numpy("output_log_probs").astype( - np.float32) - context_logits = result.as_numpy("context_logits").astype(np.float32) - generation_logits = result.as_numpy("generation_logits").astype( - np.float32) - - print(f"context_logits.shape: {context_logits.shape}") - print(f"generation_logits.shape: {generation_logits.shape}") - - model_name = "postprocessing" - inputs = [ - utils.prepare_tensor("TOKENS_BATCH", output0, FLAGS.protocol), - utils.prepare_tensor("SEQUENCE_LENGTH", seq_lengths, - FLAGS.protocol), - utils.prepare_tensor("CUM_LOG_PROBS", cum_log_probs, - FLAGS.protocol), - utils.prepare_tensor("OUTPUT_LOG_PROBS", output_log_probs, - FLAGS.protocol), - utils.prepare_tensor("CONTEXT_LOGITS", context_logits, - FLAGS.protocol), - utils.prepare_tensor("GENERATION_LOGITS", generation_logits, - FLAGS.protocol) - ] - inputs[0].set_data_from_numpy(output0) - inputs[1].set_data_from_numpy(seq_lengths) - inputs[2].set_data_from_numpy(cum_log_probs) - inputs[3].set_data_from_numpy(output_log_probs) - inputs[4].set_data_from_numpy(context_logits) - inputs[5].set_data_from_numpy(generation_logits) - - result = client.infer(model_name, inputs, request_id=str(i)) - output0 = result.as_numpy("OUTPUT") - post_gen_logits = result.as_numpy("OUT_GENERATION_LOGITS") - assert (generation_logits == post_gen_logits).all() 
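# NOTE (illustrative, not part of the removed file): the checks that follow
# compare this manually chained preprocessing -> tensorrt_llm -> postprocessing
# run against the "ensemble" and "tensorrt_llm_bls" models, including the logits
# shapes. A hedged sketch of the shapes asserted below; when logits are not
# requested the backend returns 0-filled dummy tensors. The function name and
# the vocab_size_padded default (50257, GPT) are taken from this test's setup.
def sketch_expected_logit_shapes(return_context, return_generation,
                                 prompt_len, output_len,
                                 beam_width=1, vocab_size_padded=50257):
    context_shape = ((1, prompt_len, vocab_size_padded)
                     if return_context else (1, 1, 1))
    generation_shape = ((1, beam_width, output_len, vocab_size_padded)
                        if return_generation else (1, 1, 1, 1))
    return context_shape, generation_shape

# e.g. sketch_expected_logit_shapes(True, False, prompt_len=12, output_len=20)
# -> ((1, 12, 50257), (1, 1, 1, 1))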
- - # 2. Use ensemble model - model_name = "ensemble" - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.int32) * output_lens[i] - bad_words_list = np.array([[""]], dtype=object) - stop_words_list = np.array([[""]], dtype=object) - - inputs = [ - utils.prepare_tensor("text_input", input0_data, FLAGS.protocol), - utils.prepare_tensor("max_tokens", output0_len, FLAGS.protocol), - utils.prepare_tensor("bad_words", bad_words_list, FLAGS.protocol), - utils.prepare_tensor("stop_words", stop_words_list, - FLAGS.protocol), - ] - if return_context_logits: - return_context_logits_flag = np.array([[True]], dtype=bool) - inputs += [ - utils.prepare_tensor("return_context_logits", - return_context_logits_flag, - FLAGS.protocol), - ] - if return_generation_logits: - return_generation_logits_flag = np.array([[True]], dtype=bool) - inputs += [ - utils.prepare_tensor("return_generation_logits", - return_generation_logits_flag, - FLAGS.protocol), - ] - - result = client.infer(model_name, inputs, request_id=str(i)) - - # 3. Check the results between manually ensembled models and the ensemble model - ensemble_output = result.as_numpy('text_output') - ensemble_cum_log_probs = result.as_numpy('cum_log_probs') - ensemble_output_log_probs = result.as_numpy('output_log_probs') - ensemble_context_logits = result.as_numpy('context_logits') - ensemble_generation_logits = result.as_numpy('generation_logits') - - assert output0 == ensemble_output - assert cum_log_probs == ensemble_cum_log_probs - assert (output_log_probs == ensemble_output_log_probs).all() - assert (context_logits == ensemble_context_logits).all() - assert (generation_logits == ensemble_generation_logits).all() - - ensemble_context_logits_shape = ensemble_context_logits.shape - assert (len(ensemble_context_logits_shape) == 3) - if return_context_logits: - # Expect shape [1, prompt_length, vocabSizePadded] - assert (ensemble_context_logits_shape[0] == 1) # One request - assert (ensemble_context_logits_shape[1] == inputIds.size - ) # Prompt length - assert (ensemble_context_logits_shape[2] == vocabSizePadded - ) # VocabSizePadded - else: - # Expect shape [1, 1, 1] - assert (ensemble_context_logits_shape[0] == 1) - assert (ensemble_context_logits_shape[1] == 1) - assert (ensemble_context_logits_shape[2] == 1) - assert (ensemble_context_logits[0][0][0] == 0 - ) # Dummy tensor's value is 0 - - ensemble_generation_logits_shape = ensemble_generation_logits.shape - assert (len(ensemble_generation_logits_shape) == 4) - - if return_generation_logits: - # Expect shape [1, beam_width, output_length, vocabSizePadded] - assert (ensemble_generation_logits_shape[0] == 1) # One request - assert (ensemble_generation_logits_shape[1] == 1 - ) # Beam width (default) - assert (ensemble_generation_logits_shape[2] == output_lens[i] - ) # Output length - assert (ensemble_generation_logits_shape[3] == vocabSizePadded - ) # VocabSizePadded - else: - assert (ensemble_generation_logits_shape[0] == 1) - assert (ensemble_generation_logits_shape[1] == 1) - assert (ensemble_generation_logits_shape[2] == 1) - assert (ensemble_generation_logits_shape[3] == 1) - assert (ensemble_generation_logits[0][0][0][0] == 0 - ) # Dummy tensor's value is 0 - - if test_bls: - # 4. 
Use bls - model_name = "tensorrt_llm_bls" - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype( - np.int32) * output_lens[i] - bad_words_list = np.array([[""]], dtype=object) - stop_words_list = np.array([[""]], dtype=object) - - inputs = [ - utils.prepare_tensor("text_input", input0_data, - FLAGS.protocol), - utils.prepare_tensor("max_tokens", output0_len, - FLAGS.protocol), - utils.prepare_tensor("bad_words", bad_words_list, - FLAGS.protocol), - utils.prepare_tensor("stop_words", stop_words_list, - FLAGS.protocol), - ] - if return_context_logits: - return_context_logits_flag = np.array([[True]], dtype=bool) - inputs += [ - utils.prepare_tensor("return_context_logits", - return_context_logits_flag, - FLAGS.protocol), - ] - if return_generation_logits: - return_generation_logits_flag = np.array([[True]], dtype=bool) - inputs += [ - utils.prepare_tensor("return_generation_logits", - return_generation_logits_flag, - FLAGS.protocol), - ] - - result = client.infer(model_name, inputs, request_id=str(i)) - - # 5. Check the results between manually ensembled models and the bls model - bls_output = result.as_numpy('text_output') - bls_cum_log_probs = result.as_numpy('cum_log_probs') - bls_output_log_probs = result.as_numpy('output_log_probs') - bls_context_logits = result.as_numpy('context_logits') - bls_generation_logits = result.as_numpy('generation_logits') - continue - - assert output0 == bls_output - assert cum_log_probs == bls_cum_log_probs - assert (output_log_probs == bls_output_log_probs).all() - assert (context_logits == bls_context_logits).all() - assert (generation_logits == bls_generation_logits).all() - - bls_context_logits_shape = bls_context_logits.shape - assert (len(bls_context_logits_shape) == 3) - if return_context_logits: - # Expect shape [1, prompt_length, vocabSizePadded] - assert (bls_context_logits_shape[0] == 1) # One request - assert (bls_context_logits_shape[1] == inputIds.size - ) # Prompt length - assert (bls_context_logits_shape[2] == vocabSizePadded - ) # VocabSizePadded - else: - # Expect shape [1, 1, 1] - assert (bls_context_logits_shape[0] == 1) - assert (bls_context_logits_shape[1] == 1) - assert (bls_context_logits_shape[2] == 1) - assert (bls_context_logits[0][0][0] == 0 - ) # Dummy tensor's value is 0 - - bls_generation_logits_shape = bls_generation_logits.shape - assert (len(bls_generation_logits_shape) == 4) - - if return_generation_logits: - # Expect shape [1, beam_width, output_length, vocabSizePadded] - assert (bls_generation_logits_shape[0] == 1) # One request - assert (bls_generation_logits_shape[1] == 1 - ) # Beam width (default) - assert (bls_generation_logits_shape[2] == output_lens[i] - ) # Output length - assert (bls_generation_logits_shape[3] == vocabSizePadded - ) # VocabSizePadded - else: - assert (bls_generation_logits_shape[0] == 1) - assert (bls_generation_logits_shape[1] == 1) - assert (bls_generation_logits_shape[2] == 1) - assert (bls_generation_logits_shape[3] == 1) - assert (bls_generation_logits[0][0][0][0] == 0 - ) # Dummy tensor's value is 0 - - if FLAGS.verbose: - print('Response: {}'.format(result.get_response())) - print('Output: {}'.format(ensemble_output)) - print(f"[INFO] Functionality test succeed.") - - -def test_performance(client, prompts, output_lens): - model_name = "ensemble" - - print(f"[INFO] Warm up for benchmarking.") - for i in range(min(10, len(prompts))): - input0 = [[prompts[0]]] - input0_data = np.array(input0).astype(object) - output0_len = 
np.ones_like(input0).astype(np.int32) * output_lens[i] - bad_words_list = np.array([[""]], dtype=object) - stop_words_list = np.array([[""]], dtype=object) - - inputs = [ - utils.prepare_tensor("text_input", input0_data, FLAGS.protocol), - utils.prepare_tensor("max_tokens", output0_len, FLAGS.protocol), - utils.prepare_tensor("bad_words", bad_words_list, FLAGS.protocol), - utils.prepare_tensor("stop_words", stop_words_list, - FLAGS.protocol), - ] - - client.infer(model_name, inputs, request_id=str(i)) - - print(f"[INFO] Start benchmarking on {len(prompts)} prompts.") - latency = 0 - async_requests = [] - start_time = datetime.now() - user_data = utils.UserData() - for i, prompt in enumerate(prompts): - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.int32) * output_lens[i] - bad_words_list = np.array([[""]], dtype=object) - stop_words_list = np.array([[""]], dtype=object) - - inputs = [ - utils.prepare_tensor("text_input", input0_data, FLAGS.protocol), - utils.prepare_tensor("max_tokens", output0_len, FLAGS.protocol), - utils.prepare_tensor("bad_words", bad_words_list, FLAGS.protocol), - utils.prepare_tensor("stop_words", stop_words_list, - FLAGS.protocol), - ] - - if FLAGS.protocol == "http": - async_requests.append( - client.async_infer(model_name, inputs, request_id=str(i))) - elif FLAGS.protocol == "grpc": - async_requests.append( - client.async_infer(model_name, - inputs, - callback=partial(callback, user_data, - datetime.now()), - request_id=str(i))) - - if FLAGS.protocol == "http": - utils.get_http_results(async_requests) - elif FLAGS.protocol == "grpc": - utils.get_grpc_results(user_data, len(prompts)) - else: - raise RuntimeError("Invalid protocol") - - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - print(f"[INFO] Total Latency: {latency} ms") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='http', - choices=['http', 'grpc'], - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. 
Default is "http".') - parser.add_argument('-c', - '--concurrency', - type=int, - default=128, - required=False, - help='Specify concurrency') - parser.add_argument('--max-input-len', - type=int, - required=True, - help='Specify max input length') - - parser.add_argument('--dataset', - type=str, - required=True, - help='Dataset path used for the test.') - - parser.add_argument('--return-context-logits', - action="http://github.com/store_true", - default=False, - help='Return context logits.') - - parser.add_argument('--return-generation-logits', - action="http://github.com/store_true", - default=False, - help='Return generation logits.') - - parser.add_argument('--test-bls', - action="http://github.com/store_true", - default=False, - help="test BLS model") - - FLAGS = parser.parse_args() - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - - try: - client = utils.create_inference_server_client( - FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) - except Exception as e: - print("Encountered error: " + str(e)) - sys.exit(1) - - prompts = [] - output_lens = [] - with open(FLAGS.dataset, 'r') as f: - data_dict = json.load(f) - for req in data_dict: - prompt = req['input'] + ' ' + req['instruction'] - output = req['output'] - # 1.3 is a magic number that converts number of words to number of tokens - if int(len(prompt.split(' ')) / 1.3) > FLAGS.max_input_len: - continue - prompts.append(prompt) - # 1.3 is a magic number that converts number of words to number of tokens - output_lens.append(int(len(output.split(' ')) * 1.3)) - - vocabSizePadded = 50257 # gpt - test_functionality(client, prompts, output_lens, vocabSizePadded, - FLAGS.return_context_logits, - FLAGS.return_generation_logits, FLAGS.test_bls) - test_performance(client, prompts, output_lens) diff --git a/tools/inflight_batcher_llm/speculative_decoding_test.py b/tools/inflight_batcher_llm/speculative_decoding_test.py deleted file mode 100644 index 22a99491..00000000 --- a/tools/inflight_batcher_llm/speculative_decoding_test.py +++ /dev/null @@ -1,321 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -utils_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -root_path = os.path.dirname(utils_path) -sys.path.append(utils_path) -sys.path.append(os.path.join(root_path, "inflight_batcher_llm")) - -import argparse -import json -import sys - -import numpy as np -import tritonclient.grpc as grpcclient -from client import e2e_grpc_speculative_decoding_client, end_to_end_grpc_client - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - - parser.add_argument('--url-target', - type=str, - required=True, - help='Inference server URL for the target model') - - parser.add_argument('--url-draft', - type=str, - required=True, - help='Inference server URL for the draft model') - - parser.add_argument('--max-input-len', - type=int, - required=True, - help='Max input length for input prompts') - - parser.add_argument( - '--preprocessor-model-name', - type=str, - required=False, - default="preprocessing", - help='Name of the preprocessor model (should be hosted at url-draft)') - - parser.add_argument( - '--postprocessor-model-name', - type=str, - required=False, - default="postprocessing", - help='Name of the postprocessor model (should be hosted 
at url-target)' - ) - - parser.add_argument( - '--draft-tensorrt-llm-model-name', - type=str, - required=False, - default="tensorrt_llm", - help='Name of the tensorrt_llm draft model (hosted at url-draft)') - - parser.add_argument( - '--target-tensorrt-llm-model-name', - type=str, - required=False, - default="tensorrt_llm", - help='Name of the tensorrt_llm target model (hosted at url-target)') - - parser.add_argument( - '--bls-speculative-tensorrt-llm-model-name', - type=str, - required=False, - default="tensorrt_llm_bls", - help= - 'Name of the tensorrt_llm bls model (only supports the case of url-target == url-draft)' - ) - - parser.add_argument( - '--execute-bls-speculative-decoding', - action='http://github.com/store_true', - help='Executes the BLS speculative decoding model if set') - - parser.add_argument( - "-b", - "--beam-width", - required=False, - type=int, - default=1, - help="Beam width value", - ) - - parser.add_argument( - "--temperature", - type=float, - required=False, - default=1.0, - help="temperature value", - ) - - parser.add_argument( - "--repetition-penalty", - type=float, - required=False, - default=None, - help="The repetition penalty value", - ) - - parser.add_argument( - "--presence-penalty", - type=float, - required=False, - default=None, - help="The presence penalty value", - ) - - parser.add_argument( - "--frequency-penalty", - type=float, - required=False, - default=None, - help="The frequency penalty value", - ) - - parser.add_argument('-o', - '--output-len', - type=int, - default=100, - required=False, - help='Specify output length') - - parser.add_argument( - '--num-draft-tokens', - type=int, - default=5, - required=False, - help= - 'Specify the number of speculative tokens for the draft model to generate per lookahead.' 
- ) - parser.add_argument( - '--use-draft-logits', - default=False, - required=False, - action='http://github.com/store_true', - help='Use logits from draft model when performing speculative decoding' - ) - parser.add_argument('--return-context-logits', - default=False, - required=False, - action='http://github.com/store_true', - help='Return context logits') - parser.add_argument('--return-generation-logits', - default=False, - required=False, - action='http://github.com/store_true', - help='Return generation logits') - - parser.add_argument('--end-id', - type=int, - default=None, - required=False, - help='The end if token') - - parser.add_argument('--pad-id', - type=int, - default=None, - required=False, - help='The pad if token') - - parser.add_argument('--stop-words', - nargs='+', - default=[], - help='The stop words') - - parser.add_argument('--bad-words', - nargs='+', - default=[], - help='The bad words') - - parser.add_argument('--dataset', - type=str, - required=True, - help='Dataset path used for the test.') - - parser.add_argument('--disable-output-comparison', - action='http://github.com/store_true', - required=False, - help='disable output check') - - parser.add_argument( - "--return-draft-model-draft-logits", - action="http://github.com/store_true", - required=False, - default=False, - help= - "Return draft model's draft tokens' logits, require to enable `gather_generation_logits` when build engine" - ) - - parser.add_argument( - "--return-target-model-accepted-token-logits", - action="http://github.com/store_true", - required=False, - default=False, - help= - "Return target model's accepted token logits, require to enable `gather_generation_logits` when build engine", - ) - - FLAGS = parser.parse_args() - if not FLAGS.url_target: - FLAGS.url_target = "localhost:8001" - - if not FLAGS.url_draft: - FLAGS.url_draft = FLAGS.url_target - - try: - client_target = grpcclient.InferenceServerClient(url=FLAGS.url_target) - client_draft = grpcclient.InferenceServerClient( - url=FLAGS.url_draft) if ( - FLAGS.url_target != FLAGS.url_draft) else client_target - except Exception as e: - print("client creation failed: " + str(e)) - sys.exit(1) - - if (FLAGS.beam_width > 1): - raise Exception( - 'Beam width > 1 is not yet supported with speculative decoding') - - request_id = 1 - total_count = 0 - failed_count = 0 - with open(FLAGS.dataset, 'r') as f: - data_dict = json.load(f) - for req in data_dict: - prompt = req['input'] + ' ' + req['instruction'] - output = req['output'] - # 1.3 is a magic number that converts number of words to number of tokens - if int(len(prompt.split(' ')) * 1.3) > FLAGS.max_input_len: - continue - # 1.3 is a magic number that converts number of words to number of tokens - output_len = int(len(output.split(' ')) * 1.3) - if FLAGS.verbose: - print(f"Prompt: {prompt}") - print(f"Output len: {output_len}") - - # Calling target model only - if FLAGS.verbose: - print(f"Calling target model", flush=True) - output_target = end_to_end_grpc_client.run_inference( - client_target, prompt, output_len, str(request_id), - FLAGS.repetition_penalty, FLAGS.presence_penalty, - FLAGS.frequency_penalty, FLAGS.temperature, FLAGS.stop_words, - FLAGS.bad_words, [], [], "ensemble", False, 1, False, None, - None, FLAGS.end_id, FLAGS.pad_id, FLAGS.verbose) - if FLAGS.verbose: - print(f"output_target: {output_target}", flush=True) - print(f"flags: {FLAGS}") - print(f"prompt: {prompt}") - print(f"output_len: {output_len}") - - # Calling BLS 
speculative decoding - if FLAGS.execute_bls_speculative_decoding: - if FLAGS.verbose: - print(f"Calling BLS speculative decoding model", - flush=True) - output_speculative = end_to_end_grpc_client.run_inference( - client_target, prompt, output_len, str(request_id), - FLAGS.repetition_penalty, FLAGS.presence_penalty, - FLAGS.frequency_penalty, FLAGS.temperature, - FLAGS.stop_words, FLAGS.bad_words, [], [], - "tensorrt_llm_bls", False, 1, False, None, - np.array([[FLAGS.return_generation_logits]], dtype=bool), - FLAGS.end_id, FLAGS.pad_id, FLAGS.verbose, - FLAGS.num_draft_tokens, FLAGS.use_draft_logits) - if FLAGS.verbose: - print(f"output_bls_speculative: {output_speculative}", - flush=True) - else: - # Calling client-side coordination of speculative decoding - if FLAGS.verbose: - print(f"Calling speculative client", flush=True) - output_speculative = e2e_grpc_speculative_decoding_client.run_speculative_inference( - client_draft, - client_target, prompt, output_len, FLAGS.num_draft_tokens, - str(request_id), FLAGS.repetition_penalty, - FLAGS.presence_penalty, FLAGS.frequency_penalty, - FLAGS.temperature, FLAGS.stop_words, FLAGS.bad_words, - FLAGS.end_id, FLAGS.pad_id, FLAGS.beam_width, - FLAGS.preprocessor_model_name, - FLAGS.draft_tensorrt_llm_model_name, - FLAGS.target_tensorrt_llm_model_name, - FLAGS.postprocessor_model_name, - FLAGS.return_draft_model_draft_logits, - FLAGS.return_target_model_accepted_token_logits, - FLAGS.verbose) - if FLAGS.verbose: - print(f"output_speculative: {output_speculative}", - flush=True) - - total_count = total_count + 1 - if not FLAGS.disable_output_comparison: - if (output_target != output_speculative): - failed_count = failed_count + 1 - print(f"{total_count}: Outputs don't match") - print(f"Prompt:") - print(f"{prompt}") - print(f"Output target:") - print(f"{output_target}") - print(f"Output speculative:") - print(f"{output_speculative}") - else: - print(f"{total_count}: Outputs match") - else: - print("Not checking output") - if output_speculative == "": - failed_count += 1 - request_id = request_id + 1 - - print(f"failed/total: {failed_count}/{total_count}") - sys.exit(failed_count > 0) diff --git a/tools/utils.sh b/tools/utils.sh deleted file mode 100644 index 27ef4cd1..00000000 --- a/tools/utils.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -# Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on -# success, 1 on failure - -function wait_for_server_ready() { - - local spid="$1"; - local wait_time_secs="${2:-30}"; - local triton_http_port="${3:-8000}" - WAIT_RET=0 - - local wait_secs=$wait_time_secs - until test $wait_secs -eq 0 ; do - if ! kill -0 $spid; then - echo "=== Server not running." - WAIT_RET=1 - return - fi - - sleep 1; - - set +e - code=`curl -s -w %{http_code} localhost:${triton_http_port}/v2/health/ready` - set -e - if [ "$code" == "200" ]; then - return - fi - - ((wait_secs--)); - done - - echo "=== Timeout $wait_time_secs secs. Server not ready." 
- WAIT_RET=1 -} diff --git a/tools/utils/__init__.py b/tools/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tools/utils/utils.py b/tools/utils/utils.py deleted file mode 100644 index 501aac32..00000000 --- a/tools/utils/utils.py +++ /dev/null @@ -1,449 +0,0 @@ -import csv -import json -import math -import queue -import random -from datetime import timedelta -from functools import partial - -import numpy as np -import pandas as pd -import tritonclient.grpc as grpcclient -import tritonclient.http as httpclient -from tabulate import tabulate -from tritonclient.utils import np_to_triton_dtype - - -class UserData: - - def __init__(self): - self._completed_requests = queue.Queue() - self._latencies = [] - self._latency_dict = {} - self._start_time_dict = {} - self._stop_time_dict = {} - - -# Callback function used for async_stream_infer() -def completion_callback(user_data, result, error): - # passing error raise and handling out - user_data._completed_requests.put((result, error)) - - -def prepare_tensor(name, input, protocol): - client_util = httpclient if protocol == "http" else grpcclient - t = client_util.InferInput(name, input.shape, - np_to_triton_dtype(input.dtype)) - t.set_data_from_numpy(input) - return t - - -def prepare_inputs(input_start_ids, input_len, pad_id, end_id, flags): - output_len = np.ones([input_start_ids.shape[0], 1]).astype( - np.int32) * flags.output_len - runtime_top_k = (flags.topk * - np.ones([input_start_ids.shape[0], 1])).astype(np.int32) - runtime_top_p = flags.topp * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.float32) - beam_search_diversity_rate = 0.0 * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.float32) - temperature = 1.0 * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.float32) - len_penalty = 1.0 * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.float32) - repetition_penalty = 1.0 * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.float32) - random_seed = 0 * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.uint64) - output_log_probs = True * \ - np.ones([input_start_ids.shape[0], 1]).astype(bool) - beam_width = (flags.beam_width * - np.ones([input_start_ids.shape[0], 1])).astype(np.int32) - pad_ids = pad_id * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.int32) - end_ids = end_id * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.int32) - min_length = 1 * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.int32) - presence_penalty = 0.0 * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.float32) - frequency_penalty = 0.0 * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.float32) - bad_words_list = np.concatenate([ - np.zeros([input_start_ids.shape[0], 1, 1]).astype(np.int32), - (-1 * np.ones([input_start_ids.shape[0], 1, 1])).astype(np.int32) - ], - axis=1) - stop_word_list = np.concatenate([ - np.zeros([input_start_ids.shape[0], 1, 1]).astype(np.int32), - (-1 * np.ones([input_start_ids.shape[0], 1, 1])).astype(np.int32) - ], - axis=1) - inputs = [ - prepare_tensor("input_ids", input_start_ids, flags.protocol), - prepare_tensor("input_lengths", input_len, flags.protocol), - prepare_tensor("request_output_len", output_len, flags.protocol), - prepare_tensor("pad_id", pad_ids, flags.protocol), - prepare_tensor("end_id", end_ids, flags.protocol), - prepare_tensor("beam_width", beam_width, flags.protocol), - prepare_tensor("temperature", temperature, flags.protocol), - prepare_tensor("runtime_top_k", runtime_top_k, flags.protocol), - prepare_tensor("runtime_top_p", 
runtime_top_p, flags.protocol), - prepare_tensor("len_penalty", len_penalty, flags.protocol), - prepare_tensor("repetition_penalty", repetition_penalty, - flags.protocol), - prepare_tensor("min_length", min_length, flags.protocol), - prepare_tensor("presence_penalty", presence_penalty, flags.protocol), - prepare_tensor("frequency_penalty", frequency_penalty, flags.protocol), - prepare_tensor("random_seed", random_seed, flags.protocol), - prepare_tensor("output_log_probs", output_log_probs, flags.protocol), - # prepare_tensor("bad_words_list", bad_words_list, flags.protocol), - # prepare_tensor("stop_words_list", stop_word_list, flags.protocol), - ] - return inputs - - -def create_inference_server_client(protocol, url, concurrency, verbose): - client_util = httpclient if protocol == "http" else grpcclient - if protocol == "http": - return client_util.InferenceServerClient(url, - concurrency=concurrency, - verbose=verbose) - elif protocol == "grpc": - return client_util.InferenceServerClient(url, verbose=verbose) - - -def send_requests(model_name, inputs, client, request_parallelism): - results = [] - for _ in range(request_parallelism): - result = client.infer(model_name, inputs) - results.append(result) - return results - - -def send_requests_async(model_name, inputs, client, flags, - request_parallelism): - if flags.protocol == "http": - async_requests = [] - for _ in range(request_parallelism): - async_requests.append(client.async_infer(model_name, inputs)) - return async_requests - else: - user_data = UserData() - for _ in range(request_parallelism): - client.async_infer(model_name, inputs, - partial(completion_callback, user_data)) - return user_data - - -def get_http_results(async_requests): - results = [] - for async_request in async_requests: - results.append(async_request.get_result()) - return results - - -def get_grpc_results(user_data, request_parallelism): - results = [] - processed_count = 0 - while processed_count < request_parallelism: - (result, error) = user_data._completed_requests.get() - processed_count += 1 - if error is not None: - raise RuntimeError(error) - results.append(result) - return results - - -def append_start_and_end_ids(inputs, - batch_size, - flags, - start_id=None, - end_id=None): - if start_id is not None: - start_ids = start_id * np.ones([batch_size, 1]).astype(np.int32) - inputs.append(prepare_tensor("start_id", start_ids, flags.protocol)) - if end_id is not None: - end_ids = end_id * np.ones([batch_size, 1]).astype(np.int32) - inputs.append(prepare_tensor("end_id", end_ids, flags.protocol)) - - -def generate_histogram(range_buckets, frequencies): - histogram = [] - - for i in range(len(range_buckets)): - bucket = range_buckets[i] - frequency = frequencies[i] - - # Split the bucket range into min and max values - min_range, max_range = bucket - - # Generate 'frequency' random values within the specified range - random.seed(420) - random_values = [ - random.randint(min_range, max_range) for _ in range(frequency) - ] - - # Extend the histogram with the random values - histogram.extend(random_values) - - # Randomize the order of values in the histogram - random.shuffle(histogram) - - return histogram - - -def get_token_list_from_histogram(histogram_key): - - histogram_buckets = { - "example_ip": [(151, 175), (176, 200), (201, 225), (226, 250), - (251, 275)], - "example_op": [(6, 10), (11, 15), (16, 20), (21, 25), (26, 30)] - } - histogram_freq = { - "example_ip": [220, 225, 150, 150, 140], - "example_op": [76, 210, 174, 130, 152] - } - - range_buckets = 
histogram_buckets[histogram_key] - freqs = histogram_freq[histogram_key] - assert (len(range_buckets) == len(freqs)) - - return generate_histogram(range_buckets, freqs) - - -def get_list_of_delays(delay_dist, mean_time_bet_reqs, num_reqs): - if delay_dist == "constant": - delays = [mean_time_bet_reqs] * num_reqs - elif delay_dist == "exponential_dist": - delays = get_exponential_dist_delays(mean_time_bet_reqs, num_reqs) - - return delays - - -def get_exponential_dist_delays(mean_time_bet_reqs, num_reqs): - # set seed for determinism - np.random.seed(420) - return np.random.exponential(mean_time_bet_reqs, num_reqs).tolist() - - -def get_norm_dist_tokens(mean, stdev, num_reqs): - # set seed for determinism - np.random.seed(420) - numbers_list = np.random.normal(loc=mean, scale=stdev, - size=num_reqs).tolist() - return [max(1, math.ceil(x)) for x in numbers_list] - - -def gen_random_start_ids(ip_lens): - input_start_ids = [] - for ip_len in ip_lens: - start_ids = list( - np.random.randint(low=0, - high=np.iinfo(np.int32).max, - size=ip_len, - dtype=np.int32)) - input_start_ids.append(np.array([start_ids])) - - return input_start_ids - - -def get_list_of_delays(delay_dist, mean_time_bet_reqs, num_reqs): - if delay_dist == "constant": - delays = [mean_time_bet_reqs] * num_reqs - elif delay_dist == "exponential_dist": - delays = get_exponential_dist_delays(mean_time_bet_reqs, num_reqs) - - return delays - - -def get_exponential_dist_delays(mean_time_bet_reqs, num_reqs): - return np.random.exponential(mean_time_bet_reqs, num_reqs).tolist() - - -def get_norm_dist_tokens(mean, stdev, num_reqs): - numbers_list = np.random.normal(loc=mean, scale=stdev, - size=num_reqs).tolist() - return [max(1, math.ceil(x)) for x in numbers_list] - - -def get_inflight_reqs_profile(start_times, end_times, requests_per_sec): - """ - Receives start and end times of all requests, - divides total E2E time into equal intervals and assigns how many requests are in flight - in each interval. - """ - # Calculate min of start time and max of end time - min_start_time = min(start_times) - max_end_time = max(end_times) - - # need to have enough resolution intervals depending on avg. latency per request. 10 times smaller than request processing time - sec_per_request = 1.0 / requests_per_sec - NUM_INTERVALS = int((max_end_time - min_start_time) / - timedelta(seconds=(sec_per_request / 10))) - print(NUM_INTERVALS) - # Calculate interval length - interval_length = (max_end_time - min_start_time) / NUM_INTERVALS - - # Initialize a list to store the count of requests in each interval - interval_counts = [0] * NUM_INTERVALS - - # Iterate through the requests and update interval counts - for i in range(len(start_times)): - start = start_times[i] - end = end_times[i] - - # Calculate which interval the request falls into - interval_index = int((start - min_start_time) / interval_length) - - # Increment the count for that interval and subsequent intervals until end - while start < end and interval_index < NUM_INTERVALS: - interval_counts[interval_index] += 1 - interval_index += 1 - start += interval_length - - return interval_counts - - -def extract_print_stats(ip_token_len_list, responses, user_data, FLAGS): - - #### Gather info about requests - op_token_len_list = [] - op_token_len_ooo = {} - - for response in responses: - #JG: long sequence to extract output length from response json dict. 
Responses are out of order - op_token_len_ooo[response.get_response(as_json=True)['id']] = \ - int(response.get_response(as_json=True)['outputs'][0]['shape'][2]) - - op_token_len_list = [ - value for key, value in sorted(op_token_len_ooo.items()) - ] - - assert (len(op_token_len_list) == len(ip_token_len_list)) - if not FLAGS.exclude_input_in_output: - for i in range(len(op_token_len_list)): - op_token_len_list[i] = op_token_len_list[i] - ip_token_len_list[i] - - # Get latencies per request - # Order latencies based on issue order. - latency_list_in_order = [ - value for key, value in sorted(user_data._latency_dict.items()) - ] - start_time_list_in_order = [ - value for key, value in sorted(user_data._start_time_dict.items()) - ] - stop_time_list_in_order = [ - value for key, value in sorted(user_data._stop_time_dict.items()) - ] - - latency_sorted = np.sort(latency_list_in_order) - index_99 = math.ceil(len(latency_sorted) * 0.99) - index_90 = math.ceil(len(latency_sorted) * 0.90) - - data = { - 'latency': latency_list_in_order, - 'start_time': start_time_list_in_order, - 'stop_time': stop_time_list_in_order, - 'num_ip_tokens': ip_token_len_list, - 'num_op_tokens': op_token_len_list - } - - # Bundle everything in a single DF - df = pd.DataFrame(data) - - #stats - df['num_ip_tokens'].sum() - avg_ip_tokens = df['num_ip_tokens'].mean() - df['num_ip_tokens'].median() - df['num_ip_tokens'].std() - total_op_tokens = df['num_op_tokens'].sum() - avg_op_tokens = df['num_op_tokens'].mean() - df['num_op_tokens'].median() - df['num_op_tokens'].std() - - tend = max(df['stop_time'].tolist()) - t0 = min(df['start_time'].tolist()) - total_latency = (tend - t0).total_seconds() - requests_per_sec = len(responses) / total_latency - tokens_generated_per_sec = total_op_tokens / total_latency - - avg_in_flight_requests = 0 - - print_data_dict = {} - print_data_dict["Requests/Sec"] = requests_per_sec - print_data_dict["OP tokens/sec"] = tokens_generated_per_sec - print_data_dict["Avg. latency (ms)"] = np.mean(latency_list_in_order) - print_data_dict["P99 latency (ms)"] = latency_sorted[index_99 - 1] - print_data_dict["P90 latency (ms)"] = latency_sorted[index_90 - 1] - print_data_dict["Avg. Input tokens per request"] = avg_ip_tokens - print_data_dict["Avg. Output tokens per request"] = avg_op_tokens - print_data_dict["Avg. InFlight requests"] = avg_in_flight_requests - print_data_dict["Total latency (ms)"] = total_latency * 1000 - print_data_dict["Total requests"] = len(responses) - - print_data = [["Requests/Sec", requests_per_sec], - ["OP tokens/sec", tokens_generated_per_sec], - ["Avg. latency (ms)", - np.mean(latency_list_in_order)], - ["P99 latency (ms)", latency_sorted[index_99 - 1]], - ["P90 latency (ms)", latency_sorted[index_90 - 1]], - ["Avg. IP tokens per request", avg_ip_tokens], - ["Avg. OP tokens per request", avg_op_tokens], - ["Avg. 
InFlight requests", avg_in_flight_requests], - ["Total latency (ms)", total_latency * 1000], - ["Total requests", len(responses)]] - - # Format numerical values to 2 decimal places - formatted_data = [[item, f"{value:.2f}"] for item, value in print_data] - headers = ["Stat", "Value"] - table = tabulate(formatted_data, headers=headers, tablefmt="pretty") - - if FLAGS.op_stats_csv is not None: - with open(FLAGS.op_stats_csv, "a", newline="") as file: - filednames = print_data_dict.keys() - writer = csv.DictWriter(file, fieldnames=filednames) - - # Check if the file is empty, and write the header if needed - if file.tell() == 0: - writer.writeheader() - - # Write the dictionaries as new rows - writer.writerow(print_data_dict) - - print(table) - - if FLAGS.dump_perfetto_trace: - json_dict = [] - for i in range(len(op_token_len_list)): - req_dict = {} - req_dict['name'] = 'req_{}'.format(i) - req_dict["cat"] = "batch" - req_dict["ph"] = "X" - req_dict["ts"] = (start_time_list_in_order[i].timestamp() - - t0.timestamp()) * 1000000 #perfetto expects us - req_dict["dur"] = ( - stop_time_list_in_order[i] - - start_time_list_in_order[i]).total_seconds() * 1000000 - req_dict["pid"] = "1" - req_dict["args"] = { - "isl": int(ip_token_len_list[i]), - "osl": int(op_token_len_list[i]) - } - json_dict.append(req_dict) - - with open("prfetto_dump.json", "w") as file: - json.dump(json_dict, file, indent=4) - - return print_data_dict - - -def extract_string_from_nested_list(nested_list): - if isinstance(nested_list, str): - return nested_list - elif isinstance(nested_list, list): - for item in nested_list: - extracted_string = extract_string_from_nested_list(item) - if extracted_string: - return extracted_string - return "" diff --git a/tools/version.txt b/tools/version.txt deleted file mode 100644 index dfc4d6d2..00000000 --- a/tools/version.txt +++ /dev/null @@ -1 +0,0 @@ -73b896d12a81662027fa6746ab3ed99450150e18