diff --git a/inflight_batcher_llm/src/libtriton_tensorrtllm.ldscript b/.github/workflows/pre-commit.yml similarity index 83% rename from inflight_batcher_llm/src/libtriton_tensorrtllm.ldscript rename to .github/workflows/pre-commit.yml index 748714d1..4fa18732 100644 --- a/inflight_batcher_llm/src/libtriton_tensorrtllm.ldscript +++ b/.github/workflows/pre-commit.yml @@ -1,4 +1,4 @@ -# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -23,8 +23,16 @@ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -{ - global: - TRITONBACKEND_*; - local: *; -}; + +name: pre-commit + +on: + pull_request: + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5.0.0 + - uses: actions/setup-python@v6.0.0 + - uses: pre-commit/action@v3.0.1 diff --git a/.gitignore b/.gitignore index f4c2f069..a8cb1c8d 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ build/ .coverage *.onnx tmp/ +.idea diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index caca92b3..ee803e26 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,29 @@ +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ repos: - repo: https://github.com/pycqa/isort rev: 5.12.0 @@ -7,12 +33,12 @@ repos: rev: v1.1.13 hooks: - id: remove-crlf -- repo: https://github.com/pre-commit/mirrors-yapf - rev: v0.32.0 +- repo: https://github.com/google/yapf + rev: v0.43.0 hooks: - id: yapf - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.1.0 + rev: v6.0.0 hooks: - id: check-added-large-files exclude: 'tensorrt_llm/' diff --git a/README.md b/README.md index 9d9e0088..9c64ed57 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ tensorrt_llm --> postprocessing in this case), we - # expect each duration metric for the ensemble model to be greater the - # corresponding sum for that metric across each of the submodels. - duration_keys = [ - "request_duration_us", "compute_input_duration_us", - "compute_infer_duration_us", "compute_output_duration_us" - ] - for stat in duration_keys: - composing_stat_duration = sum([ - int(model_metrics[model][stat]) for model in model_metrics - if model != "ensemble" - ]) - ensemble_stat_duration = int(model_metrics["ensemble"][stat]) - self.assertTrue(composing_stat_duration > 0) - self.assertTrue(ensemble_stat_duration > 0) - self.assertTrue(ensemble_stat_duration >= composing_stat_duration) - - def test_end_to_end(self): - try: - client = utils.create_inference_server_client("http", - "localhost:8000", - concurrency=128, - verbose=True) - except Exception as e: - print("channel creation failed: " + str(e)) - sys.exit(1) - - max_input_len = 500 - op_tokens_per_word = 1.3 - dataset = "./simple_data.json" - - prompts = [] - output_lens = [] - with open(dataset, "r") as f: - data_dict = json.load(f) - for req in data_dict: - prompt = req["input"] + " " + req["instruction"] - output = req["output"] - # 1.3 is a magic number that converts number of words to number of tokens - if int(len(prompt.split(" ")) / - op_tokens_per_word) > max_input_len: - continue - prompts.append(prompt) - output_lens.append( - int(len(output.split(" ")) * op_tokens_per_word)) - - self._run_infer(client, prompts, output_lens) - metrics = self._get_metrics() - filename = "./base_metrics.out" - with open(filename, "w+") as metrics_file: - metrics_file.write(metrics) - self._verify_base_metrics(filename) - - -if __name__ == "__main__": - unittest.main() diff --git a/ci/L0_backend_trtllm/custom_metrics_verification_tests.py b/ci/L0_backend_trtllm/custom_metrics_verification_tests.py deleted file mode 100644 index ad6c539f..00000000 --- a/ci/L0_backend_trtllm/custom_metrics_verification_tests.py +++ /dev/null @@ -1,174 +0,0 @@ -#!/usr/bin/python -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import json -import os -import re -import unittest -from datetime import datetime, timedelta - -AVAILABLE_GPUS = int(os.environ.get("AVAILABLE_GPUS", "1")) - -metric_to_stat_dict = { - "request_type=context": "Context Requests", - "request_type=scheduled": "Scheduled Requests", - "request_type=max": "Max Request Count", - "request_type=active": "Active Request Count", - "memory_type=pinned": "Runtime Pinned Memory Usage", - "memory_type=gpu": "Runtime GPU Memory Usage", - "memory_type=cpu": "Runtime CPU Memory Usage", - "kv_cache_block_type=tokens_per": "Tokens per KV cache block", - "kv_cache_block_type=used": "Used KV cache blocks", - "kv_cache_block_type=free": "Free KV cache blocks", - "kv_cache_block_type=max": "Max KV cache blocks", - "inflight_batcher_specific_metric=total_context_tokens": - "Total Context Tokens", - "inflight_batcher_specific_metric=micro_batch_id": "MicroBatch ID", - "inflight_batcher_specific_metric=generation_requests": - "Generation Requests", - "inflight_batcher_specific_metric=paused_requests": "Paused Requests", - "v1_specific_metric=total_context_tokens": "Total Context Tokens", - "v1_specific_metric=total_generation_tokens": "Total Generation Tokens", - "v1_specific_metric=empty_generation_slots": "Empty Generation Slots", - "general_type=iteration_counter": "Iteration Counter", - "general_type=timestamp": "Timestamp", -} - - -class CustomMetricsTest(unittest.TestCase): - - def _parse_log_file(self, filename): - with open(filename) as log_file: - for line in reversed(list(log_file)): - if "Active Request Count" in line: - match = re.search(r'({.*})', line) - if match: - json_string = match.group(1) - try: - json_string = json_string.replace('\\"', '"') - except json.JSONDecodeError as e: - raise Exception("Error parsing the JSON string: ", - e) - else: - raise Exception("No JSON found in the log file") - - return json.loads(json_string) - - def _parse_triton_metrics(self, filename, is_v1): - curl_counts = {} - with open(filename) as metrics_file: - for line in metrics_file: - metric_value = "" - if line[0] != "#" and "nv_trt_llm" in line: - metric_output = re.sub(r"^.*?{", "{", line).split() - metric_key = metric_output[0] - metric_value = metric_output[1] - key = self._convert_metric_key_to_stats_key( - metric_key, is_v1) - curl_counts[key] = metric_value - return curl_counts - - def _convert_metric_key_to_stats_key(self, metric_output, is_v1): - # Converts: - # '{model="tensorrt_llm",request_type="context",version="1"}' - # to: - # ['model=tensorrt_llm', 'request_type=context', 'version=1'] - base = metric_output.replace('"', "").strip("{}").split(",") - key = [ - i for i in base - if not i.startswith('model') and not i.startswith('version') - ][0] - self.assertIn(key, metric_to_stat_dict) - 
if (is_v1): - self.assertNotIn("inflight_batcher_specific_metric", key) - else: - self.assertNotIn("v1_specific_metric", key) - return metric_to_stat_dict[key] - - def _base_test(self, stats_file, metrics_file, is_v1): - stats = self._parse_log_file(stats_file) - metrics = self._parse_triton_metrics(metrics_file, is_v1) - self.assertEqual(len(stats.keys()), len(metrics.keys())) - self.assertEqual( - list(stats.keys()).sort(), - list(metrics.keys()).sort()) - for metric_key in stats.keys(): - if metric_key != "Timestamp": - self.assertEqual(int(stats[metric_key]), - int(metrics[metric_key])) - else: - dt_log = datetime.strptime(stats[metric_key], - '%m-%d-%Y %H:%M:%S') - dt_curl = datetime.utcfromtimestamp( - int(metrics[metric_key]) // 1000000) - difference = dt_log - dt_curl - self.assertTrue( - timedelta(seconds=-1) <= difference, - difference <= timedelta(seconds=1)) - - def test_1_gpu_v1(self): - self._base_test("1gpu_v1_no_streaming_server.log", - "1gpu_v1_no_stream_metrics.out", True) - - def test_1_gpu_IFB_no_stream(self): - self._base_test("1gpu_IFB_no_streaming_server.log", - "1gpu_IFB_no_stream_metrics.out", False) - - def test_1_gpu_IFB_stream(self): - self._base_test("1gpu_IFB_streaming_server.log", - "1gpu_IFB_stream_metrics.out", False) - - if AVAILABLE_GPUS >= 2: - - def test_2_gpu_v1(self): - self._base_test("2gpu_v1_no_streaming_server.log", - "2gpu_v1_no_stream_metrics.out", True) - - def test_2_gpu_IFB_no_stream(self): - self._base_test("2gpu_IFB_no_streaming_server.log", - "2gpu_IFB_no_stream_metrics.out", False) - - def test_2_gpu_IFB_stream(self): - self._base_test("2gpu_IFB_streaming_server.log", - "2gpu_IFB_stream_metrics.out", False) - - if AVAILABLE_GPUS >= 4: - - def test_4_gpu_v1(self): - self._base_test("4gpu_v1_no_streaming_server.log", - "4gpu_v1_no_stream_metrics.out", True) - - def test_4_gpu_IFB_no_stream(self): - self._base_test("4gpu_IFB_no_streaming_server.log", - "4gpu_IFB_no_stream_metrics.out", False) - - def test_4_gpu_IFB_stream(self): - self._base_test("4gpu_IFB_streaming_server.log", - "4gpu_IFB_stream_metrics.out", False) - - -if __name__ == "__main__": - unittest.main() diff --git a/ci/L0_backend_trtllm/generate_engines.sh b/ci/L0_backend_trtllm/generate_engines.sh deleted file mode 100644 index e51bcbc1..00000000 --- a/ci/L0_backend_trtllm/generate_engines.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -BASE_DIR=/opt/tritonserver/tensorrtllm_backend/ci/L0_backend_trtllm -GPT_DIR=/opt/tritonserver/tensorrtllm_backend/tensorrt_llm/examples/gpt -TRTLLM_DIR=/opt/tritonserver/tensorrtllm_backend/tensorrt_llm/ - -function build_base_model { - local NUM_GPUS=$1 - cd ${GPT_DIR} - rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 - pushd gpt2 && rm pytorch_model.bin model.safetensors && wget -q https://huggingface.co/gpt2-medium/resolve/main/pytorch_model.bin && popd - python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size ${NUM_GPUS} --output_dir ./c-model/gpt2/${NUM_GPUS}-gpu/ - cd ${BASE_DIR} -} - -function build_tensorrt_engine_inflight_batcher { - local NUM_GPUS=$1 - cd ${GPT_DIR} - local GPT_MODEL_DIR=./c-model/gpt2/${NUM_GPUS}-gpu/ - local OUTPUT_DIR=inflight_${NUM_GPUS}_gpu/ - # ./c-model/gpt2/ must already exist (it will if build_base_model - # has already been run) - extra_args="" - # If no nvlink, disable custom all reduce. - if [ "$(nvidia-smi nvlink -s | wc -l)" -eq "0" ] || [ $(nvidia-smi nvlink --status | grep inActive | wc -l) -ge 1 ]; then - extra_args+="--use_custom_all_reduce=disable" - fi - trtllm-build --checkpoint_dir "${GPT_MODEL_DIR}" \ - --gpt_attention_plugin float16 \ - --remove_input_padding enable \ - --paged_kv_cache enable \ - --gemm_plugin float16 \ - --workers "${NUM_GPUS}" \ - --output_dir "${OUTPUT_DIR}" \ - ${extra_args} - cd ${BASE_DIR} -} - -# Downgrade to legacy version to accommodate Triton CI runners -pip install pynvml==11.4.0 - -# Generate the TRT_LLM model engines -NUM_GPUS_TO_TEST=("1" "2" "4") -for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do - AVAILABLE_GPUS=$(nvidia-smi -L | wc -l) - if [ "$AVAILABLE_GPUS" -lt "$NUM_GPU" ]; then - continue - fi - - build_base_model "${NUM_GPU}" - build_tensorrt_engine_inflight_batcher "${NUM_GPU}" -done - -# Move the TRT_LLM model engines to the CI directory -mkdir engines -mv ${GPT_DIR}/inflight_*_gpu/ engines/ - -# Move the tokenizer into the CI directory -mkdir tokenizer -mv ${GPT_DIR}/gpt2/* tokenizer/ diff --git a/ci/L0_backend_trtllm/simple_data.json b/ci/L0_backend_trtllm/simple_data.json deleted file mode 100644 index 9b7bebca..00000000 --- a/ci/L0_backend_trtllm/simple_data.json +++ /dev/null @@ -1,67 +0,0 @@ -[ - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. 
Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. 
After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. 
Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - }, - { - "input": "MADRID, Spain -- Lionel Messi scored for the sixth game in a row as Barcelona defeated big-spending Atletico Madrid 3-0 to stay in touch with Primera Liga leaders Real Madrid. 
Messi (left) is congratulated by Ronaldinho after scoring again in Barcelona's 3-0 win over Atletico Madrid. Barcelona had thumped Atletico 6-0 on their own ground last season and the visitors were out for revenge -- but conceded twice in four minutes. After 15 minutes, Italian goalkeeper Christian Abbiati let a routine Messi cross slip out of his hands and Deco rolled home into the empty net. Four minutes later Messi played a great one-two with Ronaldinho and rifled a shot past Abbiati for his sixth goal of the season and Xavi added a third late on.", - "instruction": "Summarize the following news article:", - "output": "Lionel Messi scores for the sixth game in a row as Barca defeat Atletico 3-0 ." - } -] diff --git a/ci/L0_backend_trtllm/test.sh b/ci/L0_backend_trtllm/test.sh deleted file mode 100644 index b947971a..00000000 --- a/ci/L0_backend_trtllm/test.sh +++ /dev/null @@ -1,410 +0,0 @@ -#!/bin/bash -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-SERVER_IPADDR=${TRITONSERVER_IPADDR:=localhost} -SERVER_TIMEOUT=${SERVER_TIMEOUT:=120} -DATASET="$PWD/simple_data.json" -TOOLS_DIR='/opt/tritonserver/tensorrtllm_backend/tools' -STREAM_DIR='/opt/tritonserver/tensorrtllm_backend/inflight_batcher_llm/client' -MODEL_DIR="$PWD/triton_model_repo" -SERVER=/opt/tritonserver/bin/tritonserver -TOKENIZER_DIR=/opt/tritonserver/tensorrtllm_backend/ci/L0_backend_trtllm/tokenizer -BASE_DIR=/opt/tritonserver/tensorrtllm_backend/ci/L0_backend_trtllm -BASE_METRICS_VERIFICATION_TEST=base_metrics_verification_tests.py -BASE_METRICS_VERIFICATION_LOG="base_metrics_verification.log" -CUSTOM_METRICS_VERIFICATION_TEST=custom_metrics_verification_tests.py -CUSTOM_METRICS_VERIFICATION_LOG="custom_metrics_verification.log" -SERVER_PID=0 -SLEEP_DURATION=3 - -# Force environment to use python version 3 -apt update -q=2 \ - && apt install -y python-is-python3 - -# Helpers =============================== -function replace_config_tags { - tag_to_replace="${1}" - new_value="${2}" - config_file_path="${3}" - sed -i "s|${tag_to_replace}|${new_value}|g" ${config_file_path} - -} - -function run_server { - SERVER_ARGS="${1}" - python3 /opt/tritonserver/tensorrtllm_backend/scripts/launch_triton_server.py ${SERVER_ARGS} > ${SERVER_LOG} 2>&1 & - sleep 2 # allow time to obtain the pid(s) - # Read PIDs into an array, trimming whitespaces - readarray -t SERVER_PID < <(pgrep "tritonserver") -} - -# Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on -# success, 1 on failure -function wait_for_server_ready() { - local wait_time_secs="${1:-30}"; shift - local spids=("$@"); - - WAIT_RET=0 - - local wait_secs=$wait_time_secs - until test $wait_secs -eq 0 ; do - # Multi-GPU will spawn multiple pids - for pid in "${spids[@]}"; do - if ! kill -0 $pid > /dev/null 2>&1; then - echo "=== Server not running." - WAIT_RET=1 - return - fi - done - - sleep 1; - - set +e - code=`curl -s -w %{http_code} ${SERVER_IPADDR}:8000/v2/health/ready` - set -e - if [ "$code" == "200" ]; then - code=`curl -s -w %{http_code} -o ./curl.out -d'{"log_verbose_level":1}' localhost:8000/v2/logging` - assert_curl_success "Failed to change log settings necessary for verification" ${BASH_LINENO} - return - fi - - ((wait_secs--)); - done - - echo "=== Timeout $wait_time_secs secs. Server not ready." - WAIT_RET=1 -} - -function reset_model_repo { - rm -rf triton_model_repo/ - mkdir ${MODEL_DIR} -} - -function kill_server { - pgrep tritonserver | xargs kill -SIGINT - if pgrep -x "trtllmExecutorWorker" > /dev/null; then - pkill -SIGINT -f "trtllmExecutorWorker" - fi -} - -function wait_for_server_terminated { - local spids=("$@"); - for pid in "${spids[@]}"; do - echo "Waiting for proc ${pid} to terminate..." - while true; do - if ! 
(kill -0 $pid) > /dev/null 2>&1; then - break - fi - sleep 1 - done - done -} - -function assert_curl_success { - message="${1}" - original_line_no="${2}" - RET=0 - if [ "$code" != "200" ]; then - cat ./curl.out - cat ${SERVER_LOG} - echo -e "\n***\n*** ${message} : line ${original_line_no}\n***" - RET=1 - fi - return ${RET} -} - -# ======================================= - -rm -f *.log *.out -# Generate TRT_LLM engines and install dependencies -source ./generate_engines.sh -python3 -m pip install --upgrade pip && \ - pip3 install tritonclient[all] && \ - pip3 install pandas && \ - pip3 install tabulate - -export AVAILABLE_GPUS=$(nvidia-smi -L | wc -l) - -RET=0 - -NUM_GPUS_TO_TEST=("1" "2" "4") -for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do - if [ "$AVAILABLE_GPUS" -lt "$NUM_GPU" ]; then - break - fi - - SERVER_ARGS="--world_size=${NUM_GPU} --model_repo=${MODEL_DIR}" - - reset_model_repo - - cp -r /opt/tritonserver/tensorrtllm_backend/all_models/inflight_batcher_llm/* ${MODEL_DIR} - rm -rf ${MODEL_DIR}/tensorrt_llm_bls - replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/ensemble/config.pbtxt" - replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/preprocessing/config.pbtxt" - replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_DIR}/preprocessing/config.pbtxt" - replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_DIR}/preprocessing/config.pbtxt" - replace_config_tags '${decoupled_mode}' 'False' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - replace_config_tags '${batching_strategy}' 'INVALID' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - replace_config_tags '${engine_dir}' "${MODEL_DIR}/tensorrt_llm/1/inflight_${NUM_GPU}_gpu/" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - replace_config_tags '${max_queue_delay_microseconds}' "50000" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/postprocessing/config.pbtxt" - replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_DIR}/postprocessing/config.pbtxt" - replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_DIR}/postprocessing/config.pbtxt" - - # Copy the engine and place it into the model folder - cp -r ${BASE_DIR}/engines/inflight_${NUM_GPU}_gpu/ triton_model_repo/tensorrt_llm/1 - - # Invalid GPT model Type - SERVER_LOG="./${NUM_GPU}gpu_invalid_batch_strat.log" - - run_server "${SERVER_ARGS}" - wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} - - # Expect invalid GPT model type error to be gracefully handled - if [ `grep -c "Invalid gpt_model_type" $SERVER_LOG` == "0" ]; then - echo -e "\n***\n*** GPT model type error not handled gracefully: line ${LINENO}\n***" - cat $SERVER_LOG - exit 1 - fi - - wait_for_server_terminated ${SERVER_PID[@]} - - # inflight batching OFF (V1) - # streaming OFF - SERVER_LOG="./${NUM_GPU}gpu_v1_no_streaming_server.log" - replace_config_tags 'INVALID' 'V1' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - - run_server "${SERVER_ARGS}" - wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} - if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill $SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - set -e - python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \ - --max-input-len=500 \ - dataset 
--dataset=${DATASET} \ - --tokenizer-dir=${TOKENIZER_DIR} - - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing v1 benchmark_core_model test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 - fi - set +e - - set -e - python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \ - --max-input-len=500 \ - --dataset=${DATASET} - - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing v1 end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 - fi - set +e - - # Make sure the metrics is retrieved after the server has updated the metrics internally - sleep ${SLEEP_DURATION} - curl localhost:8002/metrics -o ${NUM_GPU}gpu_v1_no_stream_metrics.out - - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - - # inflight batching ON - # streaming OFF - SERVER_LOG="./${NUM_GPU}gpu_IFB_no_streaming_server.log" - replace_config_tags 'V1' 'inflight_fused_batching' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - - run_server "${SERVER_ARGS}" - wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} - if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill $SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - set -e - python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \ - --max-input-len=500 \ - dataset --dataset=${DATASET} \ - --tokenizer-dir=${TOKENIZER_DIR} - - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching benchmark_core_model test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 - fi - set +e - - set -e - python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \ - --max-input-len=500 \ - --dataset=${DATASET} - - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 - fi - set +e - - # Make sure the metrics is retrieved after the server has updated the metrics internally - sleep ${SLEEP_DURATION} - curl localhost:8002/metrics -o ${NUM_GPU}gpu_IFB_no_stream_metrics.out - - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - - # Start a clean server to verify base metrics are being - # reported correctly - SERVER_LOG="./${NUM_GPU}gpu_IFB_no_streaming_base_metrics.log" - run_server "${SERVER_ARGS}" - wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} - if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill $SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - set -e - - python3 ${BASE_METRICS_VERIFICATION_TEST} >> ${BASE_METRICS_VERIFICATION_LOG} 2>&1 - if [ $? 
-ne 0 ]; then - cat ${BASE_METRICS_VERIFICATION_LOG} - echo -e "\n***\n*** Error executing base metrics verification test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" - RET=1 - fi - set +e - - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - - # World size must be 1 when using multi-model - if [ "${NUM_GPU}" == "0" ]; then - # Multi-model - SERVER_LOG="./${NUM_GPU}gpu_multi_model.log" - run_server "${SERVER_ARGS} --multi-model" - wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} - if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill $SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - set -e - - python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \ - --max-input-len=500 \ - --dataset=${DATASET} - - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 - fi - set +e - - # Make sure the metrics is retrieved after the server has updated the metrics internally - sleep ${SLEEP_DURATION} - curl localhost:8002/metrics -o ${NUM_GPU}gpu_multi_model_metrics.out - - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - fi - - # inflight batching ON - # streaming ON - SERVER_LOG="./${NUM_GPU}gpu_IFB_streaming_server.log" - replace_config_tags 'decoupled: False' 'decoupled: True' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" - - run_server "${SERVER_ARGS}" - wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]} - if [ "$WAIT_RET" != "0" ]; then - # Cleanup - kill $SERVER_PID > /dev/null 2>&1 || true - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - set -e - python3 ${STREAM_DIR}/end_to_end_grpc_client.py \ - --prompt="My name is" - - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** Error executing inflight batching end-to-end streaming test with ${NUM_GPU}GPU(s): line ${LINENO}\n***" - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - RET=1 - fi - set +e - - # Make sure the metrics is retrieved after the server has updated the metrics internally - sleep ${SLEEP_DURATION} - curl localhost:8002/metrics -o ${NUM_GPU}gpu_IFB_stream_metrics.out - - kill_server - wait_for_server_terminated ${SERVER_PID[@]} - -done - -# Verify TRT LLM statistics are being properly reported as custom metrics -python3 ${CUSTOM_METRICS_VERIFICATION_TEST} >> ${CUSTOM_METRICS_VERIFICATION_LOG} 2>&1 -if [ $? -ne 0 ]; then - cat ${CUSTOM_METRICS_VERIFICATION_LOG} - RET=1 -fi - -if [ $RET -eq 0 ]; then - echo -e "\n***\n*** Test Passed\n***" -else - echo -e "\n***\n*** Test FAILED\n***" -fi - -exit $RET diff --git a/ci/README.md b/ci/README.md deleted file mode 100644 index 8a20dfb0..00000000 --- a/ci/README.md +++ /dev/null @@ -1,112 +0,0 @@ - - -# Testing TensorRT-LLM backend - -Tests in this CI directory can be run manually to provide extensive testing. - -## Run QA Tests - -Before the Triton 23.10 release, you can launch the Triton 23.09 container -`nvcr.io/nvidia/tritonserver:23.09-py3` and add the directory -`/opt/tritonserver/backends/tensorrtllm` within the container following the -instructions in [Option 3 Build via CMake](../README.md#option-3-build-via-cmake). - -Run the testing within the Triton container. 
- -```bash -docker run --rm -it --net host --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 --gpus all -v /path/to/tensorrtllm_backend:/opt/tritonserver/tensorrtllm_backend nvcr.io/nvidia/tritonserver:23.10-trtllm-python-py3 bash - -# Change directory to the test and run the test.sh script -cd /opt/tritonserver/tensorrtllm_backend/ci/ -bash -x ./test.sh -``` - -## Run the e2e/benchmark_core_model to benchmark - -These two tests are ran in the [L0_backend_trtllm](./L0_backend_trtllm/) -test. Below are the instructions to run the tests manually. - -### Generate the model repository - -Follow the instructions in the -[Create the model repository](../README.md#create-the-model-repository) -section to prepare the model repository. - -### Modify the model configuration - -Follow the instructions in the -[Modify the model configuration](../README.md#modify-the-model-configuration) -section to modify the model configuration based on the needs. - -### End to end test - -[End to end test script](../tools/inflight_batcher_llm/end_to_end_test.py) sends -requests to the deployed `ensemble` model. - -Ensemble model is ensembled by three models: `preprocessing`, `tensorrt_llm` and `postprocessing`: -- "preprocessing": This model is used for tokenizing, meaning the conversion from prompts(string) to input_ids(list of ints). -- "tensorrt_llm": This model is a wrapper of your TensorRT-LLM model and is used for inferencing -- "postprocessing": This model is used for de-tokenizing, meaning the conversion from output_ids(list of ints) to outputs(string). - -The end to end latency includes the total latency of the three parts of an ensemble model. - -```bash -cd tools/inflight_batcher_llm -python3 end_to_end_test.py --dataset -``` - -Expected outputs -``` -[INFO] Functionality test succeed. -[INFO] Warm up for benchmarking. -[INFO] Start benchmarking on 125 prompts. -[INFO] Total Latency: 11099.243 ms -``` - -### benchmark_core_model - -[benchmark_core_model script](../tools/inflight_batcher_llm/benchmark_core_model.py) -sends requests directly to the deployed `tensorrt_llm` model, the benchmark_core_model -latency indicates the inference latency of TensorRT-LLM, not including the -pre/post-processing latency which is usually handled by a third-party library -such as HuggingFace. - -```bash -cd tools/inflight_batcher_llm -python3 benchmark_core_model.py dataset --dataset -``` - -Expected outputs - -``` -[INFO] Warm up for benchmarking. -[INFO] Start benchmarking on 125 prompts. 
-[INFO] Total Latency: 10213.462 ms -``` -*Please note that the expected outputs in that document are only for reference, specific performance numbers depend on the GPU you're using.* diff --git a/dockerfile/Dockerfile.triton.trt_llm_backend b/dockerfile/Dockerfile.triton.trt_llm_backend index 524ca41a..e36da3ec 100644 --- a/dockerfile/Dockerfile.triton.trt_llm_backend +++ b/dockerfile/Dockerfile.triton.trt_llm_backend @@ -1,53 +1,84 @@ -ARG BASE_IMAGE # Use NGC PyTorch image as base image +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.06-py3-min +ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:25.06-py3 +ARG NVRTC_VER=12.9.86-1 +ARG TRT_VER=10.11.0.33 +ARG NCCL_VER=2.27.5-1+cuda12.9 +ARG RELEASE_URL_TRT_x86=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.11.0/tars/TensorRT-${TRT_VER}.Linux.x86_64-gnu.cuda-12.9.tar.gz +ARG RELEASE_URL_TRT_ARM=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.11.0/tars/TensorRT-${TRT_VER}.Linux.aarch64-gnu.cuda-12.9.tar.gz + +# Versions of packages to copy from pytorch image +ARG TORCH_VER=2.8.0a0+5228986c39.nv25.6 +ARG TORCHVISION_VER=0.22.0a0+95f10a4e +ARG SETUPTOOLS_VER=78.1.1 +ARG PYTORCH_TRITON_VER=3.3.0+git96316ce52.nvinternal +ARG JINJA2_VER=3.1.6 +ARG NETWORKX_VER=3.5 +ARG SYMPY_VER=1.14.0 +ARG PACKAGING_VER=23.2 +ARG FLASH_ATTN_VER=2.7.4.post1 + +ARG TENSORRTLLM_REPO=https://github.com/NVIDIA/TensorRT-LLM.git +ARG TENSORRTLLM_REPO_TAG=release/1.0 +ARG TENSORRTLLM_VER=1.0.0 + +FROM ${PYTORCH_IMAGE} AS pytorch_image +FROM ${BASE_IMAGE} AS install_dependencies -FROM ${BASE_IMAGE} as install_dependencies +WORKDIR /workspace + +# Might not need to copy cusparseLt in the future once it's included in DLFW cuda container +COPY --from=pytorch_image /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/ +ENV PIP_BREAK_SYSTEM_PACKAGES=1 RUN apt-get update -q=2 \ && apt-get install -y --no-install-recommends \ + python3-dev \ python3-pip \ - ccache \ git-lfs \ + # Remove previous TRT installation + && apt-get purge -y "libnvinfer*" \ + && pip3 uninstall -y tensorrt \ && rm -rf /var/lib/apt/lists/* -# Remove previous TRT installation -# We didn't remove libnvinfer* here because tritonserver depends on the pre-installed libraries. -RUN apt-get remove -y tensorrt* -RUN pip3 uninstall -y tensorrt ARG TRT_VER +ARG NVRTC_VER +ARG NCCL_VER ENV TRT_VERSION=$TRT_VER \ TRT_VER=$TRT_VER \ CUDA_VER=$CUDA_VERSION \ CUDNN_VER=$CUDNN_VERSION \ - NCCL_VER=$NCCL_VERSION \ - CUBLAS_VER=$CUBLAS_VERSION + NCCL_VER=$NCCL_VER \ + CUBLAS_VER=$CUBLAS_VERSION \ + NVRTC_VER="${NVRTC_VER}" -LABEL TRT_VERSION $TRT_VER +LABEL TRT_VERSION=$TRT_VER +LABEL NCCL_VER=$NCCL_VER -# Download & install internal TRT release +# Install NVRTC RUN [ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" \ - && curl -o /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$arch/cuda-keyring_1.0-1_all.deb \ + && curl -o /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$arch/cuda-keyring_1.1-1_all.deb \ && apt install /tmp/cuda-keyring.deb \ && rm /tmp/cuda-keyring.deb \ - && apt-get update -q=2 \ - && rm -rf /var/lib/apt/lists/* - -ARG NVRTC_VER="12.4.99-1" -ENV NVRTC_VER="${NVRTC_VER}" - -RUN apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev* -RUN CUDA_VER_SHORT=$(echo $CUDA_VER | awk -F. 
'{print $1"."$2}') \ - && NVRTC_CUDA_VERSION=$(echo $CUDA_VER_SHORT | sed 's/\./-/g') \ - && apt update -qq \ - && apt-get install -y --no-install-recommends cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER} \ + && apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev* \ + && CUDA_VER_SHORT=${CUDA_VER: 0:4} \ + && NVRTC_CUDA_VERSION=${CUDA_VER_SHORT/./-} \ + && apt-get update -qq \ + && apt-get install -y --no-install-recommends \ + cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER} \ + libnccl2=${NCCL_VER} \ + libnccl-dev=${NCCL_VER} \ && rm -rf /var/lib/apt/lists/* +# Download & install TRT release ARG RELEASE_URL_TRT_x86 ARG RELEASE_URL_TRT_ARM RUN [ "$(uname -m)" != "x86_64" ] && RELEASE_URL_TRT=${RELEASE_URL_TRT_ARM} || RELEASE_URL_TRT=${RELEASE_URL_TRT_x86} \ && curl -fSL -o /tmp/tensorrt.tar.gz ${RELEASE_URL_TRT} \ - && tar xzvf /tmp/tensorrt.tar.gz -C /usr/local \ + # Extract the tarball, excluding Windows libraries and static libraries as + # they are not needed for Linux build + && tar xzvf /tmp/tensorrt.tar.gz --exclude="lib*win.so*" --exclude="*.a" -C /usr/local \ && rm /tmp/tensorrt.tar.gz \ && find /usr/local -maxdepth 1 -name Tens* -type d -exec ln -s {} /usr/local/tensorrt \; @@ -56,20 +87,110 @@ RUN pip3 install /usr/local/tensorrt/python/tensorrt-*-cp$( python3 -c "import s ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH} ENV TRT_ROOT=/usr/local/tensorrt -FROM install_dependencies as tensorrt_llm_build -RUN pip3 install --no-cache-dir \ - cmake \ - polygraphy==0.49.0 \ - mpi4py==3.1.5 +FROM install_dependencies AS tensorrt_llm_code WORKDIR /workspace -COPY scripts scripts -COPY tensorrt_llm tensorrt_llm -RUN cd tensorrt_llm && python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" --clean --job_count 18 && cd .. 
-FROM install_dependencies as base +ARG TENSORRTLLM_REPO +ARG TENSORRTLLM_REPO_TAG +RUN git-lfs install \ + && git clone --single-branch --recurse-submodules --depth=1 -b ${TENSORRTLLM_REPO_TAG} ${TENSORRTLLM_REPO} tensorrt_llm + +# Final stage to build the TRT-LLM container +FROM ${BASE_IMAGE} AS final_stage + +ARG TORCH_VER +ARG TORCHVISION_VER +ARG SETUPTOOLS_VER +ARG PYTORCH_TRITON_VER +ARG JINJA2_VER +ARG NETWORKX_VER +ARG SYMPY_VER +ARG PACKAGING_VER +ARG FLASH_ATTN_VER +# Copy necessary files from the base stage +COPY --from=pytorch_image /usr/local/lib/lib* /usr/local/lib/ +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torch /usr/local/lib/python3.12/dist-packages/torch +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torch-${TORCH_VER}.dist-info /usr/local/lib/python3.12/dist-packages/torch-${TORCH_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchgen /usr/local/lib/python3.12/dist-packages/torchgen +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision /usr/local/lib/python3.12/dist-packages/torchvision +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision-${TORCHVISION_VER}.dist-info /usr/local/lib/python3.12/dist-packages/torchvision-${TORCHVISION_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision.libs /usr/local/lib/python3.12/dist-packages/torchvision.libs +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/setuptools /usr/local/lib/python3.12/dist-packages/setuptools +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/setuptools-${SETUPTOOLS_VER}.dist-info /usr/local/lib/python3.12/dist-packages/setuptools-${SETUPTOOLS_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/functorch /usr/local/lib/python3.12/dist-packages/functorch +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info /usr/local/lib/python3.12/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/triton /usr/local/lib/python3.12/dist-packages/triton +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/jinja2 /usr/local/lib/python3.12/dist-packages/jinja2 +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/jinja2-${JINJA2_VER}.dist-info /usr/local/lib/python3.12/dist-packages/jinja2-${JINJA2_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/networkx /usr/local/lib/python3.12/dist-packages/networkx +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/networkx-${NETWORKX_VER}.dist-info /usr/local/lib/python3.12/dist-packages/networkx-${NETWORKX_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/sympy /usr/local/lib/python3.12/dist-packages/sympy +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/sympy-${SYMPY_VER}.dist-info /usr/local/lib/python3.12/dist-packages/sympy-${SYMPY_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/packaging /usr/local/lib/python3.12/dist-packages/packaging +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/packaging-${PACKAGING_VER}.dist-info /usr/local/lib/python3.12/dist-packages/packaging-${PACKAGING_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn /usr/local/lib/python3.12/dist-packages/flash_attn +COPY 
--from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info /usr/local/lib/python3.12/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info +COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn_2_cuda.cpython-312-*-linux-gnu.so /usr/local/lib/python3.12/dist-packages/ + +# Might not need to copy cusparseLt in the future once it's included in DLFW cuda container +COPY --from=pytorch_image /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/ + +ARG NVRTC_VER +ARG NCCL_VER +ENV CUDA_VER=$CUDA_VERSION \ + NVRTC_VER="${NVRTC_VER}" \ + NCCL_VER="${NCCL_VER}" + +# Install the necessary dependencies and remove previous TRT installation in the +# final image +ENV PIP_BREAK_SYSTEM_PACKAGES=1 +RUN apt-get update -q=2 \ + && apt-get install -y --no-install-recommends \ + python3-dev \ + python3-pip \ + git-lfs \ + perl \ + # Remove previous TRT installation + && apt-get purge -y "libnvinfer*" \ + && pip3 uninstall -y tensorrt \ + && rm -rf /var/lib/apt/lists/* \ + && pip3 install --no-cache-dir polygraphy==0.49.9 mpi4py==3.1.5 + +# Install NVRTC +RUN [ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" \ + && curl -o /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$arch/cuda-keyring_1.1-1_all.deb \ + && apt install /tmp/cuda-keyring.deb \ + && rm /tmp/cuda-keyring.deb \ + && apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev* \ + && CUDA_VER_SHORT=${CUDA_VER: 0:4} \ + && NVRTC_CUDA_VERSION=${CUDA_VER_SHORT/./-} \ + && apt-get update -qq \ + && apt-get install -y --no-install-recommends \ + cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER} \ + libnccl2=${NCCL_VER} \ + libnccl-dev=${NCCL_VER} \ + && rm -rf /var/lib/apt/lists/* + +# Install TRT +COPY --from=install_dependencies /usr/local/tensorrt /usr/local/tensorrt +RUN pip3 install /usr/local/tensorrt/python/tensorrt-*-cp$( python3 -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))" )* -WORKDIR /tmp -COPY --from=tensorrt_llm_build /workspace/tensorrt_llm/build/tensorrt_llm*whl . 
+# Set environment variables +ARG TRT_VER +ENV TRT_VERSION=$TRT_VER +ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH} +ENV TRT_ROOT=/usr/local/tensorrt -RUN pip3 install --no-cache-dir --extra-index-url https://pypi.nvidia.com tensorrt_llm*.whl +# Install TRT-LLM wheel after all the dependencies are installed +ARG TENSORRTLLM_VER +RUN --mount=type=secret,id=pypi_extra_values,env=PYPI_EXTRA_VALUES \ + pip install --no-cache-dir ${PYPI_EXTRA_VALUES} tensorrt_llm==${TENSORRTLLM_VER} + +# Copying the Tensorrt LLM scripts and applications +WORKDIR /app +COPY --from=tensorrt_llm_code /workspace/tensorrt_llm/triton_backend/scripts scripts +COPY --from=tensorrt_llm_code /workspace/tensorrt_llm/triton_backend/all_models all_models +COPY --from=tensorrt_llm_code /workspace/tensorrt_llm/triton_backend/inflight_batcher_llm/client client +COPY --from=tensorrt_llm_code /workspace/tensorrt_llm/triton_backend/tools tools +COPY --from=tensorrt_llm_code /workspace/tensorrt_llm/examples examples diff --git a/dockerfile/Dockerfile.trt_llm_backend b/dockerfile/Dockerfile.trt_llm_backend deleted file mode 100644 index de520c6d..00000000 --- a/dockerfile/Dockerfile.trt_llm_backend +++ /dev/null @@ -1,67 +0,0 @@ -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver -ARG BASE_TAG=24.05-py3 - -FROM ${BASE_IMAGE}:${BASE_TAG} as base - -RUN apt-get update && apt-get install -y --no-install-recommends rapidjson-dev python-is-python3 ccache git-lfs - -COPY requirements.txt /tmp/ -RUN pip3 install -r /tmp/requirements.txt --extra-index-url https://pypi.ngc.nvidia.com - -# Remove previous TRT installation -# We didn't remove libnvinfer* here because tritonserver depends on the pre-installed libraries. -RUN apt-get remove --purge -y tensorrt* -RUN pip uninstall -y tensorrt - -FROM base as dev - -# Download & install internal TRT release -COPY tensorrt_llm/docker/common/install_tensorrt.sh /tmp/ -RUN bash /tmp/install_tensorrt.sh && rm /tmp/install_tensorrt.sh -ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH} -ENV TRT_ROOT=/usr/local/tensorrt - -# Install latest Polygraphy -COPY tensorrt_llm/docker/common/install_polygraphy.sh /tmp/ -RUN bash /tmp/install_polygraphy.sh && rm /tmp/install_polygraphy.sh - -# CMake -COPY tensorrt_llm/docker/common/install_cmake.sh /tmp/ -RUN bash /tmp/install_cmake.sh && rm /tmp/install_cmake.sh -ENV PATH="/usr/local/cmake/bin:${PATH}" - -# Install mpi4py -COPY tensorrt_llm/docker/common/install_mpi4py.sh /tmp/ -RUN bash /tmp/install_mpi4py.sh && rm /tmp/install_mpi4py.sh - -# Use "pypi" (default) for x86_64 arch and "src_non_cxx11_abi" for aarch64 arch -ARG TORCH_INSTALL_TYPE="pypi" -COPY tensorrt_llm/docker/common/install_pytorch.sh install_pytorch.sh -RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh - -FROM dev as trt_llm_builder - -WORKDIR /app -COPY scripts scripts -COPY tensorrt_llm tensorrt_llm -RUN cd tensorrt_llm && python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" -i -c && cd .. - -FROM trt_llm_builder as trt_llm_backend_builder - -WORKDIR /app/ -COPY inflight_batcher_llm inflight_batcher_llm -RUN cd inflight_batcher_llm && bash scripts/build.sh && cd .. 
- -FROM trt_llm_backend_builder as final - -# Install TensorRT-LLM -WORKDIR /app/ -COPY --from=trt_llm_builder /app/tensorrt_llm/build /app/tensorrt_llm/build -RUN cd /app/tensorrt_llm/build && pip3 install *.whl - -# Install TensorRT-LLM backend -RUN mkdir /opt/tritonserver/backends/tensorrtllm -ENV LD_LIBRARY_PATH=/opt/tritonserver/backends/tensorrtllm:${LD_LIBRARY_PATH} -COPY --from=trt_llm_backend_builder /app/inflight_batcher_llm/build/libtriton_tensorrtllm.so /opt/tritonserver/backends/tensorrtllm -COPY --from=trt_llm_backend_builder /app/inflight_batcher_llm/build/libtriton_tensorrtllm_common.so /opt/tritonserver/backends/tensorrtllm -COPY --from=trt_llm_backend_builder /app/inflight_batcher_llm/build/trtllmExecutorWorker /opt/tritonserver/backends/tensorrtllm diff --git a/docs/baichuan.md b/docs/baichuan.md index fef7f5eb..3d383330 100644 --- a/docs/baichuan.md +++ b/docs/baichuan.md @@ -12,7 +12,7 @@ python build.py --model_dir ${HF_BAICHUAN_MODEL} \ --enable_context_fmha \ --use_gemm_plugin float16 \ --output_dir /tmp/baichuan/13B/trt_engines/fp16/1-gpu/ \ - --paged_kv_cache \ + --kv_cache_type paged \ --max_batch_size 64 [11/29/2023-08:20:34] [TRT] [I] Total Host Persistent Memory: 77008 @@ -38,20 +38,20 @@ python build.py --model_dir ${HF_BAICHUAN_MODEL} \ * Prepare configs ```bash -cp all_models/inflight_batcher_llm/ baichuan_ifb -r +cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ baichuan_ifb -r -python3 tools/fill_template.py -i baichuan_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 -python3 tools/fill_template.py -i baichuan_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 -python3 tools/fill_template.py -i baichuan_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False -python3 tools/fill_template.py -i baichuan_ifb/ensemble/config.pbtxt triton_max_batch_size:64 -python3 tools/fill_template.py -i baichuan_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:/tmp/baichuan/13B/trt_engines/fp16/1-gpu/,max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0 -``` +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/tensorrt_llm/config.pbtxt 
triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:/tmp/baichuan/13B/trt_engines/fp16/1-gpu/,max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 +```` * Launch server ```bash pip install SentencePiece -python3 scripts/launch_triton_server.py --world_size 1 --model_repo=baichuan_ifb/ +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=baichuan_ifb/ ``` this setting requires about 35GB @@ -145,7 +145,7 @@ curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What * Send request by `inflight_batcher_llm_client.py` (Remember to add `trust_remote_code=True` in tokenizer of `inflight_batcher_llm_client.py`) ```bash -python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_BAICHUAN_MODEL} +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_BAICHUAN_MODEL} ========= Input sequence: [16448, 677, 5611, 31136, 21309, 4746, 31125, 694, 1033, 653, 8808, 754, 650] @@ -160,7 +160,7 @@ Output sequence: [16814, 677, 5621, 1412, 4514, 678, 2835, 677, 31106, 53, 60, * Run test on dataset ``` -python3 tools/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_trtllm/simple_data.json --max-input-len 500 +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_trtllm/simple_data.json --max-input-len 500 [INFO] Start testing on 13 prompts. [INFO] Functionality test succeed. 
@@ -172,19 +172,19 @@ python3 tools/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_tr * Run with decoupled mode (streaming) ```bash -cp all_models/inflight_batcher_llm/ baichuan_ifb -r +cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ baichuan_ifb -r -python3 tools/fill_template.py -i baichuan_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 -python3 tools/fill_template.py -i baichuan_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 -python3 tools/fill_template.py -i baichuan_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:True -python3 tools/fill_template.py -i baichuan_ifb/ensemble/config.pbtxt triton_max_batch_size:64 -python3 tools/fill_template.py -i baichuan_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:/tmp/baichuan/13B/trt_engines/fp16/1-gpu/,max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:True,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:/tmp/baichuan/13B/trt_engines/fp16/1-gpu/,max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 pip install SentencePiece # please add `trust_remote_code=True` in tokenizer of preprocessing and postprocessing. Considering the security, we don't add it by default. -python3 scripts/launch_triton_server.py --world_size 1 --model_repo=baichuan_ifb/ +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=baichuan_ifb/ -python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_BAICHUAN_MODEL} --streaming +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_BAICHUAN_MODEL} --streaming ```
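+
+As a minimal sketch, the same decoupled deployment can also be exercised over HTTP with Triton's `generate_stream` endpoint (assuming the default ensemble inputs used in this document; the prompt and `max_tokens` below are arbitrary):
+
+```bash
+curl -X POST localhost:8000/v2/models/ensemble/generate_stream \
+    -d '{"text_input": "What is machine learning?", "max_tokens": 64, "bad_words": "", "stop_words": "", "stream": true}'
+```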
diff --git a/docs/build.md b/docs/build.md
new file mode 100644
index 00000000..48d13fb4
--- /dev/null
+++ b/docs/build.md
@@ -0,0 +1,33 @@
+# Building from Source
+
+This document describes how to build the TensorRT-LLM backend and the Triton
+TRT-LLM container from source. The Triton container includes TensorRT-LLM,
+along with the TensorRT-LLM backend and the Python backend.
+
+## Build the TensorRT-LLM Backend from source
+
+Make sure TensorRT-LLM is installed before building the backend. Since the
+versions of TensorRT-LLM and the TensorRT-LLM backend have to be aligned, it is
+recommended to directly use the Triton TRT-LLM container from NGC or build the
+whole container from source as described below in the Build the Docker Container
+section.
+
+```bash
+cd tensorrt_llm/triton_backend/inflight_batcher_llm
+bash scripts/build.sh
+```
+
+## Build the Docker Container
+
+> [!CAUTION]
+> [build.sh](../build.sh) is currently not working and will be fixed in the next weekly update.
+
+#### Build via Docker
+
+You can build the container using the instructions in the [TensorRT-LLM Docker Build](../tensorrt_llm/docker/README.md)
+with the `tritonrelease` stage. Please make sure to add the `CUDA_ARCHS` flag for your GPU; for example, if the compute capability of your GPU is 89:
+
+```bash
+cd tensorrt_llm/
+make -C docker tritonrelease_build CUDA_ARCHS='89-real'
+```
diff --git a/docs/encoder_decoder.md b/docs/encoder_decoder.md
new file mode 100755
index 00000000..40b89cae
--- /dev/null
+++ b/docs/encoder_decoder.md
@@ -0,0 +1,402 @@
+# End to end workflow to run an Encoder-Decoder model
+
+### Support Matrix
+For the specific models supported by the encoder-decoder family, please visit [TensorRT-LLM encoder-decoder examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/enc_dec#encoder-decoder-model-support). The following two model types are supported:
+* T5
+* BART
+
+## Run Encoder-Decoder with Tritonserver
+### Tritonserver setup steps
+
+#### 1. Make sure that you have initialized the TRT-LLM submodule:
+
+```
+ git clone https://github.com/triton-inference-server/tensorrtllm_backend.git && cd tensorrtllm_backend
+ git lfs install
+ git submodule update --init --recursive
+```
+
+#### 2. Start the Triton Server Docker container within the `tensorrtllm_backend` repo:
+
+If you're using the [Triton TRT-LLM NGC container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver/tags):
+
+```
+ # Replace with the version of Triton you want to use. Here using 24.08.
+ # The commands below assume that the current directory is the
+ # TRT-LLM backend root git repository.
+
+ docker run --gpus all --ipc=host --ulimit memlock=-1 --shm-size=20g -v `pwd`:/workspace -w /workspace nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3 bash
+```
+
+If [building your own TensorRT-LLM Backend container](https://github.com/triton-inference-server/tensorrtllm_backend#option-2-build-via-docker), then you can run the `tensorrtllm_backend` container:
+
+```
+ docker run --gpus all --ipc=host --ulimit memlock=-1 --shm-size=20g -v `pwd`:/workspace -w /workspace triton_trt_llm bash
+```
+
+#### 3. Build the engines:
+
+Clone the target model repository from HuggingFace. Here we use the [T5-small model](https://huggingface.co/google-t5/t5-small) as an example, but you can also follow the same steps for a BART model.
+
+```
+ git lfs install
+ git clone https://huggingface.co/google-t5/t5-small /workspace/hf_models/t5-small
+```
+
+Build TensorRT-LLM engines.
+ +``` + export MODEL_NAME=t5-small # or bart-base + export MODEL_TYPE=t5 # or bart + export HF_MODEL_PATH=/workspace/hf_models/${MODEL_NAME} + export UNIFIED_CKPT_PATH=/workspace/ckpt/${MODEL_NAME} + export ENGINE_PATH=/workspace/engines/${MODEL_NAME} + export INFERENCE_PRECISION=float16 + export TP_SIZE=1 + export MAX_BEAM_WIDTH=1 + export MAX_BATCH_SIZE=8 + export INPUT_LEN=1024 + export OUTPUT_LEN=201 + + python3 tensorrt_llm/examples/models/core/enc_dec/convert_checkpoint.py \ + --model_type ${MODEL_TYPE} \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype ${INFERENCE_PRECISION} \ + --tp_size ${TP_SIZE} + + trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH}/encoder \ + --output_dir ${ENGINE_PATH}/encoder \ + --kv_cache_type disabled \ + --moe_plugin disable \ + --max_beam_width ${MAX_BEAM_WIDTH} \ + --max_input_len ${INPUT_LEN} \ + --max_batch_size ${MAX_BATCH_SIZE} \ + --gemm_plugin ${INFERENCE_PRECISION} \ + --bert_attention_plugin ${INFERENCE_PRECISION} \ + --gpt_attention_plugin ${INFERENCE_PRECISION} \ + --context_fmha disable # remove for BART + + trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH}/decoder \ + --output_dir ${ENGINE_PATH}/decoder \ + --moe_plugin disable \ + --max_beam_width ${MAX_BEAM_WIDTH} \ + --max_batch_size ${MAX_BATCH_SIZE} \ + --gemm_plugin ${INFERENCE_PRECISION} \ + --bert_attention_plugin ${INFERENCE_PRECISION} \ + --gpt_attention_plugin ${INFERENCE_PRECISION} \ + --max_input_len 1 \ + --max_encoder_input_len ${INPUT_LEN} \ + --max_seq_len ${OUTPUT_LEN} \ + --context_fmha disable # remove for BART +``` + +> **NOTE** +> +> If you want to build multi-GPU engine using Tensor Parallelism then you can set `--tp_size` in convert_checkpoint.py. For example, for TP=2 on 2-GPU you can set `--tp_size=2`. If you want to use beam search then set `--max_beam_width` to higher value than 1. The `--max_input_len` in encoder trtllm-build controls the model input length and should be same as `--max_encoder_input_len` in decoder trtllm-build. Additionally, to control the model output len you should set `--max_seq_len` in decoder trtllm-build to `desired output length + 1`. It is also advisable to tune [`--max_num_tokens`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-best-practices.md#max_num_tokens) as the default value of 8192 might be too large or too small depending on your input, output len and use-cases. For BART family models, make sure to remove `--context_fmha disable` from both encoder and decoder trtllm-build commands. Please refer to [TensorRT-LLM enc-dec example](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/enc_dec#build-tensorrt-engines) for more details. + +#### 4. 
Prepare Tritonserver configs + +``` + cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ enc_dec_ifb -r + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:False,max_beam_width:${MAX_BEAM_WIDTH},engine_dir:${ENGINE_PATH}/decoder,encoder_engine_dir:${ENGINE_PATH}/encoder,kv_cache_free_gpu_mem_fraction:0.8,cross_kv_cache_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,enable_chunked_context:False,max_queue_size:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},preprocessing_instance_count:1 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},postprocessing_instance_count:1 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/ensemble/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 + +``` + +> **NOTE** +> +> Currently, encoder-decoder models don't support running with chunked context. + +#### 5. Launch Tritonserver + +``` +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=enc_dec_ifb/ +``` + +### Send requests +#### 1. Send request with CURL + +``` +curl -X POST localhost:8000/v2/models/ensemble/generate -d "{\"text_input\": \"Summarize the following news article: (CNN)Following last year's successful U.K. tour, Prince and 3rdEyeGirl are bringing the Hit & Run Tour to the U.S. for the first time. The first -- and so far only -- scheduled show will take place in Louisville, Kentucky, the hometown of 3rdEyeGirl drummer Hannah Welton. Slated for March 14, tickets will go on sale Monday, March 9 at 10 a.m. local time. Prince crowns dual rock charts . A venue has yet to be announced. When the Hit & Run worked its way through the U.K. in 2014, concert venues were revealed via Twitter prior to each show. Portions of the ticket sales will be donated to various Louisville charities. See the original story at Billboard.com. ©2015 Billboard. All Rights Reserved.\", \"max_tokens\": 1024, \"bad_words\": \"\", \"stop_words\": \"\"}" + + {"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":0.0,"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":":::: (CNN): (CNN): (CNN) the Hit & Run Tour to the U.S. for the first time. the Hit & Run Tour will take place in Louisville, Kentucky, the hometown of 3rdEyeGirl drummer Hannah Welton. Tickets will go on sale Monday, March 9 at 10 a.m. local time."} +``` + +#### 2. Send request with `bad_words` and `stop_words` + +After applying the `stop_words` and `bad_words`, the output avoids the bad words and stops at the first generated stop word. 
+ +``` +curl -X POST localhost:8000/v2/models/ensemble/generate -d "{\"text_input\": \"Summarize the following news article: (CNN)Following last year's successful U.K. tour, Prince and 3rdEyeGirl are bringing the Hit & Run Tour to the U.S. for the first time. The first -- and so far only -- scheduled show will take place in Louisville, Kentucky, the hometown of 3rdEyeGirl drummer Hannah Welton. Slated for March 14, tickets will go on sale Monday, March 9 at 10 a.m. local time. Prince crowns dual rock charts . A venue has yet to be announced. When the Hit & Run worked its way through the U.K. in 2014, concert venues were revealed via Twitter prior to each show. Portions of the ticket sales will be donated to various Louisville charities. See the original story at Billboard.com. ©2015 Billboard. All Rights Reserved.\", \"max_tokens\": 1024, \"bad_words\": [\"drummer\", \"hometown\"], \"stop_words\": [\"Tickets\", \"sale\"]}" + + {"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":0.0,"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":":::: (CNN): (CNN): (CNN) the Hit & Run Tour to the U.S. for the first time. the Hit & Run Tour will take place in Louisville, Kentucky, the home of 3rdEyeGirl's Hannah Welton."} +``` + +#### 3. Send request by `inflight_batcher_llm_client.py` +If not already installed, install `tritonclient` + +``` + pip install tritonclient[all] + python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py --text "translate English to German: This is good" --request-output-len 200 --exclude-input-in-output --tokenizer-dir ${HF_MODEL_PATH} --beam-width ${MAX_BEAM_WIDTH} + + ======== + Using pad_id: 0 + Using end_id: 1 + Input sequence: [13959, 1566, 12, 2968, 10, 100, 19, 207, 1] + [TensorRT-LLM][WARNING] decoder_input_ids is not present in the request for encoder-decoder model. The decoder input tokens will be set to [padId] + Got completed request + Input: translate English to German: This is good + Output beam 0: Das is gut. + Output sequence: [644, 229, 1806, 5] +``` + +> **NOTE** +> +> Please ignore any exception thrown with the output. It's a known issue to be fixed. + +#### 4. Run test on dataset + +``` + python3 tensorrt_llm/triton_backend/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 + + [INFO] Start testing on 13 prompts. + [INFO] Functionality test succeed. + [INFO] Warm up for benchmarking. + [INFO] Start benchmarking on 13 prompts. + [INFO] Total Latency: 155.756 ms +``` + +#### 5. Run several requests at the same time + +``` +echo "{\"text_input\": \"Summarize the following news article: (CNN)Following last year's successful U.K. tour, Prince and 3rdEyeGirl are bringing the Hit & Run Tour to the U.S. for the first time. The first -- and so far only -- scheduled show will take place in Louisville, Kentucky, the hometown of 3rdEyeGirl drummer Hannah Welton. Slated for March 14, tickets will go on sale Monday, March 9 at 10 a.m. local time. Prince crowns dual rock charts . A venue has yet to be announced. When the Hit & Run worked its way through the U.K. in 2014, concert venues were revealed via Twitter prior to each show. Portions of the ticket sales will be donated to various Louisville charities. See the original story at Billboard.com. ©2015 Billboard. 
All Rights Reserved.\", \"max_tokens\": 1024, \"bad_words\": [\"drummer\", \"hometown\"], \"stop_words\": [\"Tickets\", \"sale\"]}" > tmp.txt + +printf '%s\n' {1..20} | xargs -I % -P 20 curl -X POST localhost:8000/v2/models/ensemble/generate -d @tmp.txt +``` +#### 6. Evaluating performance with Gen-AI Perf + +Gen-AI Perf is a command line tool for measuring the throughput and latency of generative AI models as served through an inference server. You can read more about installing Gen-AI Perf [here](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/genai-perf/README.html#installation). + +To use Gen-AI Perf, run the following command: + +``` +genai-perf profile \ + -m ensemble \ + --service-kind triton \ + --backend tensorrtllm \ + --num-prompts 100 \ + --random-seed 123 \ + --synthetic-input-tokens-mean 200 \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean 100 \ + --output-tokens-stddev 0 \ + --tokenizer ${HF_MODEL_PATH} \ + --concurrency 1 \ + --measurement-interval 4000 \ + --profile-export-file my_profile_export.json \ + --url localhost:8001 +``` + +You should expect an output that looks like this (the output below was obtained on A100-80GB with TRT-LLM v0.12): + +``` LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━┩ +│ Request latency (ms) │ 80.92 │ 78.84 │ 323.55 │ 85.14 │ 79.90 │ 79.64 │ +│ Output sequence length │ 95.83 │ 65.00 │ 100.00 │ 100.00 │ 99.00 │ 98.00 │ +│ Input sequence length │ 200.01 │ 200.00 │ 201.00 │ 200.00 │ 200.00 │ 200.00 │ +└────────────────────────┴────────┴────────┴────────┴────────┴────────┴────────┘ +Output token throughput (per sec): 1182.70 +Request throughput (per sec): 12.34 +``` + +#### 7. Run with decoupled mode (streaming) + +To enable streaming, we set `decoupled_mode:True` in config.pbtxt of `tensorrt_llm` and `tensorrt_llm_bls` model (if you are using BLS instead of ensemble). 
+ +``` + cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ enc_dec_ifb -r + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:True,max_beam_width:${MAX_BEAM_WIDTH},engine_dir:${ENGINE_PATH}/decoder,encoder_engine_dir:${ENGINE_PATH}/encoder,kv_cache_free_gpu_mem_fraction:0.8,cross_kv_cache_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,enable_chunked_context:False,max_queue_size:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},preprocessing_instance_count:1 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},postprocessing_instance_count:1 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/ensemble/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:True,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 + +``` + +We launch Tritonserver + +``` +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=enc_dec_ifb/ +``` + +Then send request by `inflight_batcher_llm_client.py` + +``` +pip install tritonclient[all] +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py --text "translate English to German: This is good" --request-output-len 200 --exclude-input-in-output --tokenizer-dir ${HF_MODEL_PATH} --beam-width ${MAX_BEAM_WIDTH} --streaming +``` + +To use Gen-AI Perf to benchmark streaming/decoupled mode, run the following command: + +``` +genai-perf profile \ + -m ensemble \ + --service-kind triton \ + --backend tensorrtllm \ + --num-prompts 100 \ + --random-seed 123 \ + --synthetic-input-tokens-mean 200 \ + --synthetic-input-tokens-stddev 0 \ + --streaming \ + --output-tokens-mean 100 \ + --output-tokens-stddev 0 \ + --tokenizer ${HF_MODEL_PATH} \ + --concurrency 1 \ + --measurement-interval 4000 \ + --profile-export-file my_profile_export.json \ + --url localhost:8001 +``` + +You should see output like this (the output below was obtained on A100-80GB with TRT-LLM v0.12) + +``` + LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━┩ +│ Time to first token (ms) │ 4.69 │ 3.99 │ 14.05 │ 5.70 │ 5.04 │ 4.76 │ +│ Inter token latency (ms) │ 0.63 │ 0.38 │ 1.04 │ 0.98 │ 0.70 │ 0.66 │ +│ Request latency (ms) │ 75.32 │ 46.34 │ 114.27 │ 90.35 │ 79.27 │ 79.11 │ +│ Output sequence length │ 116.50 │ 58.00 │ 197.00 │ 197.00 │ 132.00 │ 128.00 │ +│ Input sequence length │ 200.01 │ 200.00 │ 201.00 │ 200.10 │ 200.00 │ 200.00 │ +└──────────────────────────┴────────┴────────┴────────┴────────┴────────┴────────┘ +Output token throughput (per sec): 1542.81 +Request throughput (per sec): 13.24 +``` + +## Running multiple instances of encoder-decoder model on multiple GPUs + 
+In this section, we demonstrate how you can use
+[Leader Mode](../README.md#leader-mode) for running multiple instances of an encoder-decoder model on different GPUs.
+
+For this section, let's assume that we have four GPUs and the CUDA device ids
+are 0, 1, 2, and 3. We will be launching two instances of the T5-small model
+with tensor parallelism 2 (TP=2). The first instance will run on GPUs 0 and 1
+and the second instance will run on GPUs 2 and 3. We will launch two separate `mpirun` commands to launch two separate Triton servers, one for each model instance (4 Triton Server processes in total, one per GPU). We also need to use a reverse proxy in front of them to load balance the requests between the servers.
+
+[Orchestrator Mode](../README.md#orchestrator-mode) is currently not supported.
+
+
+### Triton setup steps
+1. Build the model, but add `--tp_size 2` when converting checkpoints. The rest of the steps are the same as [Tritonserver Setup
+](#tritonserver-setup-steps).
+
+```
+ export MODEL_NAME=t5-small
+ export MODEL_TYPE=t5 # or bart
+ export HF_MODEL_PATH=/workspace/hf_models/${MODEL_NAME}
+ export UNIFIED_CKPT_PATH=/workspace/ckpt/${MODEL_NAME}-2tp-2gpu
+ export ENGINE_PATH=/workspace/engines/${MODEL_NAME}-2tp-2gpu
+
+ python tensorrt_llm/examples/models/core/enc_dec/convert_checkpoint.py \
+ --model_type ${MODEL_TYPE} \
+ --model_dir ${HF_MODEL_PATH} \
+ --output_dir ${UNIFIED_CKPT_PATH} \
+ --dtype float16 \
+ --tp_size 2
+
+ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH}/encoder \
+ --output_dir ${ENGINE_PATH}/encoder \
+ --kv_cache_type disabled \
+ --moe_plugin disable \
+ --max_batch_size 64 \
+ --gemm_plugin float16 \
+ --bert_attention_plugin float16 \
+ --gpt_attention_plugin float16 \
+ --max_input_len 2048 \
+ --context_fmha disable
+
+ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH}/decoder \
+ --output_dir ${ENGINE_PATH}/decoder \
+ --moe_plugin disable \
+ --max_batch_size 64 \
+ --gemm_plugin float16 \
+ --bert_attention_plugin float16 \
+ --gpt_attention_plugin float16 \
+ --context_fmha disable \
+ --max_input_len 1 \
+ --max_encoder_input_len 2048
+```
+
+2. Setup Tritonserver config with the same commands in [step 4](#prepare-tritonserver-configs) above.
+
+3. Launch the servers:
+
+```
+ CUDA_VISIBLE_DEVICES=0,1 python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=enc_dec_ifb/ --http_port 8000 --grpc_port 8001 --metrics_port 8004
+ CUDA_VISIBLE_DEVICES=2,3 python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=enc_dec_ifb/ --http_port 8002 --grpc_port 8003 --metrics_port 8005
+```
+
+4. Install NGINX:
+
+```
+ apt update
+ apt install nginx -y
+```
+
+5. Setup the NGINX configuration and store it in `/etc/nginx/sites-available/tritonserver`:
+
+```
+ upstream tritonserver {
+ server localhost:8000;
+ server localhost:8002;
+ }
+
+ server {
+ listen 8080;
+
+ location / {
+ proxy_pass http://tritonserver;
+ }
+ }
+```
+
+6. Create a symlink and restart NGINX to enable the configuration:
+
+```
+ ln -s /etc/nginx/sites-available/tritonserver /etc/nginx/sites-enabled/tritonserver
+ service nginx restart
+```
+
+### Send the request
+
+1.
Run test on dataset + +``` + # Test the load on all the servers + python3 tensorrt_llm/triton_backend/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8080 + + # Test the load on one of the servers + python3 tensorrt_llm/triton_backend/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8000 +``` + +### Kill the server +``` +pgrep mpirun | xargs kill +``` diff --git a/docs/gemma.md b/docs/gemma.md index fed782ae..f8959ec6 100644 --- a/docs/gemma.md +++ b/docs/gemma.md @@ -14,20 +14,20 @@ ENGINE_PATH=/tmp/gemma/2B/bf16/1-gpu/ Note that we use `tokenizer_type=sp` (sentencepiece) tokenizer. ```bash -cp all_models/inflight_batcher_llm/ gemma -r +cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ gemma -r -python3 tools/fill_template.py -i gemma/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},tokenizer_type:sp,triton_max_batch_size:64,preprocessing_instance_count:1,add_special_tokens:True -python3 tools/fill_template.py -i gemma/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},tokenizer_type:sp,triton_max_batch_size:64,postprocessing_instance_count:1 -python3 tools/fill_template.py -i gemma/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False -python3 tools/fill_template.py -i gemma/ensemble/config.pbtxt triton_max_batch_size:64 -python3 tools/fill_template.py -i gemma/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,batch_scheduler_policy:guaranteed_no_evict +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},tokenizer_type:sp,triton_max_batch_size:64,preprocessing_instance_count:1,add_special_tokens:True +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},tokenizer_type:sp,triton_max_batch_size:64,postprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,batch_scheduler_policy:guaranteed_no_evict,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 ``` * Launch server ```bash -python3 scripts/launch_triton_server.py --world_size 1 --model_repo=gemma/ +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=gemma/ ``` diff 
--git a/docs/guided_decoding.md b/docs/guided_decoding.md new file mode 100644 index 00000000..90854451 --- /dev/null +++ b/docs/guided_decoding.md @@ -0,0 +1,128 @@ +# End-to-End Workflow for Guided Decoding with TensorRT-LLM Backend + +This document outlines the process for running guided decoding using the TensorRT-LLM backend. Guided decoding ensures that generated outputs adhere to specified formats, such as JSON. Currently, this feature is supported through the [XGrammar](https://github.com/mlc-ai/xgrammar) backend. + +For more information, refer to the [guided decoding documentation](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/executor.md#structured-output-with-guided-decoding) from TensorRT-LLM. Additionally, you can explore another example of [guided decoding + LLM API example](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_guided_decoding.html). + +## Overview of Guided Decoding +Guided decoding ensures that generated outputs conform to specific constraints or formats. Supported guide types include: +- **None**: No constraints. +- **JSON**: Outputs in JSON format. +- **JSON Schema**: JSON format with schema validation. +- **Regex**: Outputs matching a regular expression. +- **EBNF Grammar**: Outputs adhering to extended Backus-Naur form (EBNF) grammar rules. + +# Build TensorRT-LLM engine and launch Tritonserver + +From this point, we assume you installed all requirements for tensorrtllm_backend. You can refer to [build.md](build.md) for installation and docker launch. + +## Build TensorRT-LLM engine +```bash +# Clone model from Hugging Face +export MODEL_NAME=TinyLlama-1.1B-Chat-v1.0 +git clone https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0 hf_models/${MODEL_NAME} + +export HF_MODEL_PATH=hf_models/${MODEL_NAME} +export UNIFIED_CKPT_PATH=trt_ckpts/tiny_llama_1b/1-gpu/fp16 +export ENGINE_PATH=trt_engines/tiny_llama_1b/1-gpu/fp16 + +python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + +trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --remove_input_padding enable \ + --gpt_attention_plugin float16 \ + --context_fmha enable \ + --gemm_plugin float16 \ + --output_dir ${ENGINE_PATH} \ + --kv_cache_type paged \ + --max_batch_size 64 +``` +## Launch Tritonserver + +## Python Backend +```bash +export GUIDED_DECODING_BACKEND=xgrammar +export TRITON_BACKEND=python + +cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ llama_ifb -r + +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:64,preprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:64,postprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt 
triton_backend:${TRITON_BACKEND},triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,tokenizer_dir:${HF_MODEL_PATH},guided_decoding_backend:${GUIDED_DECODING_BACKEND}
+
+python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/
+```
+
+## C++ Backend
+To run with `TRITON_BACKEND=tensorrtllm` (the C++ backend), you need an extra step to extract the tokenizer information into JSON format. `generate_xgrammar_tokenizer_info.py` creates `xgrammar_tokenizer_info.json` under the given `--output_dir`, and we then fill the `xgrammar_tokenizer_info_path` parameter in `tensorrt_llm/config.pbtxt`.
+```bash
+export XGRAMMAR_TOKENIZER_INFO_DIR=tokenizer_info/${MODEL_NAME}
+
+python3 tensorrt_llm/examples/generate_xgrammar_tokenizer_info.py --model_dir ${HF_MODEL_PATH} --output_dir ${XGRAMMAR_TOKENIZER_INFO_DIR}
+
+export XGRAMMAR_TOKENIZER_INFO_PATH=tokenizer_info/${MODEL_NAME}/xgrammar_tokenizer_info.json
+export GUIDED_DECODING_BACKEND=xgrammar
+export TRITON_BACKEND=tensorrtllm
+
+cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ llama_ifb -r
+
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:64,preprocessing_instance_count:1
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:64,postprocessing_instance_count:1
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:${TRITON_BACKEND},triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,guided_decoding_backend:${GUIDED_DECODING_BACKEND},xgrammar_tokenizer_info_path:${XGRAMMAR_TOKENIZER_INFO_PATH}
+
+python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/
+```
+# Sending Guided Decoding Requests
+
+Use the provided gRPC client to send requests with different guide types.
+```bash
+# Set the prompt
+PROMPT="What is the year after 2024? Answer:"
+
+# 0. Guide type: None
+python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble
+
+# Output:
+# 0: 2025
+#
+# Question 3: What is the year after 2025? Answer: 2026
+#
+
+# 1.
Guide type: json +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble --guided-decoding-guide-type json + +# Output: +# 0: [2025] + +# 2. Guide type: json_schema +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble --guided-decoding-guide-type json_schema --guided-decoding-guide '{"properties": {"answer": {"title": "Answer", "type": "integer"}}, "required": ["answer"], "title": "Answer", "type": "object"}' + +# Output: +# 0: {"answer": 2026} + +# 3. Guide type: regex +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble --guided-decoding-guide-type regex --guided-decoding-guide '\d+' + +# Output: +# 0: 2025 + +# 4. Guide type: ebnf_grammar +python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble --guided-decoding-guide-type ebnf_grammar --guided-decoding-guide 'root ::= [0-9]+' + +# Output: +# 0: 2025 +``` + +Use curl method to send requests +```bash +curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is the year after 2024? Answer:", "max_tokens": 20, "bad_words": "", "stop_words": "", "pad_id": 2, "end_id": 2, "guided_decoding_guide_type":"json"}' + +# Output: +# {"model_name":"ensemble","model_version":"1","sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"[2025]"} +``` diff --git a/docs/llama.md b/docs/llama.md index d1ae08ed..c67c31b3 100644 --- a/docs/llama.md +++ b/docs/llama.md @@ -36,7 +36,7 @@ docker run --rm -ti -v `pwd`:/mnt -w /mnt -v ~/.cache/huggingface:~/.cache/huggi export HF_LLAMA_MODEL=`python3 -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('meta-llama/Llama-2-7b-hf', filename='config.json')).parent)"` export UNIFIED_CKPT_PATH=/tmp/ckpt/llama/7b/ export ENGINE_PATH=/tmp/engines/llama/7b/ -python tensorrt_llm/examples/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \ +python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \ --output_dir ${UNIFIED_CKPT_PATH} \ --dtype float16 @@ -46,27 +46,27 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ --context_fmha enable \ --gemm_plugin float16 \ --output_dir ${ENGINE_PATH} \ - --paged_kv_cache enable \ + --kv_cache_type paged \ --max_batch_size 64 ``` * Prepare configs ```bash -cp all_models/inflight_batcher_llm/ llama_ifb -r +cp tensorrt_llm/triton_backend/ci/all_models/inflight_batcher_llm/ llama_ifb -r -python3 tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 -python3 tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 -python3 tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False -python3 tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64 -python3 tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt 
triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 ``` * Launch server ```bash pip install SentencePiece -python3 scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/ +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/ ``` this setting requires about 25GB @@ -114,7 +114,7 @@ curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What * Send request by `inflight_batcher_llm_client.py` ```bash -python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_LLAMA_MODEL} +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_LLAMA_MODEL} ========= [[1, 19298, 297, 6641, 29899, 23027, 3444, 29892, 1105, 7598, 16370, 408, 263]] @@ -128,7 +128,7 @@ output_ids = [14547, 297, 3681, 322, 4517, 1434, 8401, 304, 1570, 3088, 297, 29 * Run test on dataset ``` -python3 tools/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_trtllm/simple_data.json --max-input-len 500 +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 [INFO] Start testing on 13 prompts. [INFO] Functionality test succeed. 
@@ -142,18 +142,18 @@ python3 tools/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_tr
 
 * Run with decoupled mode (streaming)
 
 ```bash
-cp all_models/inflight_batcher_llm/ llama_ifb -r
+cp tensorrt_llm/triton_backend/ci/all_models/inflight_batcher_llm/ llama_ifb -r
 
-python3 tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1
-python3 tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
-python3 tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:True
-python3 tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64
-python3 tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_batching,max_queue_delay_microseconds:0
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:True,logits_datatype:TYPE_FP32
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
 
 pip install SentencePiece
-python3 scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/
+python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/
 
-python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_LLAMA_MODEL} --streaming
+python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_LLAMA_MODEL} --streaming
 ```
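+
+Before benchmarking the decoupled setup, it can help to confirm that the server and all composing models came up correctly. This is a small sketch using Triton's standard HTTP health and model-repository endpoints (ports as configured above):
+
+```bash
+# Wait until the server reports ready, then list the loaded models
+curl -sf localhost:8000/v2/health/ready && echo "server ready"
+curl -s -X POST localhost:8000/v2/repository/index | python3 -m json.tool
+```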
diff --git a/docs/llama_multi_instance.md b/docs/llama_multi_instance.md index f7dbd3b9..5dce2a91 100644 --- a/docs/llama_multi_instance.md +++ b/docs/llama_multi_instance.md @@ -74,7 +74,7 @@ same GPU. 4. Run the test client to measure performance: ```bash -python3 tools/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_trtllm/simple_data.json --max-input-len 500 +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 ``` If you plan to use the BLS version instead of the ensemble model, you might also @@ -111,7 +111,7 @@ export UNIFIED_CKPT_PATH=/tmp/ckpt/llama/7b-2tp-2gpu/ export ENGINE_PATH=/tmp/engines/llama/7b-2tp-2gpu/ # Create the checkpoint -python tensorrt_llm/examples/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \ +python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \ --output_dir ${UNIFIED_CKPT_PATH} \ --dtype float16 \ --tp_size 2 @@ -123,7 +123,7 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ --context_fmha enable \ --gemm_plugin float16 \ --output_dir ${ENGINE_PATH} \ - --paged_kv_cache enable \ + --kv_cache_type paged \ --max_batch_size 64 ``` @@ -131,13 +131,13 @@ trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ ```bash # Setup the model repository for the first instance. -cp all_models/inflight_batcher_llm/ llama_ifb -r +cp tensorrt_llm/triton_backend/ci/all_models/inflight_batcher_llm/ llama_ifb -r -python3 tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 -python3 tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 -python3 tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False -python3 tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64 -python3 tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32 +python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt 
triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 ``` ### Leader Mode @@ -150,8 +150,8 @@ between the servers. 3a. Launch the servers: ```bash -CUDA_VISIBLE_DEVICES=0,1 python3 scripts/launch_triton_server.py --world_size 2 --model_repo=llama_ifb/ --http_port 8000 --grpc_port 8001 --metrics_port 8004 -CUDA_VISIBLE_DEVICES=2,3 python3 scripts/launch_triton_server.py --world_size 2 --model_repo=llama_ifb/ --http_port 8002 --grpc_port 8003 --metrics_port 8005 +CUDA_VISIBLE_DEVICES=0,1 python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=llama_ifb/ --http_port 8000 --grpc_port 8001 --metrics_port 8004 +CUDA_VISIBLE_DEVICES=2,3 python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=llama_ifb/ --http_port 8002 --grpc_port 8003 --metrics_port 8005 ``` 4a. Install NGINX: @@ -191,10 +191,10 @@ service nginx restart pip3 install tritonclient[all] # Test the load on all the servers -python3 tools/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8080 +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8080 # Test the load on one of the servers -python3 tools/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8000 +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8000 ``` 8a. Kill the server: @@ -205,10 +205,14 @@ pgrep mpirun | xargs kill ### Orchestrator Mode -In this mode, we will create a copy of the TensorRT-LLM model and use the -`gpu_device_ids` field to specify which GPU should be used by each model -instance. Then, we need to modify the client to distribute the requests between -different models. +With orchestrator mode, there are two options for running multiple instances +of a single model: + +1. Creating separate Triton models + +2. Starting from the 24.08 release, you can use Triton `instance_group` field to specify the number TRT-LLM model instances. With that option, the load balancing decision will be done in Triton core. + +#### 1. Creating Separate Triton Models 3b. Create a copy of the `tensorrt_llm` model: @@ -234,7 +238,22 @@ sed -i 's/name: "tensorrt_llm"/name: "tensorrt_llm_2"/g' llama_ifb/tensorrt_llm_ 5b. Launch the server: ```bash -python3 scripts/launch_triton_server.py --multi-model --model_repo=llama_ifb/ +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --multi-model --model_repo=llama_ifb/ +``` + +Alternatively, you can start all MPI ranks at once and avoid dynamic process spawning +by using the `--disable-spawn-processes`. 
The config file must specify which ranks each +model should use: + +```bash +sed -i 's/\${participant_ids}/1,2/g' llama_ifb/tensorrt_llm/config.pbtxt +sed -i 's/\${participant_ids}/3,4/g' llama_ifb/tensorrt_llm_2/config.pbtxt +``` + +Note that rank 0 is reserved for the orchestrator rank. + +```bash +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --multi-model --model_repo=llama_ifb/ --disable-spawn-processes --world_size=5 ``` 6b. Run the test client to measure performance: @@ -243,11 +262,11 @@ python3 scripts/launch_triton_server.py --multi-model --model_repo=llama_ifb/ pip3 install tritonclient[all] # We will only benchmark the core tensorrtllm models. -python3 tools/inflight_batcher_llm/benchmark_core_model.py --max-input-len 500 \ - dataset --dataset ci/L0_backend_trtllm/simple_data.json \ - --tokenizer-dir $HF_LLAMA_MODEL \ - --tesnorrt-llm-model-name tensorrtllm \ - --tensorrt-llm-model-name tensorrtllm_2 +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/benchmark_core_model.py --max-input-len 500 \ + --tensorrt-llm-model-name tensorrt_llm \ + --tensorrt-llm-model-name tensorrt_llm_2 \ + dataset --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json \ + --tokenizer-dir $HF_LLAMA_MODEL ``` 7b. Kill the server: @@ -256,14 +275,52 @@ python3 tools/inflight_batcher_llm/benchmark_core_model.py --max-input-len 500 \ pgrep mpirun | xargs kill ``` +#### 2. Using Triton Core's Load Balancing + +In order to use Triton core's load balancing for multiple instances, you can +increase the number of instances in the `instance_group` field and use the +`gpu_device_ids` parameter to specify which GPUs will be used by each model +instance. + +For example, if you're running a TP=2 model on a 4-GPU system and you want +to run one instance on GPUs 0 and 1 and the other instance on GPUs 2 and 3, +you can use the following model configuration: + +``` +instance_group [ + {kind: KIND_CPU, count: 2} +] + +parameters: { + key: "gpu_device_ids" + value: { + string_value: "0,1;2,3" + } +} +``` + +Please note that the number of set of GPU device ids must equal the number of instances. + ### Orchestrator Mode vs Leader Mode Summary The table below summarizes the differences between the orchestrator mode and leader mode: -| | Orchestrator Mode | Leader Mode | -| ----------------------------------| :----------------: | :----------:| -| Multi-node Support | ❌ | ✅ | -| Requires Reverse Proxy | ❌ | ✅ | -| Requires Client Changes | ✅ | ❌ | -| Requires `MPI_Comm_Spawn` Support | ✅ | ❌ | +| | Orchestrator Mode (Separate Models) | Orchestrator Mode (Triton Load Balancing) |Leader Mode | +| ----------------------------------| :----------------: | :----------------: |:----------:| +| Requires Reverse Proxy | ❌ | ❌ | ✅ | +| Requires Client Changes | ✅ | ❌ | ❌ | + +Orchestrator mode by default uses `MPI_Comm_Spawn` to create the child +processes. If `MPI_Comm_Spawn` is used, it is not possible to distribute +the model across multiple nodes. + +It is also possible to use orchestrator mode with MPI processes that have been +pre-spawned. In order to do that, you need to set `--disable-spawn-processes` +when using the [launch_triton_server.py](../scripts/launch_triton_server.py) +script or `export TRTLLM_ORCHESTRATOR_SPAWN_PROCESSES=0`. In this mode, +it is possible to run the server across different nodes in orchestrator mode. 
+ +In order to use the orchestrator mode itself, you need to set the `--multi-model` +flag when using the [launch_triton_server.py](../scripts/launch_triton_server.py) +script or `export TRTLLM_ORCHESTRATOR=1`. diff --git a/docs/llmapi.md b/docs/llmapi.md new file mode 100644 index 00000000..b9ee16da --- /dev/null +++ b/docs/llmapi.md @@ -0,0 +1,109 @@ +## End to end workflow to use the pytorch LLMAPI workflow + +* Start the Triton Server Docker container: + +```bash +# Replace with the version of Triton you want to use. +# The command below assumes the the current directory is the +# TRT-LLM backend root git repository. + +docker run --rm -ti -v `pwd`:/mnt -w /mnt -v ~/.cache/huggingface:~/.cache/huggingface --gpus all nvcr.io/nvidia/tritonserver:\-trtllm-python-py3 bash +``` + +* Prepare config + +```bash + cp -R tensorrt_llm/triton_backend/all_models/llmapi/ llmapi_repo/ +``` + +Edit `llmapi_repo/tensorrt_llm/1/model.yaml` to change the model. You can either use a HuggingFace path or a local path. The following is based on `meta-llama/Llama-3.1-8B`. + +This configuration file also allows you to enable CUDA graphs support and set pipeline parallelism and tensor parallelism sizes. + +* Launch server + +```bash +python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --model_repo=llmapi_repo/ +``` + +* Send request + +```bash +curl -X POST localhost:8000/v2/models/tensorrt_llm/generate -d '{"text_input": "The future of AI is", "sampling_param_max_tokens":10}' | jq +``` + +* Optional: include performance metrics + +To retrieve detailed performance metrics per request such as KV cache usage, timing breakdowns, and speculative decoding statistics - add `"sampling_param_return_perf_metrics": true` to your request payload: + +```bash +curl -X POST localhost:8000/v2/models/tensorrt_llm/generate -d '{"text_input": "Please explain to me what is machine learning?", "sampling_param_max_tokens":10, "sampling_param_return_perf_metrics":true}' | jq +``` + +Sample response with performance metrics +```json +{ + "acceptance_rate": "0.0", + "arrival_time_ns": "76735247746000", + "first_scheduled_time_ns": "76735248284000", + "first_token_time_ns": "76735374300000", + "kv_cache_alloc_new_blocks": "1", + "kv_cache_alloc_total_blocks": "1", + "kv_cache_hit_rate": "0.0", + "kv_cache_missed_block": "1", + "kv_cache_reused_block": "0", + "last_token_time_ns": "76736545324000", + "model_name": "tensorrt_llm", + "model_version": "1", + "text_output": "Please explain to me what is machine learning? \n\nMachine learning is a field of computer science that involves the development of algorithms and models that can learn from data without being explicitly programmed. It is a", + "total_accepted_draft_tokens": "0", + "total_draft_tokens": "0" +} +``` + +`inflight_batcher_llm_client.py` is not supported yet. + +* Run test on dataset + +```bash +python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 --test-llmapi --model-name tensorrt_llm + +[INFO] Start testing on 13 prompts. +[INFO] Functionality test succeeded. +[INFO] Warm up for benchmarking. +FLAGS.model_name: tensorrt_llm +[INFO] Start benchmarking on 13 prompts. 
+[INFO] Total Latency: 377.254 ms +``` + +* Run benchmark + +```bash + python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/benchmark_core_model.py --max-input-len 500 \ + --tensorrt-llm-model-name tensorrt_llm \ + --test-llmapi \ + dataset --dataset ./tensorrt_llm/triton_backend/tools/dataset/mini_cnn_eval.json \ + --tokenizer-dir meta-llama/Llama-3.1-8B + +dataset +Tokenizer: Tokens per word = 1.308 +[INFO] Warm up for benchmarking. +[INFO] Start benchmarking on 39 prompts. +[INFO] Total Latency: 1446.623 ms +``` + +### Start the server on a multi-node configuration + +The `srun` tool can be used to start the server in a multi-node environment: + +``` +srun -N 2 \ + --ntasks-per-node=8 \ + --mpi=pmix \ + --container-image= \ + --container-mounts=$(pwd)/tensorrt_llm/:/code \ + trtllm-llmapi-launch /opt/tritonserver/bin/tritonserver --model-repository llmapi_repo + +``` + +Note: inter-node tensor parallelism is not yet supported. diff --git a/docs/lora.md b/docs/lora.md new file mode 100644 index 00000000..28452556 --- /dev/null +++ b/docs/lora.md @@ -0,0 +1,269 @@ +# Running LoRA inference with inflight batching + +Below is an example of how to run LoRA inference with inflight batching. See the +[LoRA documentation](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/lora.md) +in the TensorRT-LLM repository for more information about running gpt-2b with +LoRA using inflight batching. + +## Launch Triton TensorRT-LLM container + +```bash +docker run --rm -it --net host --shm-size=2g \ + --ulimit memlock=-1 --ulimit stack=67108864 --gpus all \ + -v :/tensorrtllm_backend \ + -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \ + -v :/engines \ + nvcr.io/nvidia/tritonserver:-trtllm-python-py3 +``` + +## Prepare TensorRT-LLM engines with LoRA enable + +(Optional) Download the LLaMa model from HuggingFace if you haven't already. + +```bash +huggingface-cli login +huggingface-cli download meta-llama/Llama-2-7b-hf +``` + +> **NOTE** +> +> Make sure that you have access to https://huggingface.co/meta-llama/Llama-2-7b-hf. + +```bash +cd /tensorrtllm_backend/tensorrt_llm/examples/models/core/llama +BASE_LLAMA_MODEL=/path/to/llama-7b-hf + +python3 convert_checkpoint.py --model_dir ${BASE_LLAMA_MODEL} \ + --output_dir ./c-model/llama/fp16/1-gpu \ + --dtype float16 + +trtllm-build --checkpoint_dir ./c-model/llama/fp16/1-gpu \ + --output_dir /engines/llama_7b_with_lora_qkv/fp16/1-gpu \ + --gemm_plugin float16 \ + --max_batch_size 8 \ + --max_seq_len 562 \ + --gpt_attention_plugin float16 \ + --kv_cache_type paged \ + --remove_input_padding enable \ + --use_paged_context_fmha enable \ + --lora_plugin float16 \ + --lora_target_modules attn_q attn_k attn_v \ + --max_lora_rank 8 +``` + +Note that you still need to use `hf_lora_convert.py` to convert the lora weights and store in `/tmp/lora_prefetch`. But users don't need to send the `--lora-path` when you run the inference at the first time. + +## Generate LoRA tensors + +Now generate LoRA tensors that will be passed in with each request to triton. 
+
+```bash
+git-lfs clone https://huggingface.co/qychen/luotuo-lora-7b-0.1
+git-lfs clone https://huggingface.co/kunishou/Japanese-Alpaca-LoRA-7b-v0
+
+python3 ../hf_lora_convert.py -i luotuo-lora-7b-0.1 -o luotuo-lora-7b-0.1-weights --storage-type float16
+python3 ../hf_lora_convert.py -i Japanese-Alpaca-LoRA-7b-v0 -o Japanese-Alpaca-LoRA-7b-v0-weights --storage-type float16
+```
+
+## Create a Triton model repository and launch the Triton server
+
+Create a Triton model repository following the instructions
+[here](../README.md#prepare-the-model-repository), and modify the model
+configuration following the steps
+[here](../README.md#modify-the-model-configuration).
+
+## LoRA Cache
+
+As LoRA weights are passed to the backend, they are cached in a host cache.
+As requests are scheduled, those weights will be prefetched to a GPU cache.
+After a LoRA is loaded into the cache, only `lora_task_id` is needed for inference.
+
+### lora_cache_optimal_adapter_size
+
+Optimal adapter size used to size cache pages. Typically, optimally sized
+adapters will fit exactly into 1 cache page. (default: 8)
+
+```
+parameters: {
+  key: "lora_cache_optimal_adapter_size"
+  value: {
+    string_value: "${lora_cache_optimal_adapter_size}"
+  }
+}
+```
+
+### lora_cache_max_adapter_size
+
+Used to set the minimum size of a cache page. Pages must be at least large enough to fit a single module, single layer, adapter_size `maxAdapterSize` row of weights. (default: 64)
+
+```
+parameters: {
+  key: "lora_cache_max_adapter_size"
+  value: {
+    string_value: "${lora_cache_max_adapter_size}"
+  }
+}
+```
+
+### lora_cache_gpu_memory_fraction
+
+Fraction of GPU memory used for the LoRA cache, computed as a fraction of the memory left over after the engine and the KV cache are loaded. (default: 0.05)
+
+```
+parameters: {
+  key: "lora_cache_gpu_memory_fraction"
+  value: {
+    string_value: "${lora_cache_gpu_memory_fraction}"
+  }
+}
+```
+
+### lora_cache_host_memory_bytes
+
+Size of the host LoRA cache in bytes. (default: 1G)
+
+```
+parameters: {
+  key: "lora_cache_host_memory_bytes"
+  value: {
+    string_value: "${lora_cache_host_memory_bytes}"
+  }
+}
+```
+
+### Prefetch the LoRA cache while initializing the model instance
+
+If you want to load LoRA models while the model instance is being initialized,
+instead of passing the LoRA weights as request inputs, you can store the weights in a folder
+and pass its path to the model instance through the `lora_prefetch_dir` parameter shown below.
+The model instance will then try to load the LoRA weights from that folder,
+which should contain one subfolder per LoRA task.
+For example, assume we want to store LoRA weights in `/tmp/lora_prefetch` and
+there are three LoRA tasks `0`, `1` and `3`; the folder layout would look like
+
+```bash
+/tmp/lora_prefetch
+├── 0
+│   ├── model.lora_config.npy
+│   └── model.lora_weights.npy
+├── 1
+│   ├── model.lora_config.npy
+│   └── model.lora_weights.npy
+└── 3
+    ├── model.lora_config.npy
+    └── model.lora_weights.npy
+```
+
+Note that you must name the subfolders with digits because the LoRA cache manager treats these names as LoRA task IDs.
+
+```pbtxt
+parameters: {
+  key: "lora_prefetch_dir"
+  value: {
+    string_value: "${lora_prefetch_dir}"
+  }
+}
+```
+
+## Launch tritonserver
+
+```bash
+MODEL_FOLDER=/path/to/triton_model_repo
+# 'world_size' is the number of GPUs you want to use for serving. This should
+# be aligned with the number of GPUs used to build the TensorRT-LLM engine.
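+# The example engine above was built for a single GPU (a 1-gpu output dir with no
+# tensor parallelism), which is why --world_size=1 is used below.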
+python3 /tensorrtllm_backend/tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size=1 --model_repo=${MODEL_FOLDER} +``` + +Run Multi-LoRA example by issuing multiple concurrent requests. +The inflight batcher will execute mixed batches with multiple LoRAs in the same batch. + +First we cache the LoRAs by sending dummy requests for each adapter. The TASK_IDS are uniq to the adapter + +```bash +pip3 install tritonclient[all] + +TASK_IDS=("1" "2") +LORA_PATHS=("luotuo-lora-7b-0.1-weights" "Japanese-Alpaca-LoRA-7b-v0-weights") +INFLIGHT_BATCHER_LLM_CLIENT=/tensorrtllm_backend/tensorrt_llm/triton_backend/tools/inflight_batcher_llm/inflight_batcher_llm_client.py + +for index in ${!TASK_IDS[@]}; do + text="dummy" + lora_path=${LORA_PATHS[$index]} + task_id=${TASK_IDS[$index]} + lora_arg="--lora-path ${lora_path} --lora-task-id ${task_id}" + + python3 ${INFLIGHT_BATCHER_LLM_CLIENT} \ + --top-k 0 \ + --top-p 0.5 \ + --request-output-len 10 \ + --text "${text}" \ + --tokenizer-dir /path/to/llama-7b-hf \ + ${lora_arg} & +done +``` + +Now perform inference with just `--lora-task-id` + +```bash +INPUT_TEXT=("美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:") +TASK_IDS=("" "1" "2" "" "1" "2") + +for index in ${!INPUT_TEXT[@]}; do + text=${INPUT_TEXT[$index]} + task_id=${TASK_IDS[$index]} + lora_arg="" + if [ "${task_id}" != "" ]; then + lora_arg="--lora-task-id ${task_id}" + fi + + python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py \ + --top-k 0 \ + --top-p 0.5 \ + --request-output-len 10 \ + --text "${text}" \ + --tokenizer-dir /home/scratch.trt_llm_data/llm-models/llama-models/llama-7b-hf \ + ${lora_arg} & +done + +wait +``` + +Example Output: + +``` +Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] +Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] +Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] +Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] +Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] +Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] +Got completed request +Input: アメリカ合衆国の首都はどこですか? \n答え: +Output beam 0: ワシントン D.C. +Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 29871, 31028, 30373, 30203, 30279, 30203, 360, 29889, 29907, 29889] +Got completed request +Input: 美国的首都在哪里? \n答案: +Output beam 0: Washington, D.C. +What is the +Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 7660, 29892, 360, 29889, 29907, 29889, 13, 5618, 338, 278] +Got completed request +Input: 美国的首都在哪里? \n答案: +Output beam 0: Washington D.C. 
+Washington D. +Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 7660, 360, 29889, 29907, 29889, 13, 29956, 7321, 360, 29889] +Got completed request +Input: アメリカ合衆国の首都はどこですか? \n答え: +Output beam 0: Washington, D.C. +Which of +Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 7660, 29892, 360, 29889, 29907, 29889, 13, 8809, 436, 310] +Got completed request +Input: アメリカ合衆国の首都はどこですか? \n答え: +Output beam 0: Washington D.C. +1. ア +Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 7660, 360, 29889, 29907, 29889, 13, 29896, 29889, 29871, 30310] +Got completed request +Input: 美国的首都在哪里? \n答案: +Output beam 0: 华盛顿 +W +Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 29871, 31266, 234, 158, 158, 236, 164, 194, 13, 29956] +``` diff --git a/docs/model_config.md b/docs/model_config.md new file mode 100644 index 00000000..b5e05d0c --- /dev/null +++ b/docs/model_config.md @@ -0,0 +1,376 @@ +# Model Configuration + +## Model Parameters + +The following tables show the parameters in the `config.pbtxt` of the models in +[all_models/inflight_batcher_llm](../tensorrt_llm/triton_backend/all_models/inflight_batcher_llm). +that can be modified before deployment. For optimal performance or custom +parameters, please refer to +[perf_best_practices](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-best-practices.md). + +The names of the parameters listed below are the values in the `config.pbtxt` +that can be modified using the +[`fill_template.py`](../tensorrt_llm/triton_backend/tools/fill_template.py) script. + +**NOTE** For fields that have comma as the value (e.g. `gpu_device_ids`, +`participant_ids`), you need to escape the comma with +a backslash. For example, if you want to set `gpu_device_ids` to `0,1` you need +to run `python3 fill_template.py -i config.pbtxt "gpu_device_ids:0\,1".` + +The mandatory parameters must be set for the model to run. The optional +parameters are not required but can be set to customize the model. + +### ensemble model + +See +[here](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#ensemble-models) +to learn more about ensemble models. + +*Mandatory parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `triton_max_batch_size` | The maximum batch size that the Triton model instance will run with. Note that for the `tensorrt_llm` model, the actual runtime batch size can be larger than `triton_max_batch_size`. The runtime batch size will be determined by the TRT-LLM scheduler based on a number of parameters such as number of available requests in the queue, and the engine build `trtllm-build` parameters (such `max_num_tokens` and `max_batch_size`). | +| `logits_datatype` | The data type for context and generation logits. | + +### preprocessing model + +*Mandatory parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `triton_max_batch_size` | The maximum batch size that Triton should use with the model. 
| +| `tokenizer_dir` | The path to the tokenizer for the model. | +| `preprocessing_instance_count` | The number of instances of the model to run. | +| `max_queue_delay_microseconds` | The maximum queue delay in microseconds. Setting this parameter to a value greater than 0 can improve the chances that two requests arriving within `max_queue_delay_microseconds` will be scheduled in the same TRT-LLM iteration. | +| `max_queue_size` | The maximum number of requests allowed in the TRT-LLM queue before rejecting new requests. | + +*Optional parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `add_special_tokens` | The `add_special_tokens` flag used by [HF tokenizers](https://huggingface.co/transformers/v2.11.0/main_classes/tokenizer.html#transformers.PreTrainedTokenizer.add_special_tokens). | +| `multimodal_model_path` | The vision engine path used in multimodal workflow. | +| `engine_dir` | The path to the engine for the model. This parameter is only needed for *multimodal processing* to extract the `vocab_size` from the engine_dir's config.json for `fake_prompt_id` mappings. | + + +### multimodal_encoders model + +*Mandatory parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `triton_max_batch_size` | The maximum batch size that Triton should use with the model. | +| `max_queue_delay_microseconds` | The maximum queue delay in microseconds. Setting this parameter to a value greater than 0 can improve the chances that two requests arriving within `max_queue_delay_microseconds` will be scheduled in the same TRT-LLM iteration. | +| `max_queue_size` | The maximum number of requests allowed in the TRT-LLM queue before rejecting new requests. | +| `multimodal_model_path` | The vision engine path used in multimodal workflow. | +| `hf_model_path` | The Huggingface model path used for `llava_onevision` and `mllama` models. | + + +### postprocessing model + +*Mandatory parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `triton_max_batch_size` | The maximum batch size that Triton should use with the model. | +| `tokenizer_dir` | The path to the tokenizer for the model. | +| `postprocessing_instance_count` | The number of instances of the model to run. | + +*Optional parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `skip_special_tokens` | The `skip_special_tokens` flag used by [HF detokenizers](https://huggingface.co/transformers/v2.11.0/main_classes/tokenizer.html#transformers.PreTrainedTokenizer.decode). | + +### tensorrt_llm model + +The majority of the `tensorrt_llm` model parameters and input/output tensors +can be mapped to parameters in the TRT-LLM C++ runtime API defined in +[`executor.h`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/include/tensorrt_llm/executor/executor.h). +Please refer to the Doxygen comments in `executor.h` for a more detailed +description of the parameters below. + +*Mandatory parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `triton_backend` | The backend to use for the model. Set to `tensorrtllm` to utilize the C++ TRT-LLM backend implementation. Set to `python` to utlize the TRT-LLM Python runtime. | +| `triton_max_batch_size` | The maximum batch size that the Triton model instance will run with. 
Note that for the `tensorrt_llm` model, the actual runtime batch size can be larger than `triton_max_batch_size`. The runtime batch size will be determined by the TRT-LLM scheduler based on a number of parameters such as number of available requests in the queue, and the engine build `trtllm-build` parameters (such `max_num_tokens` and `max_batch_size`). | +| `decoupled_mode` | Whether to use decoupled mode. Must be set to `true` for requests setting the `stream` tensor to `true`. | +| `max_queue_delay_microseconds` | The maximum queue delay in microseconds. Setting this parameter to a value greater than 0 can improve the chances that two requests arriving within `max_queue_delay_microseconds` will be scheduled in the same TRT-LLM iteration. | +| `max_queue_size` | The maximum number of requests allowed in the TRT-LLM queue before rejecting new requests. | +| `engine_dir` | The path to the engine for the model. | +| `batching_strategy` | The batching strategy to use. Set to `inflight_fused_batching` when enabling in-flight batching support. To disable in-flight batching, set to `V1` | +| `encoder_input_features_data_type` | The dtype for the input tensor `encoder_input_features`. For the mllama model, this must be `TYPE_BF16`. For other models like whisper, this is `TYPE_FP16`. | +| `logits_datatype` | The data type for context and generation logits. | + +*Optional parameters* + +- General + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `encoder_engine_dir` | When running encoder-decoder models, this is the path to the folder that contains the model configuration and engine for the encoder model. | +| `max_attention_window_size` | When using techniques like sliding window attention, the maximum number of tokens that are attended to generate one token. Defaults attends to all tokens in sequence. (default=max_sequence_length) | +| `sink_token_length` | Number of sink tokens to always keep in attention window. | +| `exclude_input_in_output` | Set to `true` to only return completion tokens in a response. Set to `false` to return the prompt tokens concatenated with the generated tokens. (default=`false`) | +| `cancellation_check_period_ms` | The time for cancellation check thread to sleep before doing the next check. It checks if any of the current active requests are cancelled through triton and prevent further execution of them. (default=100) | +| `stats_check_period_ms` | The time for the statistics reporting thread to sleep before doing the next check. (default=100) | +| `recv_poll_period_ms` | The time for the receiving thread in orchestrator mode to sleep before doing the next check. (default=0) | +| `iter_stats_max_iterations` | The maximum number of iterations for which to keep statistics. (default=ExecutorConfig::kDefaultIterStatsMaxIterations) | +| `request_stats_max_iterations` | The maximum number of iterations for which to keep per-request statistics. (default=executor::kDefaultRequestStatsMaxIterations) | +| `normalize_log_probs` | Controls if log probabilities should be normalized or not. Set to `false` to skip normalization of `output_log_probs`. (default=`true`) | +| `gpu_device_ids` | Comma-separated list of GPU IDs to use for this model. Use semicolons to separate multiple instances of the model. If not provided, the model will use all visible GPUs. (default=unspecified) | +| `participant_ids` | Comma-separated list of MPI ranks to use for this model. 
Mandatory when using orchestrator mode with -disable-spawn-process (default=unspecified) | +| `num_nodes` | Number of MPI nodes to use for this model. (default=1) | +| `gpu_weights_percent` | Set to a number between 0.0 and 1.0 to specify the percentage of weights that reside on GPU instead of CPU and streaming load during runtime. Values less than 1.0 are only supported for an engine built with `weight_streaming` on. (default=1.0) | + +- KV cache + +Note that the parameter `enable_trt_overlap` has been removed from the +config.pbtxt. This option allowed to overlap execution of two micro-batches to +hide CPU overhead. Optimization work has been done to reduce the CPU overhead +and it was found that the overlapping of micro-batches did not provide +additional benefits. + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `max_tokens_in_paged_kv_cache` | The maximum size of the KV cache in number of tokens. If unspecified, value is interpreted as 'infinite'. KV cache allocation is the min of max_tokens_in_paged_kv_cache and value derived from kv_cache_free_gpu_mem_fraction below. (default=unspecified) | +| `kv_cache_free_gpu_mem_fraction` | Set to a number between 0 and 1 to indicate the maximum fraction of GPU memory (after loading the model) that may be used for KV cache. (default=0.9) | +| `cross_kv_cache_fraction` | Set to a number between 0 and 1 to indicate the maximum fraction of KV cache that may be used for cross attention, and the rest will be used for self attention. Optional param and should be set for encoder-decoder models ONLY. (default=0.5) | +| `kv_cache_host_memory_bytes` | Enable offloading to host memory for the given byte size. | +| `enable_kv_cache_reuse` | Set to `true` to reuse previously computed KV cache values (e.g. for system prompt) | + +- LoRA cache + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `lora_cache_optimal_adapter_size` | Optimal adapter size used to size cache pages. Typically optimally sized adapters will fix exactly into 1 cache page. (default=8) | +| `lora_cache_max_adapter_size` | Used to set the minimum size of a cache page. Pages must be at least large enough to fit a single module, single later adapter_size `maxAdapterSize` row of weights. (default=64) | +| `lora_cache_gpu_memory_fraction` | Fraction of GPU memory used for LoRA cache. Computed as a fraction of left over memory after engine load, and after KV cache is loaded. (default=0.05) | +| `lora_cache_host_memory_bytes` | Size of host LoRA cache in bytes. (default=1G) | +| `lora_prefetch_dir` | Folder to store the LoRA weights we hope to load during engine initialization. | + +- Decoding mode + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `max_beam_width` | The beam width value of requests that will be sent to the executor. (default=1) | +| `decoding_mode` | Set to one of the following: `{top_k, top_p, top_k_top_p, beam_search, medusa, redrafter, lookahead, eagle}` to select the decoding mode. The `top_k` mode exclusively uses Top-K algorithm for sampling, The `top_p` mode uses exclusively Top-P algorithm for sampling. The top_k_top_p mode employs both Top-K and Top-P algorithms, depending on the runtime sampling params of the request. Note that the `top_k_top_p option` requires more memory and has a longer runtime than using `top_k` or `top_p` individually; therefore, it should be used only when necessary. `beam_search` uses beam search algorithm. 
If not specified, the default is to use `top_k_top_p` if `max_beam_width == 1`; otherwise, `beam_search` is used. When Medusa model is used, `medusa` decoding mode should be set. However, TensorRT-LLM detects loaded Medusa model and overwrites decoding mode to `medusa` with warning. Same applies to the ReDrafter, Lookahead and Eagle. | + +- Optimization + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `enable_chunked_context` | Set to `true` to enable context chunking. (default=`false`) | +| `multi_block_mode` | Set to `false` to disable multi block mode. (default=`true`) | +| `enable_context_fmha_fp32_acc` | Set to `true` to enable FMHA runner FP32 accumulation. (default=`false`) | +| `cuda_graph_mode` | Set to `true` to enable cuda graph. (default=`false`) | +| `cuda_graph_cache_size` | Sets the size of the CUDA graph cache, in numbers of CUDA graphs. (default=0) | + +- Scheduling + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `batch_scheduler_policy` | Set to `max_utilization` to greedily pack as many requests as possible in each current in-flight batching iteration. This maximizes the throughput but may result in overheads due to request pause/resume if KV cache limits are reached during execution. Set to `guaranteed_no_evict` to guarantee that a started request is never paused. (default=`guaranteed_no_evict`) | + +- Medusa + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `medusa_choices` | To specify Medusa choices tree in the format of e.g. "{0, 0, 0}, {0, 1}". By default, `mc_sim_7b_63` choices are used. | + +- Eagle + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `eagle_choices` | To specify default per-server Eagle choices tree in the format of e.g. "{0, 0, 0}, {0, 1}". By default, `mc_sim_7b_63` choices are used. | + +- Guided decoding + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `guided_decoding_backend` | Set to `xgrammar` to activate guided decoder. | +| `tokenizer_dir` | The guided decoding of tensorrt_llm python backend requires tokenizer's information. | +| `xgrammar_tokenizer_info_path` | The guided decoding of tensorrt_llm C++ backend requires xgrammar's tokenizer's info in 'json' format. | + +### tensorrt_llm_bls model + +See +[here](https://github.com/triton-inference-server/python_backend#business-logic-scripting) +to learn more about BLS models. + +*Mandatory parameters* + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `triton_max_batch_size` | The maximum batch size that the model can handle. | +| `decoupled_mode` | Whether to use decoupled mode. | +| `bls_instance_count` | The number of instances of the model to run. When using the BLS model instead of the ensemble, you should set the number of model instances to the maximum batch size supported by the TRT engine to allow concurrent request execution. | +| `logits_datatype` | The data type for context and generation logits. | + +*Optional parameters* + +- General + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `accumulate_tokens` | Used in the streaming mode to call the postprocessing model with all accumulated tokens, instead of only one token. This might be necessary for certain tokenizers. | + +- Speculative decoding + +The BLS model supports speculative decoding. 
Target and draft triton models are set with the parameters `tensorrt_llm_model_name` `tensorrt_llm_draft_model_name`. Speculative decodingis performed by setting `num_draft_tokens` in the request. `use_draft_logits` may be set to use logits comparison speculative decoding. Note that `return_generation_logits` and `return_context_logits` are not supported when using speculative decoding. Also note that requests with batch size greater than 1 is not supported with speculative decoding right now. + +| Name | Description | +| :----------------------: | :-----------------------------: | +| `tensorrt_llm_model_name` | The name of the TensorRT-LLM model to use. | +| `tensorrt_llm_draft_model_name` | The name of the TensorRT-LLM draft model to use. | + +### Model Input and Output + +Below is the lists of input and output tensors for the `tensorrt_llm` and +`tensorrt_llm_bls` models. + +#### Common Inputs + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `end_id` | [1] | `int32` | End token ID. If not specified, defaults to -1 | +| `pad_id` | [1] | `int32` | Padding token ID | +| `temperature` | [1] | `float32` | Sampling Config param: `temperature` | +| `repetition_penalty` | [1] | `float` | Sampling Config param: `repetitionPenalty` | +| `min_tokens` | [1] | `int32_t` | Sampling Config param: `minTokens` | +| `presence_penalty` | [1] | `float` | Sampling Config param: `presencePenalty` | +| `frequency_penalty` | [1] | `float` | Sampling Config param: `frequencyPenalty` | +| `seed` | [1] | `uint64_t` | Sampling Config param: `seed` | +| `return_log_probs` | [1] | `bool` | When `true`, include log probs in the output. Note: This requires at least one sampling parameter to be set (e.g., `runtime_top_k`, `runtime_top_p` for `tensorrt_llm` model, or `top_k`, `top_p` for `tensorrt_llm_bls` model). | +| `return_context_logits` | [1] | `bool` | When `true`, include context logits in the output | +| `return_generation_logits` | [1] | `bool` | When `true`, include generation logits in the output | +| `num_return_sequences` | [1] | `int32_t` | Number of generated sequences per request. (Default=1) | +| `beam_width` | [1] | `int32_t` | Beam width for this request; set to 1 for greedy sampling (Default=1) | +| `prompt_embedding_table` | [1] | `float16` (model data type) | P-tuning prompt embedding table | +| `prompt_vocab_size` | [1] | `int32` | P-tuning prompt vocab size | +| `return_perf_metrics` | [1] | `bool` | When `true`, include perf metrics in the output, such as kv cache reuse stats | +| `guided_decoding_guide_type` | [1] | `string` | Guided decoding param: `guide_type` | +| `guided_decoding_guide` | [1] | `string` | Guided decoding param: `guide` | + +The following inputs for lora are for both `tensorrt_llm` and `tensorrt_llm_bls` +models. The inputs are passed through the `tensorrt_llm` model and the +`tensorrt_llm_bls` model will refer to the inputs from the `tensorrt_llm` model. + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `lora_task_id` | [1] | `uint64` | The unique task ID for the given LoRA. To perform inference with a specific LoRA for the first time, `lora_task_id`, `lora_weights`, and `lora_config` must all be given. The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`. If the cache is full, the oldest LoRA will be evicted to make space for new ones. 
An error is returned if `lora_task_id` is not cached | +| `lora_weights` | [ num_lora_modules_layers, D x Hi + Ho x D ] | `float` (model data type) | Weights for a LoRA adapter. See the config file for more details. | +| `lora_config` | [ num_lora_modules_layers, 3] | `int32t` | Module identifier. See the config file for more details. | + +#### Common Outputs + +Note: the timing metrics oputputs are represented as the number of nanoseconds since epoch. + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `cum_log_probs` | [-1] | `float` | Cumulative probabilities for each output | +| `output_log_probs` | [beam_width, -1] | `float` | Per-token log probabilities for each output. Only returned when `return_log_probs` is `true` and sampling parameters are set. | +| `context_logits` | [-1, vocab_size] | `float` | Context logits for input | +| `generation_logits` | [beam_width, seq_len, vocab_size] | `float` | Generation logits for each output | +| `batch_index` | [1] | `int32` | Batch index | +| `kv_cache_alloc_new_blocks` | [1] | `int32` | KV cache reuse metrics. Number of newly allocated blocks per request. Set the optional input `return_perf_metrics` to `true` to include `kv_cache_alloc_new_blocks` in the outputs. | +| `kv_cache_reused_blocks` | [1] | `int32` | KV cache reuse metrics. Number of reused blocks per request. Set the optional input `return_perf_metrics` to `true` to include `kv_cache_reused_blocks` in the outputs. | +| `kv_cache_alloc_total_blocks` | [1] | `int32` | KV cache reuse metrics. Number of total allocated blocks per request. Set the optional input `return_perf_metrics` to `true` to include `kv_cache_alloc_total_blocks` in the outputs. | +| `arrival_time_ns` | [1] | `float` | Time when the request was received by TRT-LLM. Set the optional input `return_perf_metrics` to `true` to include `arrival_time_ns` in the outputs. | +| `first_scheduled_time_ns` | [1] | `float` | Time when the request was first scheduled. Set the optional input `return_perf_metrics` to `true` to include `first_scheduled_time_ns` in the outputs. | +| `first_token_time_ns` | [1] | `float` | Time when the first token was generated. Set the optional input `return_perf_metrics` to `true` to include `first_token_time_ns` in the outputs. | +| `last_token_time_ns` | [1] | `float` | Time when the last token was generated. Set the optional input `return_perf_metrics` to `true` to include `last_token_time_ns` in the outputs. | +| `acceptance_rate` | [1] | `float` | Acceptance rate of the speculative decoding model. Set the optional input `return_perf_metrics` to `true` to include `acceptance_rate` in the outputs. | +| `total_accepted_draft_tokens` | [1] | `int32` | Number of tokens accepted by the target model in speculative decoding. Set the optional input `return_perf_metrics` to `true` to include `total_accepted_draft_tokens` in the outputs. | +| `total_draft_tokens` | [1] | `int32` | Maximum number of draft tokens acceptable by the target model in speculative decoding. Set the optional input `return_perf_metrics` to `true` to include `total_draft_tokens` in the outputs. 
| + +#### Unique Inputs for tensorrt_llm model + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `input_ids` | [-1] | `int32` | Input token IDs | +| `input_lengths` | [1] | `int32` | Input lengths | +| `request_output_len` | [1] | `int32` | Requested output length | +| `draft_input_ids` | [-1] | `int32` | Draft input IDs | +| `decoder_input_ids` | [-1] | `int32` | Decoder input IDs | +| `decoder_input_lengths` | [1] | `int32` | Decoder input lengths | +| `draft_logits` | [-1, -1] | `float32` | Draft logits | +| `draft_acceptance_threshold` | [1] | `float32` | Draft acceptance threshold | +| `stop_words_list` | [2, -1] | `int32` | List of stop words | +| `bad_words_list` | [2, -1] | `int32` | List of bad words | +| `embedding_bias` | [-1] | `string` | Embedding bias words | +| `runtime_top_k` | [1] | `int32` | Top-k value for runtime top-k sampling | +| `runtime_top_p` | [1] | `float32` | Top-p value for runtime top-p sampling | +| `runtime_top_p_min` | [1] | `float32` | Minimum value for runtime top-p sampling | +| `runtime_top_p_decay` | [1] | `float32` | Decay value for runtime top-p sampling | +| `runtime_top_p_reset_ids` | [1] | `int32` | Reset IDs for runtime top-p sampling | +| `len_penalty` | [1] | `float32` | Controls how to penalize longer sequences in beam search (Default=0.f) | +| `early_stopping` | [1] | `bool` | Enable early stopping | +| `beam_search_diversity_rate` | [1] | `float32` | Beam search diversity rate | +| `stop` | [1] | `bool` | Stop flag | +| `streaming` | [1] | `bool` | Enable streaming | + +#### Unique Outputs for tensorrt_llm model + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `output_ids` | [-1, -1] | `int32` | Output token IDs | +| `sequence_length` | [-1] | `int32` | Sequence length | + +#### Unique Inputs for tensorrt_llm_bls model + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `text_input` | [-1] | `string` | Prompt text | +| `decoder_text_input` | [1] | `string` | Decoder input text | +| `image_input` | [3, 224, 224] | `float16` | Input image | +| `max_tokens` | [-1] | `int32` | Number of tokens to generate | +| `bad_words` | [2, num_bad_words] | `int32` | Bad words list | +| `stop_words` | [2, num_stop_words] | `int32` | Stop words list | +| `top_k` | [1] | `int32` | Sampling Config param: `topK` | +| `top_p` | [1] | `float32` | Sampling Config param: `topP` | +| `length_penalty` | [1] | `float32` | Sampling Config param: `lengthPenalty` | +| `stream` | [1] | `bool` | When `true`, stream out tokens as they are generated. When `false` return only when the full generation has completed (Default=`false`) | +|`embedding_bias_words` | [-1] | `string` | Embedding bias words | +| `embedding_bias_weights` | [-1] | `float32` | Embedding bias weights | +| `num_draft_tokens` | [1] | `int32` | Number of tokens to get from draft model during speculative decoding | +| `use_draft_logits` | [1] | `bool` | Use logit comparison during speculative decoding | + +#### Unique Outputs for tensorrt_llm_bls model + +| Name | Shape | Type | Description | +| :------------: | :---------------: | :-----------: | :--------: | +| `text_output` | [-1] | `string` | Text output | + +## Some tips for model configuration + +Below are some tips for configuring models for optimal performance. These +recommendations are based on our experiments and may not apply to all use cases. 
+For guidance on other parameters, please refer to the +[perf_best_practices](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-best-practices.md). + +- **Setting the `instance_count` for models to better utilize inflight batching** + + The `instance_count` parameter in the config.pbtxt file specifies the number + of instances of the model to run. Ideally, this should be set to match the + maximum batch size supported by the TRT engine, as this allows for concurrent + request execution and reduces performance bottlenecks. However, it will also + consume more CPU memory resources. While the optimal value isn't something we + can determine in advance, it generally shouldn't be set to a very small + value, such as 1. + For most use cases, we have found that setting `instance_count` to 5 works + well across a variety of workloads in our experiments. + +- **Adjusting `max_batch_size` and `max_num_tokens` to optimize inflight batching** + + `max_batch_size` and `max_num_tokens` are important parameters for optimizing + inflight batching. You can modify `max_batch_size` in the model configuration + file, while `max_num_tokens` is set during the conversion to a TRT-LLM engine + using the `trtllm-build` command. Tuning these parameters is necessary for + different scenarios, and experimentation is currently the best approach to + finding optimal values. Generally, the total number of requests should be + lower than `max_batch_size`, and the total tokens should be less than + `max_num_tokens`. diff --git a/docs/multimodal.md b/docs/multimodal.md new file mode 100755 index 00000000..a088ecf0 --- /dev/null +++ b/docs/multimodal.md @@ -0,0 +1,422 @@ +# End to end workflow to run a Multimodal model + +### Support Matrix +The following multimodal model is supported in tensorrtllm_backend: +* BLIP2-OPT +* LLAVA +* VILA +* LLaVA OneVision +* MLLAMA +* Qwen2-VL + +For more multimodal models supported in TensorRT-LLM, please visit [TensorRT-LLM multimodal examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal). + +## Run Multimodal with single-GPU Tritonserver +### Tritonserver setup steps +0. Make sure that you have initialized the TRT-LLM submodule: + + ```bash + git clone https://github.com/triton-inference-server/tensorrtllm_backend.git && cd tensorrtllm_backend + git lfs install + git submodule update --init --recursive + ``` + +1. Start the Triton Server Docker container: + + 1-1. If you're using Tritonserver from nvcr.io + ```bash + # Replace with the version of Triton you want to use. + # The command below assumes the the current directory is the + # TRT-LLM backend root git repository. + + docker run --rm -ti --net=host -v `pwd`:/mnt -w /mnt --gpus all nvcr.io/nvidia/tritonserver:\-trtllm-python-py3 bash + ``` + 1-2. If you are using `tensorrtllm_backend` container: + ```bash + docker run --rm -ti --net=host -v `pwd`:/mnt -w /mnt --gpus all triton_trt_llm + ``` + +2. Build the engine: + + 2-1. 
Clone the target model repository + ```bash + # For BLIP-OPT2 + export MODEL_NAME="blip2-opt-2.7b" + git clone https://huggingface.co/Salesforce/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + # For LLAVA + export MODEL_NAME="llava-1.5-7b-hf" + git clone https://huggingface.co/llava-hf/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + # For VILA + pip install -r all_models/multimodal/requirements-vila.txt + + export MODEL_NAME="vila1.5-3b" + git clone https://huggingface.co/Efficient-Large-Model/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + export VILA_PATH="tmp/hf_models/VILA" + git clone https://github.com/Efficient-Large-Model/VILA.git ${VILA_PATH} + + # For LLaVA OneVision + pip install -r all_models/multimodal/requirements-llava-onevision.txt + + export MODEL_NAME="llava-onevision-qwen2-7b-ov-hf" + git clone https://huggingface.co/llava-hf/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + # For MLLAMA + pip install -r all_models/multimodal/requirements-mllama.txt + + export MODEL_NAME="Llama-3.2-11B-Vision" + git clone https://huggingface.co/meta-llama/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + # For Qwen2-VL + pip install -r all_models/multimodal/requirements-qwen2vl.txt + + export MODEL_NAME="Qwen2-VL-7B-Instruct" + git clone https://huggingface.co/Qwen/${MODEL_NAME} tmp/hf_models/${MODEL_NAME} + + export + ``` + 2-2. Build TensorRT-LLM engines + ```bash + export HF_MODEL_PATH=tmp/hf_models/${MODEL_NAME} + export UNIFIED_CKPT_PATH=tmp/trt_models/${MODEL_NAME}/fp16/1-gpu + export ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu + export MULTIMODAL_ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/multimodal_encoder + + # For BLIP-OPT2 + python tensorrt_llm/examples/models/contrib/opt/convert_checkpoint.py --model_type blip2 \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + + trtllm-build \ + --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin float16 \ + --max_beam_width 1 \ + --max_batch_size 8 \ + --max_seq_len 1024 \ + --max_input_len 924 \ + --max_multimodal_len 256 # 8 (max_batch_size) * 32 (num_multimodal_features) for BLIP2 + + python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_type blip2 --model_path ${HF_MODEL_PATH} --max_batch_size 8 + + # For LLAVA + python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + + trtllm-build \ + --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin float16 \ + --max_batch_size 8 \ + --max_input_len 2048 \ + --max_seq_len 2560 \ + --max_multimodal_len 4608 # 8 (max_batch_size) * 576 (num_multimodal_features) for LLaVA + + python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type llava --max_batch_size 8 + + # For VILA + python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + + trtllm-build \ + --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin float16 \ + --max_batch_size 8 \ + --max_input_len 2048 \ + --max_seq_len 2560 \ + --max_multimodal_len 6272 # 8 (max_batch_size) * 196 (num_multimodal_features) * 4 (max_num_images_per_request) + + python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type vila --vila_path ${VILA_PATH} 
--max_batch_size 32 #max_batch_size * max_num_images_per_request since vila support multiple images inference + + # For LLaVA OneVision + python tensorrt_llm/examples/models/contrib/qwen/convert_checkpoint.py \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + + trtllm-build \ + --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin float16 \ + --max_batch_size 1 \ + --max_input_len 7500 \ + --max_seq_len 7600 \ + --max_multimodal_len 7300 # max_batch_size * num_multimodal_features(depends on the image size or the specified video num frame) + + python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type llava_onevision --max_batch_size 16 # max_batch_size * patch for image or frame for video + + # For MLLAMA + python tensorrt_llm/examples/models/core/mllama/convert_checkpoint.py \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype bfloat16 + + trtllm-build \ + --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin auto \ + --max_batch_size 8 \ + --max_seq_len 2048 \ + --max_num_tokens 4096 \ + --max_encoder_input_len 6404 + + python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type mllama --output_dir ${MULTIMODAL_ENGINE_PATH} --max_batch_size 8 #max_batch_size * max_num_images_per_request + + # For Qwen2-VL + python3 ../qwen/convert_checkpoint.py \ + --model_dir ${HF_MODEL_PATH} \ + --output_dir ${UNIFIED_CKPT_PATH} \ + --dtype float16 + + trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \ + --output_dir ${ENGINE_PATH} \ + --gemm_plugin=float16 \ + --gpt_attention_plugin=float16 \ + --max_batch_size 4 \ + --max_input_len 2048 \ + --max_seq_len 3072 \ + --max_multimodal_len 1296 #(max_batch_size) * 324 (num_multimodal_features), this's for image_shape=[504,504] + + python build_multimodal_engine.py --model_type qwen2_vl --model_path tmp/hf_models/${MODEL_NAME} --output_dir ${MULTIMODAL_ENGINE_PATH} + ``` + + > **NOTE**: + > + > `max_multimodal_len = max_batch_size * num_multimodal_features`, so if you change `max_batch_size`, `max_multimodal_len` **MUST** be changed accordingly. + > For multi-image inference, where a single request could contain multiple images, `max_multimodal_len = max_batch_size * num_multimodal_features * max_num_images_per_request` + > + > The built visual engines are located in `tmp/trt_engines/${MODEL_NAME}/multimodal_encoder`. + +3. 
Prepare Tritonserver configs + + ```bash + cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ multimodal_ifb -r + # Override the ensemble and creates new multimodal_encoders directories for multimodal + cp tensorrt_llm/triton_backend/all_models/multimodal/ensemble multimodal_ifb -r + cp tensorrt_llm/triton_backend/all_models/multimodal/multimodal_encoders multimodal_ifb -r + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:8,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,enable_chunked_context:False,encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},logits_datatype:TYPE_FP32,cross_kv_cache_fraction:0.5 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:8,preprocessing_instance_count:1,multimodal_model_path:${MULTIMODAL_ENGINE_PATH},engine_dir:${ENGINE_PATH},max_num_images:1,max_queue_delay_microseconds:20000 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:8,postprocessing_instance_count:1 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/ensemble/config.pbtxt triton_max_batch_size:8,logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:8,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,tensorrt_llm_model_name:tensorrt_llm,multimodal_encoders_name:multimodal_encoders,logits_datatype:TYPE_FP32 + + # Newly added for multimodal + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/multimodal_encoders/config.pbtxt triton_max_batch_size:8,multimodal_model_path:${MULTIMODAL_ENGINE_PATH},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},hf_model_path:${HF_MODEL_PATH},max_queue_delay_microseconds:20000 + ``` + > **NOTE**: + > + > You can set the `decoupled_mode` option to True to use streaming mode. + > + > You can set the `accumulate_tokens` option to True in streaming mode to call the postprocessing model with all accumulated tokens. + > + > You can set the `enable_kv_cache_reuse` option to True to enable kv cache reuse. Requests with the same image/prompt table/input tokens will reuse the KV cache, which will help reduce latency. The specific performance improvement depends on the length of reuse. + > + > You can set the `max_num_images` to the max number of images per request. The value should be the same as the `max_num_images_per_request` value used at build the engine step above. + > + > Set `${ENCODER_INPUT_FEATURES_DTYPE}` to `TYPE_BF16` for mllama, and `TYPE_FP16` for other models. + > `cross_kv_cache_fraction` is used to determine the paged kv cache memory pool size of enc-dec models. For such case, we distinguish `free_fraction * (1 - cross_kv_cache_fraction)` to self attention kv caches, and `free_fraction * cross_kv_cache_fraction` to cross attention kv caches. + +4. 
Launch Tritonserver + + ```bash + python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=multimodal_ifb/ --tensorrt_llm_model_name tensorrt_llm,multimodal_encoders --multimodal_gpu0_cuda_mem_pool_bytes 300000000 + ``` + + > **NOTE**: + > If there is an error associated with 'MPI_Init_thread', please do `export PMIX_MCA_gds=hash`' + > + > When launching the server, since the prompt_embedding_table is in GPU memory, we need to set the CUDA pool memory for inter-step communication. For example, when we have a shape of (1, 576, 4096) promp_embedding table, we would need 300MB of CUDA pool memory, so we set 30MB to have some GPU buffers. (2(fp16=>2bytes) * 576 * 4096 * 8(max_batch_size) = 18,874,368) + > + > Also, the tensorrt_llm initialization assumes using another GPU, we need to initialize it but not use them. + +### Send requests +1. Send request with `decoupled_mode` set to False + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 + + [beam 0 ]: + Question: which city is this? Answer: singapore + [INFO] Latency: 41.942 ms + ``` +2. Send request with `decoupled_mode` set to True + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 --streaming + + [beam 0 ]: sing + [beam 0 ]: apore + [beam 0 ]: + [INFO] Latency: 43.441 ms + ``` +3. Send request to the `tensorrt_llm_bls` model + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 --use_bls + + [beam 0 ]: + Question: which city is this? Answer: singapore + [INFO] Latency: 44.152 ms + ``` + +4. Send request to the `tensorrt_llm_bls` model with `accumulate_tokens` set to True + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 --use_bls --streaming + + [beam 0 ]: sing + [beam 0 ]: singapore + [beam 0 ]: singapore + [INFO] Latency: 45.48 ms + ``` + +5. Send request with `enable_kv_cache_reuse` set to True + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 --prompt_table_extra_id ${id} + + [beam 0 ]: + Question: which city is this? Answer: singapore + [INFO] Latency: 42.514 ms + ``` +6. Send request with multiple images per request + ```bash + wget -O av.png https://raw.githubusercontent.com/Efficient-Large-Model/VILA/main/demo_images/av.png + + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text '\n\n Please elaborate what you see in the images?' 
--image av.png,'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 68 --model_type vila --hf_model_dir ${HF_MODEL_PATH} + + [beam 0 ]: + A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \n \n Please elaborate what you see in the images? ASSISTANT: The first image shows a busy street scene with a car driving through a crosswalk, surrounded by pedestrians and traffic lights. The second image captures a beautiful sunset with the iconic Merlion statue spouting water into the bay, with the Singapore Flyer and the city skyline in the background. + + [INFO] Latency: 403.879 ms + ``` + +7. Send request with curl + The triton server supports curl requests with an image url in the payload. For example here is a request sent to a Llama-3.2-11B-Vision (mLLama) model: + ``` bash + curl -X POST localhost:8000/v2/models/ensemble/generate_stream \ + -d '{"id": "42", "text_input": "<|image|>If I had to write a haiku for this one", "image_url_input": "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png", "parameters": {"max_tokens": 16, "beam_width": 1, "end_id": 128001, "pad_id": 128004, "top_k": 1, "top_p": 0, "stream": false, "temperature": 0}}' + + # response + data: {"batch_index":0,"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"id":"42","model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_index":0,"sequence_start":false,"text_output":"If I had to write a haiku for this one, it would be:.\\nMerlion spouts water.\\nMarina"} + ``` + You can also send requests with base64 encoded images. Just replace the url above with `data:image/jpeg;base64,`. + +8. Send request with video input + ```bash + python tensorrt_llm/triton_backend/tools/multimodal/client.py --text "Why is this video funny?" --video sample_demo_1.mp4 --video_num_frames 8 --request-output-len 30 --model_type llava_onevision --end-id 151645 + + [beam 0 ]: + user + Why is this video funny?assistant + The video is funny because the child's actions are playful and exaggerated, as if they are reading the book with great enthusiasm. + [INFO] Latency: 507.537 ms + ``` + +> **NOTE**: +> Please ignore any exception thrown with the output. It's a known issue to be fixed. +> +> When `enable_kv_cache_reuse` is set to true, the `prompt_table_extra_id` must be specified in the requests. The `prompt_table_extra_id` is a unique identifier representing the image (or prompt table), the same image uses the same id. The data type is `uint64`, and the minimum value is 1. + +### Kill the server +```bash +pkill tritonserver +``` + +### Supported image input types +When programmatically preparing your own request for the server, note that `ensemble`: +- `image_input`: a float16 5D tensor of shape `[batch_size, num_images, num_channels, height, width]` or `[batch_size, num_images, height, width, num_channels]` representing a batch of images already processed (via transformers AutoProcessor) for the vision encoder. +- `image_bytes_input`: a uint8 5D tensor of shape `[batch_size, num_images, num_channels, height, width]` or `[batch_size, num_images, height, width, num_channels]` representing a batch of raw images. +- `image_url_input`: a list of strings of shape `[batch_size, num_images]` representing a batch of image urls. 
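+
+For instance, a rough sketch of an HTTP request that passes the image to
+`image_url_input` as a base64 data URI, building on the curl example in step 7 above
+(the file name, prompt, and parameter values are illustrative, and `base64 -w0` is
+GNU coreutils syntax):
+
+```bash
+# Encode a local JPEG and embed it as a data URI in the request payload.
+IMAGE_B64=$(base64 -w0 image.jpg)
+curl -X POST localhost:8000/v2/models/ensemble/generate \
+    -d '{"text_input": "Question: which city is this? Answer:", "image_url_input": "data:image/jpeg;base64,'"${IMAGE_B64}"'", "parameters": {"max_tokens": 16, "beam_width": 1, "top_k": 1, "top_p": 0, "temperature": 0}}'
+```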
+
+### Long multimodal context, FP8 KV cache and tensor parallelism
+
+Follow these steps to enable chunked context inference (using LLaVA as an example) with FP8 KV cache and 2-way tensor parallelism. Ensure you convert the checkpoint using `--tp_size 2` and build the model with `--use_paged_context_fmha enable` and `--use_fp8_context_fmha enable`. Set `enable_chunked_context` to true in the Tritonserver configuration file. The chunk size is determined by the `max_num_tokens` flag when building the engine, which defaults to 8192. When launching the server, change `--world_size` to match your tensor parallelism size.
+1. Build the engine
+```bash
+    export MODEL_NAME="llava-1.5-7b-hf"
+    export HF_MODEL_PATH=tmp/hf_models/${MODEL_NAME}
+
+    # Convert checkpoint
+    # For fp16 KV cache
+    export UNIFIED_CKPT_PATH=tmp/trt_models/${MODEL_NAME}/fp16/2-gpu
+    export ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/fp16/2-gpu
+    export MULTIMODAL_ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/multimodal_encoder
+    python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py \
+        --model_dir ${HF_MODEL_PATH} \
+        --output_dir ${UNIFIED_CKPT_PATH} \
+        --dtype float16 \
+        --tp_size 2
+
+    # For fp8 KV cache
+    export UNIFIED_CKPT_PATH=tmp/trt_models/${MODEL_NAME}/fp8/2-gpu
+    export ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/fp8/2-gpu
+    export MULTIMODAL_ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/multimodal_encoder
+    python ./tensorrt_llm/examples/quantization/quantize.py \
+        --model_dir ${HF_MODEL_PATH} \
+        --dtype float16 \
+        --qformat fp8 \
+        --kv_cache_dtype fp8 \
+        --output_dir ${UNIFIED_CKPT_PATH} \
+        --calib_size 512 \
+        --tp_size 2
+
+    # Build the llm engine
+    # --use_paged_context_fmha and --use_fp8_context_fmha are enabled by default
+    # include --max_num_tokens to set the chunk size
+    trtllm-build \
+        --checkpoint_dir ${UNIFIED_CKPT_PATH} \
+        --output_dir ${ENGINE_PATH} \
+        --gemm_plugin auto \
+        --max_batch_size 8 \
+        --max_input_len 2048 \
+        --max_seq_len 2560 \
+        --max_multimodal_len 4608 # 8 (max_batch_size) * 576 (num_multimodal_features) for LLaVA
+
+    # Build the multimodal engine
+    python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type llava --max_batch_size 8 --output_dir ${MULTIMODAL_ENGINE_PATH}
+```
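+
+Before preparing the Triton configuration, it can be useful to confirm which chunk size (`max_num_tokens`) and context-FMHA options the engine was actually built with. The snippet below is only a sketch: it assumes the engine directory contains a `config.json` with a `build_config` section, and the exact key names may differ between TensorRT-LLM versions.
+
+```python
+# Sketch: report the chunk-size-related build options of a TensorRT-LLM engine.
+# The config.json layout is an assumption and may vary across versions.
+import json
+import os
+import sys
+
+engine_dir = sys.argv[1] if len(sys.argv) > 1 else os.environ.get("ENGINE_PATH", ".")
+
+with open(os.path.join(engine_dir, "config.json")) as f:
+    config = json.load(f)
+
+build_config = config.get("build_config", {})
+plugin_config = build_config.get("plugin_config", {})
+
+# max_num_tokens is the upper bound on tokens processed per context chunk
+print("max_num_tokens:", build_config.get("max_num_tokens"))
+print("use_paged_context_fmha:", plugin_config.get("use_paged_context_fmha"))
+```
+A larger `--max_num_tokens` at build time means fewer, larger chunks during the context prefill phase.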
+2. Prepare the Tritonserver config file
+Set `enable_chunked_context` to True in the config. To make better use of the free GPU memory, we can also set `kv_cache_free_gpu_mem_fraction` to 0.9.
+```bash
+cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ multimodal_ifb -r
+# Override the ensemble and create new multimodal_encoders directories for multimodal
+cp tensorrt_llm/triton_backend/all_models/multimodal/ensemble multimodal_ifb -r
+cp tensorrt_llm/triton_backend/all_models/multimodal/multimodal_encoders multimodal_ifb -r
+
+# Change enable_chunked_context to True and set kv_cache_free_gpu_mem_fraction to 0.9
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:8,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,enable_chunked_context:True,encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},logits_datatype:TYPE_FP32,kv_cache_free_gpu_mem_fraction:0.9
+
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:8,preprocessing_instance_count:1,multimodal_model_path:${MULTIMODAL_ENGINE_PATH},engine_dir:${ENGINE_PATH},max_num_images:1,max_queue_delay_microseconds:20000
+
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:8,postprocessing_instance_count:1
+
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/ensemble/config.pbtxt triton_max_batch_size:8,logits_datatype:TYPE_FP32
+
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:8,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,tensorrt_llm_model_name:tensorrt_llm,multimodal_encoders_name:multimodal_encoders,logits_datatype:TYPE_FP32
+
+# Newly added for multimodal
+python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/multimodal_encoders/config.pbtxt triton_max_batch_size:8,multimodal_model_path:${MULTIMODAL_ENGINE_PATH},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},hf_model_path:${HF_MODEL_PATH},max_queue_delay_microseconds:20000
+```
+3. Launch the server
+```bash
+# Change --world_size to match your tensor parallelism size
+python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=multimodal_ifb/ --tensorrt_llm_model_name tensorrt_llm,multimodal_encoders --multimodal_gpu0_cuda_mem_pool_bytes 300000000
+```
+
+When you launch the server, you will see logs similar to the following. You can now process long multimodal contexts up to the "max tokens in paged KV cache" value, and the context prefill phase is executed in chunks.
+```bash
+[TensorRT-LLM][INFO] Memory usage when calculating max tokens in paged kv cache: total: 93.10 GiB, available: 85.57 GiB
+...
+[TensorRT-LLM][INFO] [MemUsageChange] Allocated 77.02 GiB for max tokens in paged KV cache (315488).
+```
diff --git a/docs/whisper.md b/docs/whisper.md
new file mode 100644
index 00000000..29f33af0
--- /dev/null
+++ b/docs/whisper.md
@@ -0,0 +1,142 @@
+# End to end workflow to run a Whisper model
+
+### Support Matrix
+The following speech recognition models are supported in tensorrtllm_backend:
+* Whisper
+* Distil-Whisper
+
+## Run Whisper with single-GPU Tritonserver
+### Tritonserver setup steps
+0. Make sure that you have initialized the TRT-LLM submodule:
+
+    ```bash
+    git clone https://github.com/triton-inference-server/tensorrtllm_backend.git && cd tensorrtllm_backend
+    git lfs install
+    git submodule update --init --recursive
+    ```
+
+1. Start the Triton Server Docker container:
+
+    1-1. If you're using Tritonserver from nvcr.io
+    ```bash
+    # Replace <xx.yy> with the version of Triton you want to use.
+    # The command below assumes the current directory is the
+    # TRT-LLM backend root git repository.
+
+    docker run --rm -ti --net=host -v `pwd`:/mnt -w /mnt --gpus all nvcr.io/nvidia/tritonserver:<xx.yy>-trtllm-python-py3 bash
+    ```
+    1-2. If you are using the `tensorrtllm_backend` container:
+    ```bash
+    docker run --rm -ti --net=host -v `pwd`:/mnt -w /mnt --gpus all triton_trt_llm
+    ```
+
+2. Build the engine:
+
+    2-1. Download the whisper models
+    ```bash
+    wget --directory-prefix=assets https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/multilingual.tiktoken
+    wget --directory-prefix=assets https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz
+    wget --directory-prefix=assets https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav
+    # take the large-v3 model as an example
+    wget --directory-prefix=assets https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt
+    ```
+    2-2. Build TensorRT-LLM engines
+    ```bash
+    INFERENCE_PRECISION=float16
+    MAX_BEAM_WIDTH=4
+    MAX_BATCH_SIZE=64
+    checkpoint_dir=tllm_checkpoint
+    output_dir=whisper_large_v3_max_batch_${MAX_BATCH_SIZE}
+
+    python3 convert_checkpoint.py --model_dir ${MODEL_DIR} --output_dir ${checkpoint_dir}
+
+    trtllm-build --checkpoint_dir ${checkpoint_dir}/encoder \
+                 --output_dir ${output_dir}/encoder \
+                 --moe_plugin disable \
+                 --max_batch_size ${MAX_BATCH_SIZE} \
+                 --gemm_plugin disable \
+                 --bert_attention_plugin ${INFERENCE_PRECISION} \
+                 --max_input_len 3000 --max_seq_len=3000
+
+    trtllm-build --checkpoint_dir ${checkpoint_dir}/decoder \
+                 --output_dir ${output_dir}/decoder \
+                 --moe_plugin disable \
+                 --max_beam_width ${MAX_BEAM_WIDTH} \
+                 --max_batch_size ${MAX_BATCH_SIZE} \
+                 --max_seq_len 114 \
+                 --max_input_len 14 \
+                 --max_encoder_input_len 3000 \
+                 --gemm_plugin ${INFERENCE_PRECISION} \
+                 --bert_attention_plugin ${INFERENCE_PRECISION} \
+                 --gpt_attention_plugin ${INFERENCE_PRECISION}
+    ```
+
+    > **NOTE**:
+    >
+    > TensorRT-LLM also supports [distil-whisper](https://github.com/huggingface/distil-whisper) models; first convert their parameters and weights from the Hugging Face naming format to the [openai whisper](https://github.com/openai/whisper) naming format. You can do so by running the script [distil_whisper/convert_from_distil_whisper.py](./convert_from_distil_whisper.py).
+
+3.
Prepare Tritonserver configs + + ```bash + cp tensorrt_llm/triton_backend/all_models/whisper/ model_repo_whisper -r + cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm model_repo_whisper -r + wget --directory-prefix=model_repo_whisper/whisper_bls/1 https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/multilingual.tiktoken + wget --directory-prefix=model_repo_whisper/whisper_bls/1 https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz + + BACKEND=tensorrtllm + DECOUPLED_MODE=false + DECODER_ENGINE_PATH=${output_dir}/decoder + ENCODER_ENGINE_PATH=${output_dir}/encoder + MAX_TOKENS_IN_KV_CACHE=24000 + BATCHING_STRATEGY=inflight_fused_batching + KV_CACHE_FREE_GPU_MEM_FRACTION=0.5 + EXCLUDE_INPUT_IN_OUTPUT=True + TRITON_MAX_BATCH_SIZE=8 + MAX_QUEUE_DELAY_MICROSECONDS=0 + MAX_BEAM_WIDTH=1 + MAX_QUEUE_SIZE="0" + ENABLE_KV_CACHE_REUSE=false + ENABLE_CHUNKED_CONTEXT=false + CROSS_KV_CACHE_FRACTION="0.5" + n_mels=128 + zero_pad=false + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i model_repo_whisper/tensorrt_llm/config.pbtxt triton_backend:${BACKEND},engine_dir:${DECODER_ENGINE_PATH},encoder_engine_dir:${ENCODER_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE},max_queue_size:${MAX_QUEUE_SIZE},enable_context_fmha_fp32_acc:${ENABLE_CONTEXT_FMHA_FP32_ACC},cross_kv_cache_fraction:${CROSS_KV_CACHE_FRACTION},encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32 + + python3 tensorrt_llm/triton_backend/tools/fill_template.py -i model_repo_whisper/whisper_bls/config.pbtxt engine_dir:${ENCODER_ENGINE_PATH},n_mels:$n_mels,zero_pad:$zero_pad,triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE} + ``` + > **NOTE**: + > + > TODO: You can set the `decoupled_mode` option to True to use streaming mode. + +4. Launch Tritonserver + + ```bash + python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=model_repo_whisper/ --tensorrt_llm_model_name tensorrt_llm,whisper_bls --multimodal_gpu0_cuda_mem_pool_bytes 300000000 + ``` + +### Send requests +1. Send request with a single audio file + ```bash + wget -nc https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav + # Test non-streaming + python3 tensorrt_llm/triton_backend/whisper/client.py --audio-path 1221-135766-0002.wav + ``` +2. 
Send requests with a whole audio dataset + ```bash + git clone https://github.com/yuekaizhang/Triton-ASR-Client.git + cd Triton-ASR-Client + num_task=16 + python3 tensorrt_llm/triton_backend/whisper/client.py \ + --server-addr localhost \ + --model-name whisper_bls \ + --num-tasks $num_task \ + --text-prompt "<|startoftranscript|><|zh|><|transcribe|><|notimestamps|>" \ + --manifest-dir ./datasets/aishell1_test \ + --compute-cer + ``` +### Kill the server +```bash +pkill tritonserver +``` diff --git a/inflight_batcher_llm/CMakeLists.txt b/inflight_batcher_llm/CMakeLists.txt deleted file mode 100644 index 21d0c711..00000000 --- a/inflight_batcher_llm/CMakeLists.txt +++ /dev/null @@ -1,347 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: * -# Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. * Redistributions in binary -# form must reproduce the above copyright notice, this list of conditions and -# the following disclaimer in the documentation and/or other materials provided -# with the distribution. * Neither the name of NVIDIA CORPORATION nor the names -# of its contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY EXPRESS -# OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO -# EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, -# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -cmake_minimum_required(VERSION 3.17) -include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/set_ifndef.cmake) - -set_ifndef(TRTLLM_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm) - -include_directories(${TRTLLM_DIR} ${TRTLLM_DIR}/cpp/include) - -include(${TRTLLM_DIR}/cpp/cmake/modules/find_library_create_target.cmake) - -project(tritontensorrtllmbackend LANGUAGES C CXX) - -add_compile_options("-DENABLE_MULTI_DEVICE=1") -# https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html -option(USE_CXX11_ABI "Using CXX11 ABI of libstdc++" OFF) -message(STATUS "USE_CXX11_ABI: ${USE_CXX11_ABI}") -if(USE_CXX11_ABI) - add_compile_options("-D_GLIBCXX_USE_CXX11_ABI=1") -else() - add_compile_options("-D_GLIBCXX_USE_CXX11_ABI=0") -endif() - -# -# Options -# -# Must include options required for this project as well as any projects -# included in this one by FetchContent. -# -# TRITON_ENABLE_GPU is set to OFF as currently the code does not use any GPU -# related features since TRT-LLM backend manages the usage on GPUs itself. 
-option(TRITON_ENABLE_GPU "Enable GPU support in backend" OFF) -option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) -option(TRITON_ENABLE_METRICS "Include metrics support in server" ON) -option(BUILD_TESTS "Build Google tests" OFF) - -if(TRITON_ENABLE_METRICS AND NOT TRITON_ENABLE_STATS) - message( - FATAL_ERROR "TRITON_ENABLE_METRICS=ON requires TRITON_ENABLE_STATS=ON") -endif() - -set(TRITON_COMMON_REPO_TAG - "main" - CACHE STRING "Tag for triton-inference-server/common repo") -set(TRITON_CORE_REPO_TAG - "main" - CACHE STRING "Tag for triton-inference-server/core repo") -set(TRITON_BACKEND_REPO_TAG - "main" - CACHE STRING "Tag for triton-inference-server/backend repo") - -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Release) -endif() - -set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDA_PATH}/include) -message(STATUS "COMMON_HEADER_DIRS: ${COMMON_HEADER_DIRS}") - -# -# Dependencies -# -# FetchContent requires us to include the transitive closure of all repos that -# we depend on so that we can override the tags. -# -include(FetchContent) - -FetchContent_Declare( - repo-common - GIT_REPOSITORY https://github.com/triton-inference-server/common.git - GIT_TAG ${TRITON_COMMON_REPO_TAG} - GIT_SHALLOW ON) -FetchContent_Declare( - repo-core - GIT_REPOSITORY https://github.com/triton-inference-server/core.git - GIT_TAG ${TRITON_CORE_REPO_TAG} - GIT_SHALLOW ON) -FetchContent_Declare( - repo-backend - GIT_REPOSITORY https://github.com/triton-inference-server/backend.git - GIT_TAG ${TRITON_BACKEND_REPO_TAG} - GIT_SHALLOW ON) -FetchContent_MakeAvailable(repo-common repo-core repo-backend) - -# -# The backend must be built into a shared library. Use an ldscript to hide all -# symbols except for the TRITONBACKEND API. -# -configure_file(src/libtriton_tensorrtllm.ldscript - libtriton_tensorrtllm.ldscript COPYONLY) - -set(COMMON_SRCS src/model_instance_state.cc src/model_state.cc src/utils.cc) - -add_library(triton-tensorrt-llm-common SHARED ${COMMON_SRCS}) - -set(BACKEND_SRCS src/libtensorrtllm.cc) - -add_library(triton-tensorrt-llm-backend SHARED ${BACKEND_SRCS}) - -enable_language(CUDA) - -find_package(CUDA ${CUDA_REQUIRED_VERSION} REQUIRED) -find_package(Python3 COMPONENTS Interpreter Development) - -find_library( - tensorrt_llm libtensorrt_llm.so REQUIRED - PATHS ${Python3_SITEARCH}/tensorrt_llm/libs - ${TRTLLM_DIR}/cpp/build/tensorrt_llm - ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm/cpp/build/tensorrt_llm) - -find_library( - nvinfer_plugin_tensorrt_llm libnvinfer_plugin_tensorrt_llm.so REQUIRED - PATHS - ${Python3_SITEARCH}/tensorrt_llm/libs - ${TRTLLM_DIR}/cpp/build/tensorrt_llm/plugins - ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm/cpp/build/tensorrt_llm/plugins) - -find_program( - TRTLLM_EXECUTOR_WORKER executorWorker REQUIRED - PATHS - ${Python3_SITEARCH}/tensorrt_llm/bin - ${TRTLLM_DIR}/cpp/build/tensorrt_llm/executor_worker - ${CMAKE_CURRENT_SOURCE_DIR}/../tensorrt_llm/cpp/build/tensorrt_llm/executor_worker -) -install( - PROGRAMS ${TRTLLM_EXECUTOR_WORKER} - DESTINATION ${CMAKE_BINARY_DIR} - RENAME trtllmExecutorWorker) - -find_library( - CUDNN_LIB cudnn - HINTS ${CUDA_TOOLKIT_ROOT_DIR} ${CUDNN_ROOT_DIR} - PATH_SUFFIXES lib64 lib) -find_library( - CUBLAS_LIB cublas - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib64 lib lib/stubs) -find_library( - CUBLASLT_LIB cublasLt - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib64 lib lib/stubs) -find_library( - CUDART_LIB cudart - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib lib64) -find_library( - CUDA_DRV_LIB 
cuda - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib lib64 lib/stubs lib64/stubs) -find_library( - NVIDIA_ML_LIB nvidia-ml - HINTS ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib lib64 lib/stubs lib64/stubs) -set(CUDA_LIBRARIES ${CUDART_LIB} ${NVIDIA_ML_LIB}) - -find_package(MPI REQUIRED) -message(STATUS "Using MPI_INCLUDE_PATH: ${MPI_INCLUDE_PATH}") -message(STATUS "Using MPI_LIBRARIES: ${MPI_LIBRARIES}") - -# NCCL dependencies -set_ifndef(NCCL_LIB_DIR /usr/lib/x86_64-linux-gnu/) -set_ifndef(NCCL_INCLUDE_DIR /usr/include/) -find_library(NCCL_LIB nccl HINTS ${NCCL_LIB_DIR}) - -# TRT_LIB_DIR and TRT_INCLUDE_DIR should be aligned with the path in the -# environment_setup.sh script -set_ifndef(TRT_LIB_DIR - /usr/local/tensorrt/targets/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu/lib) -set_ifndef( - TRT_INCLUDE_DIR - /usr/local/tensorrt/targets/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu/include) - -set(TRT_LIB nvinfer) -find_library_create_target(${TRT_LIB} nvinfer SHARED ${TRT_LIB_DIR}) - -file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" VERSION_STRINGS - REGEX "#define NV_TENSORRT_.*") -foreach(TYPE MAJOR MINOR PATCH BUILD) - string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]+" TRT_TYPE_STRING - ${VERSION_STRINGS}) - string(REGEX MATCH "[0-9]+" TRT_${TYPE} ${TRT_TYPE_STRING}) -endforeach(TYPE) - -foreach(TYPE MAJOR MINOR PATCH) - string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]+" TRT_TYPE_STRING - ${VERSION_STRINGS}) - string(REGEX MATCH "[0-9]+" TRT_SO_${TYPE} ${TRT_TYPE_STRING}) -endforeach(TYPE) - -set(TRT_VERSION - "${TRT_MAJOR}.${TRT_MINOR}.${TRT_PATCH}" - CACHE STRING "TensorRT project version") -set(TRT_SOVERSION - "${TRT_SO_MAJOR}" - CACHE STRING "TensorRT library so version") -message( - STATUS - "Building for TensorRT version: ${TRT_VERSION}, library version: ${TRT_SOVERSION}" -) - -if(${TRT_MAJOR} GREATER_EQUAL 10) - add_definitions("-DTRT_LLM_USE_DIM64") - message( - STATUS "TensorRT version ${TRT_MAJOR} >= 10, int64 dimension is enabled") -endif() - -list(APPEND COMMON_HEADER_DIRS ${TORCH_INCLUDE_DIRS} ${TRT_INCLUDE_DIR}) -include_directories(${COMMON_HEADER_DIRS}) - -target_include_directories( - triton-tensorrt-llm-common - PUBLIC ${TRTLLM_DIR}/cpp - ${TRTLLM_DIR}/cpp/include - ${CMAKE_CURRENT_SOURCE_DIR}/src - ${CUDA_INCLUDE_DIRS} - ${CUDNN_ROOT_DIR}/include - ${NCCL_INCLUDE_DIR} - ${3RDPARTY_DIR}/cutlass/include - ${MPI_INCLUDE_PATH} - ${COMMON_HEADER_DIR}) - -target_compile_features(triton-tensorrt-llm-common PRIVATE cxx_std_17) -target_compile_features(triton-tensorrt-llm-backend PRIVATE cxx_std_17) - -set(COMPILE_OPTIONS - $<$,$,$>: - -Wall - -Wextra - -Wno-unused-parameter - -Wno-deprecated-declarations - -Wno-type-limits> - $<$:/Wall - /D_WIN32_WINNT=0x0A00 - /EHsc>) - -target_compile_options(triton-tensorrt-llm-common PRIVATE ${COMPILE_OPTIONS}) -target_compile_options(triton-tensorrt-llm-backend PRIVATE ${COMPILE_OPTIONS}) - -if(TRITON_ENABLE_METRICS) - list(APPEND REPORTER_SRCS - src/custom_metrics_reporter/custom_metrics_reporter.cc) - list(APPEND REPORTER_HDRS - src/custom_metrics_reporter/custom_metrics_reporter.h) - - add_library(triton-custom-metrics-reporter-library EXCLUDE_FROM_ALL - ${REPORTER_SRCS} ${REPORTER_HDRS}) - target_compile_features(triton-custom-metrics-reporter-library - PRIVATE cxx_std_17) - if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - target_compile_options(triton-custom-metrics-reporter-library - PRIVATE /W1 /D_WIN32_WINNT=0x0A00 /EHsc) - else() - target_compile_options( - triton-custom-metrics-reporter-library - PRIVATE -Wall -Wextra -Wno-unused-parameter 
-Wno-deprecated-declarations - -Werror) - endif() - - set_target_properties(triton-custom-metrics-reporter-library - PROPERTIES POSITION_INDEPENDENT_CODE ON) - - target_link_libraries( - triton-custom-metrics-reporter-library - PUBLIC triton-common-json # from repo-common - triton-common-logging # from repo-common - triton-core-serverapi # from repo-core - triton-core-serverstub # from repo-core - triton-backend-utils # from repo-backend - ${tensorrt_llm}) - - target_compile_definitions(triton-tensorrt-llm-common - PRIVATE TRITON_ENABLE_METRICS=1) - target_link_libraries(triton-tensorrt-llm-common - PRIVATE triton-custom-metrics-reporter-library) -endif() - -target_link_libraries( - triton-tensorrt-llm-common - PUBLIC ${tensorrt_llm} - triton-core-serverapi # from repo-core - triton-core-backendapi # from repo-core - triton-core-serverstub # from repo-core - triton-backend-utils # from repo-backend - ${MPI_LIBRARIES} - ${CUDA_LIBRARIES} - nvinfer - ${nvinfer_plugin_tensorrt_llm}) - -target_link_libraries(triton-tensorrt-llm-backend - PRIVATE triton-tensorrt-llm-common) - -FetchContent_Declare( - json - GIT_REPOSITORY https://github.com/nlohmann/json.git - GIT_TAG v3.11.2) - -FetchContent_MakeAvailable(json) - -target_link_libraries(triton-tensorrt-llm-common - PRIVATE nlohmann_json::nlohmann_json) - -if(WIN32) - set_target_properties( - triton-tensorrt-llm-backend PROPERTIES POSITION_INDEPENDENT_CODE ON - OUTPUT_NAME triton_tensorrtllm) - set_target_properties( - triton-tensorrt-llm-common PROPERTIES POSITION_INDEPENDENT_CODE ON - OUTPUT_NAME triton_tensorrtllm_common) -else() - set_target_properties( - triton-tensorrt-llm-backend - PROPERTIES - POSITION_INDEPENDENT_CODE ON - OUTPUT_NAME triton_tensorrtllm - LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_tensorrtllm.ldscript - LINK_FLAGS - "-Wl,--version-script libtriton_tensorrtllm.ldscript -Wl,-rpath,'$ORIGIN' -Wl,--no-undefined" - ) - set_target_properties( - triton-tensorrt-llm-common PROPERTIES POSITION_INDEPENDENT_CODE ON - OUTPUT_NAME triton_tensorrtllm_common) -endif() - -if(BUILD_TESTS) - enable_testing() - add_subdirectory(tests) -endif() diff --git a/inflight_batcher_llm/README.md b/inflight_batcher_llm/README.md deleted file mode 100644 index 849c1947..00000000 --- a/inflight_batcher_llm/README.md +++ /dev/null @@ -1,379 +0,0 @@ -# Instructions to run TRT-LLM in-flight batching Triton backend: - -## Build TensorRT-LLM engine for inflight batching - -To configure a Triton server that runs a model using TensorRT-LLM, it is needed to compile a TensorRT-LLM engine for that model. - -For example, for LLaMA 7B, change to the `tensorrt_llm/examples/llama` directory: - -``` -cd tensorrt_llm/examples/llama -``` -Prepare the checkpoint of the model by following the instructions [here](https://huggingface.co/docs/transformers/main/en/model_doc/llama) and store it in a model directory. Then, create the engine: - -``` -python build.py --model_dir ${model_directory} \ - --dtype bfloat16 \ - --use_gpt_attention_plugin bfloat16 \ - --use_inflight_batching \ - --paged_kv_cache \ - --remove_input_padding \ - --use_gemm_plugin bfloat16 \ - --output_dir engines/bf16/1-gpu/ -``` - -To disable the support for in-flight batching (i.e. use the V1 batching mode), remove `--use_inflight_batching`. 
- -Similarly, for a GPT model, change to `tensorrt_llm/examples/gpt` directory: -``` -cd tensorrt_llm/examples/gpt - -``` -Prepare the model checkpoint following the instructions in the README file, store it in a model directory and build the TRT engine with: - -``` -python3 build.py --model_dir=${model_directory} \ - --dtype float16 \ - --use_inflight_batching \ - --use_gpt_attention_plugin float16 \ - --paged_kv_cache \ - --use_gemm_plugin float16 \ - --remove_input_padding \ - --hidden_act gelu \ - --output_dir=engines/fp16/1-gpu -``` - -## Create a model repository folder - -First run: -``` -rm -rf triton_model_repo -mkdir triton_model_repo -cp -R all_models/inflight_batcher_llm/* triton_model_repo -``` - -Then copy the TRT engine to `triton_model_repo/tensorrt_llm/1/`. For example for the LLaMA 7B example above, run: - -``` -cp -R tensorrt_llm/examples/llama/engines/bf16/1-gpu/ triton_model_repo/tensorrt_llm/1 -``` - -For the GPT example above, run: -``` -cp -R tensorrt_llm/examples/gpt/engines/fp16/1-gpu/ triton_model_repo/tensorrt_llm/1 -``` - - -Edit the `triton_model_repo/tensorrt_llm/config.pbtxt` file and replace `${decoupled_mode}` with `True` or `False`, and `${engine_dir}` with `/triton_model_repo/tensorrt_llm/1/1-gpu/` since the `triton_model_repo` folder created above will be mounted to `/triton_model_repo` in the Docker container. Decoupled mode must be set to true if using the streaming option from the client. - - -To use V1 batching, the `config.pbtxt` should have: -``` -parameters: { - key: "gpt_model_type" - value: { - string_value: "V1" - } -} -``` - -For in-flight batching, use: -``` -parameters: { - key: "gpt_model_type" - value: { - string_value: "inflight_fused_batching" - } -} -``` - -Note that the parameter `enable_trt_overlap` has been removed from the `config.pbtxt`. This option allowed to overlap execution of two micro-batches to hide CPU overhead. -Optimization work has been done to reduce the CPU overhead and it was found that the overlapping of micro-batches did not provide additional benefits. - -To reuse previously computed KV cache values (e.g. for system prompt), set `enable_kv_cache_reuse` -parameter to `True` in the `config.pbtxt` file: - -``` -parameters: { - key: "enable_kv_cache_reuse" - value: { - string_value: "True" - } -} -``` - -Or, equivalently, add `enable_kv_cache_reuse:True` to the invocation of the -`fill_template.py` tool: - -```bash -python3 tools/fill_template.py -i all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt "enable_kv_cache_reuse:True" -``` - -## Launch the Triton server container using the model_repository you just created - -``` -docker run --rm -it --net host --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --gpus='"'device=0'"' -v $(pwd)/triton_model_repo:/triton_model_repo tritonserver:w_trt_llm_backend /bin/bash -c "tritonserver --model-repository=/triton_model_repo" -``` - -## Run the provided client to send a request - -You can test the inflight batcher server with the provided reference python client as following: -``` -python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 -``` - -You can also stop the generation process early by using the `--stop-after-ms` option to send a stop request after a few milliseconds: - -``` -python inflight_batcher_llm_client.py --stop-after-ms 200 --request-output-len 200 -``` - -You will find that the generation process is stopped early and therefore the number of generated tokens is lower than 200. 
- -You can have a look at the client code to see how early stopping is achieved. - -## Running LoRA inference with inflight batching - -Build a model with LoRA enable - - -``` -BASE_LLAMA_MODEL=llama-7b-hf/ - -python convert_checkpoint.py --model_dir ${BASE_LLAMA_MODEL} \ - --output_dir ./tllm_checkpoint_1gpu \ - --dtype float16 - -trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu \ - --output_dir /tmp/llama_7b_with_lora_qkv/trt_engines/fp16/1-gpu/ \ - --gemm_plugin float16 \ - --max_batch_size 8 \ - --max_input_len 512 \ - --max_output_len 50 \ - --gpt_attention_plugin float16 \ - --paged_kv_cache enable \ - --remove_input_padding enable \ - --use_paged_context_fmha enable \ - --use_custom_all_reduce disable - --lora_plugin float16 \ - --lora_target_modules attn_q attn_k attn_v \ - --max_lora_rank 8 -``` - -Create a Triton model repository and launch the Triton server as described above. - -Now generate LoRA tensors that will be passed in with each request to triton. - -``` -git-lfs clone https://huggingface.co/qychen/luotuo-lora-7b-0.1 -git-lfs clone https://huggingface.co/kunishou/Japanese-Alpaca-LoRA-7b-v0 - -python3 tensorrt_llm/examples/hf_lora_convert.py -i Japanese-Alpaca-LoRA-7b-v0 -o Japanese-Alpaca-LoRA-7b-v0-weights --storage-type float16 -python3 tensorrt_llm/examples/hf_lora_convert.py -i luotuo-lora-7b-0.1 -o luotuo-lora-7b-0.1-weights --storage-type float16 -``` - -### LoRA Cache - -As LoRA weights are passed to the backend they will be cached in a host cache. As requests are scheduled, those weights with be prefetched to a gpu cache. After a LoRA is loaded into the cache, only `lora_task_id` is needed for inference. - - -Optimal adapter size used to size cache pages. Typically optimally sized adapters will fix exactly into 1 cache page. (default: 8) -``` -parameters: { - key: "lora_cache_optimal_adapter_size" - value: { - string_value: "${lora_cache_optimal_adapter_size}" - } -} -``` - - -Used to set the minimum size of a cache page. Pages must be at least large enough to fit a single module, single later adapter_size `maxAdapterSize` row of weights. (default: 64) -``` -parameters: { - key: "lora_cache_max_adapter_size" - value: { - string_value: "${lora_cache_max_adapter_size}" - } -} -``` - -Fraction of GPU memory used for LoRA cache. Computed as a fraction of left over memory after engine load, and after KV cache is loaded (default: 0.05) -``` -parameters: { - key: "lora_cache_gpu_memory_fraction" - value: { - string_value: "${lora_cache_gpu_memory_fraction}" - } -} -``` - -Size of host LoRA cache in bytes (default: 1G) -``` -parameters: { - key: "lora_cache_host_memory_bytes" - value: { - string_value: "${lora_cache_host_memory_bytes}" - } -} -``` - -Launch tritonserver as describe above - -Run Multi-LoRA example by issuing multiple concurrent requests. -The inflight batcher will execute mixed batches with multiple LoRAs in the same batch. - -First we cache the LoRAs by sending dummy requests for each adapter. 
The TASK_IDS are uniq to the adapter - -``` -TASK_IDS=("1" "2") -LORA_PATHS=("luotuo-lora-7b-0.1-weights" "Japanese-Alpaca-LoRA-7b-v0-weights") - -for index in ${!TASK_IDS[@]}; do - text="dummy" - lora_path=${LORA_PATHS[$index]} - task_id=${TASK_IDS[$index]} - lora_arg="--lora-path ${lora_path} --lora-task-id ${task_id}" - - python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py \ - --top-k 0 \ - --top-p 0.5 \ - --request-output-len 10 \ - --text "${text}" \ - --tokenizer-dir /home/scratch.trt_llm_data/llm-models/llama-models/llama-7b-hf \ - ${lora_arg} & -done -``` - -Now perform inference with just `--lora-task-id` - -``` -INPUT_TEXT=("美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:") -TASK_IDS=("" "1" "2" "" "1" "2") - -for index in ${!INPUT_TEXT[@]}; do - text=${INPUT_TEXT[$index]} - task_id=${TASK_IDS[$index]} - lora_arg="" - if [ "${task_id}" != "" ]; then - lora_arg="--lora-task-id ${task_id}" - fi - - python3 inflight_batcher_llm/client/inflight_batcher_llm_client.py \ - --top-k 0 \ - --top-p 0.5 \ - --request-output-len 10 \ - --text "${text}" \ - --tokenizer-dir /home/scratch.trt_llm_data/llm-models/llama-models/llama-7b-hf \ - ${lora_arg} & -done - -wait -``` - -Example Output: -``` -Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] -Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] -Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] -Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] -Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901] -Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901] -Got completed request -Input: アメリカ合衆国の首都はどこですか? \n答え: -Output beam 0: ワシントン D.C. -Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 29871, 31028, 30373, 30203, 30279, 30203, 360, 29889, 29907, 29889] -Got completed request -Input: 美国的首都在哪里? \n答案: -Output beam 0: Washington, D.C. -What is the -Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 7660, 29892, 360, 29889, 29907, 29889, 13, 5618, 338, 278] -Got completed request -Input: 美国的首都在哪里? \n答案: -Output beam 0: Washington D.C. -Washington D. -Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 7660, 360, 29889, 29907, 29889, 13, 29956, 7321, 360, 29889] -Got completed request -Input: アメリカ合衆国の首都はどこですか? \n答え: -Output beam 0: Washington, D.C. 
-Which of -Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 7660, 29892, 360, 29889, 29907, 29889, 13, 8809, 436, 310] -Got completed request -Input: アメリカ合衆国の首都はどこですか? \n答え: -Output beam 0: Washington D.C. -1. ア -Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 7660, 360, 29889, 29907, 29889, 13, 29896, 29889, 29871, 30310] -Got completed request -Input: 美国的首都在哪里? \n答案: -Output beam 0: 华盛顿 -W -Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 29871, 31266, 234, 158, 158, 236, 164, 194, 13, 29956] -``` - -## Run the e2e/benchmark_core_model to benchmark - -### End to end test -End to end test script sends requests to deployed ensemble model. - -Ensemble model is ensembled by three models: preprocessing, tensorrt_llm and postprocessing. -* preprocessing: Tokenizing, meaning the conversion from prompts(string) to input_ids(list of ints). -* tensorrt_llm: Inferencing. -* postprocessing: De-tokenizing, meaning the conversion from output_ids(list of ints) to outputs(string). - -The end to end latency includes the total latency of the three parts of an ensemble model. - -``` -cd tools/inflight_batcher_llm -python3 end_to_end_test.py --dataset -``` -Expected outputs -``` -[INFO] Functionality test succeed. -[INFO] Warm up for benchmarking. -[INFO] Start benchmarking on 125 prompts. -[INFO] Total Latency: 11099.243 ms -``` - -### benchmark core model - -benchmark_core_model script sends requests directly to deployed tensorrt_llm model, the benchmark core model latency indicates the inference latency of TensorRT-LLM, not including the pre/post-processing latency which is usually handled by a third-party library such as HuggingFace. - -benchmark_core_model can generate traffic from 2 sources. -1 - dataset (json file containning prompts and optional responses) -2 - token normal distribution (user specified input, output seqlen) - -By default, the test uses exponential distrution to control arrival rate of requests. It can be changed to constant arrival time. - -``` -cd tools/inflight_batcher_llm -``` -Example: Run dataset with 10 req/sec requested rate with provided tokenizer. -``` -python3 benchmark_core_model.py -i grpc --request_rate 10 dataset --dataset --tokenizer_dir <> --num_requests 5000 -``` -Example: Generate I/O seqlen tokens with input normal distribution with mean_seqlen=128, stdev=10. Output normal distribution with mean_seqlen=20, stdev=2. Set stdev=0 to get constant seqlens. -``` -python3 benchmark_core_model.py -i grpc --request_rate 10 token_norm_dist --input_mean 128 --input_stdev 5 --output_mean 20 --output_stdev 2 --num_requests 5000 -``` -Expected outputs -``` -[INFO] Warm up for benchmarking. -[INFO] Start benchmarking on 5000 prompts. -[INFO] Total Latency: 26585.349 ms -[INFO] Total request latencies: 11569672.000999955 ms -+----------------------------+----------+ -| Stat | Value | -+----------------------------+----------+ -| Requests/Sec | 188.09 | -| OP tokens/sec | 3857.66 | -| Avg. latency (ms) | 2313.93 | -| P99 latency (ms) | 3624.95 | -| P90 latency (ms) | 3127.75 | -| Avg. IP tokens per request | 128.53 | -| Avg. 
OP tokens per request | 20.51 | -| Total latency (ms) | 26582.72 | -| Total requests | 5000.00 | -+----------------------------+----------+ - -``` -*Please note that the expected outputs in that document are only for reference, specific performance numbers depend on the GPU you're using.* diff --git a/inflight_batcher_llm/client/README.md b/inflight_batcher_llm/client/README.md deleted file mode 100644 index 9b3bea05..00000000 --- a/inflight_batcher_llm/client/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# Sample TRT-LLM backend clients -Three sample TRT-LLM Triton clients are provided with the TRT-LLM Triton backend implementation. -* `e2e_grpc_speculative_decoding_client.py`: Demonstrates how to orchestrate between two independent TRT-LLM models - a draft model and a target model to achiever faster inferencing using speculative decoding. The high level design involves the client making a call to the draft model requesting a certain number of draft tokens, and then associating those draft tokens with a request to the target model. The target model returns some number of completion tokens internally leveraging the draft tokens to speed up inference. The client wraps these back-to-back calls to draft and target models in a loop to complete the full generation. -Example command: -``` -python3 e2e_grpc_speculative_decoding_client.py -p "The only thing we have to fear is" \ - --url-draft ${DRAFT_MODEL_URL} \ - --url-target ${TARGET_MODEL_URL} -``` -To get draft model draft tokens's logits, you need to enable `gather_generation_logits` when building then engine, and add `--return-draft-model-draft-logits` when running `e2e_grpc_speculative_decoding_client.py`. - -To get the target model accepted tokens's logits, you need to enable `gather_generation_logits` when building the engine, and add `--return-target-model-accepted-token-logits` when running `e2e_grpc_speculative_decoding_client.py`. - - -* `end_to_end_grpc_client.py`: Demonstrates sending a single request to a tritonserver running an ensemble including preprocessor (tokenizer), TRT-LLM model and postprocessor (detokenizer) and getting back a completion from it. -Example command: -``` -python3 end_to_end_grpc_client.py \ - --streaming --output-len 10 \ - --prompt "The only thing we have to fear is" - -``` -* `inflight_batcher_llm_client.py`: Isolates queries and responses to the TRT-LLM model alone. Invokes tokenizer and detokenizer in the client script i.e. outside the server running inference. 
-Example command: -``` -python3 inflight_batcher_llm_client.py \ - --tokenizer-dir ${TOKENIZER_PATH} \ - --tokenizer-type ${TOKENIZER_TYPE} \ - --input-tokens-csv=${LOGDIR}/prompts.csv \ - --output-tokens-csv=${LOGDIR}/completions.csv -``` diff --git a/inflight_batcher_llm/client/__init__.py b/inflight_batcher_llm/client/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/inflight_batcher_llm/client/e2e_grpc_speculative_decoding_client.py b/inflight_batcher_llm/client/e2e_grpc_speculative_decoding_client.py deleted file mode 100644 index ddcba288..00000000 --- a/inflight_batcher_llm/client/e2e_grpc_speculative_decoding_client.py +++ /dev/null @@ -1,564 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) - -import argparse -import queue -import sys - -import numpy as np -import tritonclient.grpc as grpcclient -from tritonclient.utils import InferenceServerException, np_to_triton_dtype - - -def prepare_tensor(name, input): - t = grpcclient.InferInput(name, input.shape, - np_to_triton_dtype(input.dtype)) - t.set_data_from_numpy(input) - return t - - -class UserData: - - def __init__(self): - self._completed_requests = queue.Queue() - - -def callback(user_data, result, error): - if error: - user_data._completed_requests.put(error) - else: - user_data._completed_requests.put(result) - output = result.as_numpy('text_output') - print(output, flush=True) - - -def get_preprocessor_inputs(prompt, output_len, bad_words, stop_words, end_id, - pad_id): - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.int32) * output_len - - preprocessor_inputs = [ - prepare_tensor("QUERY", input0_data), - prepare_tensor("REQUEST_OUTPUT_LEN", output0_len), - ] - - if bad_words: - bad_words_list = np.array([bad_words], dtype=object) - preprocessor_inputs += [ - prepare_tensor("BAD_WORDS_DICT", bad_words_list) - ] - - if stop_words: - stop_words_list = np.array([stop_words], dtype=object) - preprocessor_inputs += [ - prepare_tensor("STOP_WORDS_DICT", stop_words_list) - ] - - if end_id: - end_id_data = np.array([[end_id]], dtype=np.int32) - preprocessor_inputs += [prepare_tensor("END_ID", end_id_data)] - - if pad_id: - pad_id_data = np.array([[pad_id]], dtype=np.int32) - preprocessor_inputs += [prepare_tensor("PAD_ID", pad_id_data)] - - return preprocessor_inputs - - -def extract_preprocessor_outputs(result): - - input_ids = np.squeeze(result.as_numpy("INPUT_ID").astype(np.int32), - axis=0) - bad_words_ids = result.as_numpy("BAD_WORDS_IDS").astype(np.int32) - stop_words_ids = result.as_numpy("STOP_WORDS_IDS").astype(np.int32) - end_id = result.as_numpy("OUT_END_ID").astype(np.int32)[0][0] - pad_id = result.as_numpy("OUT_PAD_ID").astype(np.int32)[0][0] - - return input_ids, bad_words_ids, stop_words_ids, end_id, pad_id - - -def get_trtllm_inputs(input_ids, - input_length, - request_output_len, - draft_tokens, - beam_width, - temperature, - repetition_penalty, - presence_penalty, - frequency_penalty, - bad_words_ids, - stop_words_ids, - end_id, - pad_id, - return_draft_model_draft_logits=False, - return_target_model_accepted_token_logits=False): - - # These two flags correspond to the settings of draft model and target model respectively, - # and only one of them can be true at a time. 
- assert not (return_draft_model_draft_logits - and return_target_model_accepted_token_logits) - - # input_ids is expected to have shape [input_length] - # Add batch dimension of 1 - input_ids_data = np.expand_dims(input_ids, axis=0) - inputs = [ - prepare_tensor("input_ids", input_ids_data), - prepare_tensor("input_lengths", - np.array([[input_length]], dtype=np.int32)), - prepare_tensor("request_output_len", - np.array([[request_output_len]], dtype=np.int32)), - prepare_tensor("bad_words_list", bad_words_ids), - prepare_tensor("stop_words_list", stop_words_ids), - prepare_tensor("beam_width", np.array([[beam_width]], dtype=np.int32)), - prepare_tensor("temperature", - np.array([[temperature]], dtype=np.float32)), - ] - - if draft_tokens is not None: - draft_tokens_data = np.array([draft_tokens], dtype=np.int32) - inputs.append(prepare_tensor("draft_input_ids", draft_tokens_data)) - - if repetition_penalty is not None: - repetition_penalty_data = np.array([[repetition_penalty]], - dtype=np.float32) - inputs.append( - prepare_tensor("repetition_penalty", repetition_penalty_data)) - - if presence_penalty is not None: - presence_penalty_data = np.array([[presence_penalty]], - dtype=np.float32) - inputs.append(prepare_tensor("presence_penalty", - presence_penalty_data)) - - if frequency_penalty is not None: - frequency_penalty_data = np.array([[frequency_penalty]], - dtype=np.float32) - inputs.append( - prepare_tensor("frequency_penalty", frequency_penalty_data)) - - if end_id is not None: - end_id_data = np.array([[end_id]], dtype=np.int32) - inputs.append(prepare_tensor("end_id", end_id_data)) - - if pad_id is not None: - pad_id_data = np.array([[pad_id]], dtype=np.int32) - inputs.append(prepare_tensor("pad_id", pad_id_data)) - - if return_draft_model_draft_logits: - return_draft_model_draft_logits_data = np.array( - [[return_draft_model_draft_logits]], dtype=bool) - inputs.append( - prepare_tensor("return_generation_logits", - return_draft_model_draft_logits_data)) - - if return_target_model_accepted_token_logits: - return_target_model_accepted_token_logits_data = np.array( - [[return_target_model_accepted_token_logits]], dtype=bool) - inputs.append( - prepare_tensor("return_generation_logits", - return_target_model_accepted_token_logits_data)) - - return inputs - - -def check_result(result, model_name): - if type(result) == InferenceServerException: - print( - f"Received an error from server while calling {model_name}: {result}" - ) - - -def extract_trtllm_outputs(result): - # Get batch 0, beam 0 output_ids - output_ids = np.squeeze(result.as_numpy("output_ids").astype(np.int32), - axis=(0, 1)) - sequence_length_data = result.as_numpy("sequence_length").astype(np.int32) - assert sequence_length_data.shape[0] == 1 - assert sequence_length_data.shape[1] == 1 - sequence_length = sequence_length_data[0, 0] - cum_log_probs = result.as_numpy("cum_log_probs").astype(np.float32) - output_log_probs = result.as_numpy("output_log_probs").astype(np.float32) - context_logits = result.as_numpy("context_logits").astype(np.float32) - generation_logits = result.as_numpy("generation_logits").astype(np.float32) - return output_ids, sequence_length, cum_log_probs, output_log_probs, context_logits, generation_logits - - -def get_postprocessor_inputs(output_ids, cum_log_probs, output_log_probs, - context_logits, generation_logits): - output_ids_data = np.expand_dims(output_ids, axis=(0, 1)) - inputs = [ - prepare_tensor("TOKENS_BATCH", output_ids_data), - prepare_tensor("SEQUENCE_LENGTH", - 
np.array([[len(output_ids)]], dtype=np.int32)), - prepare_tensor("CUM_LOG_PROBS", cum_log_probs), - prepare_tensor("OUTPUT_LOG_PROBS", output_log_probs), - prepare_tensor("CONTEXT_LOGITS", context_logits), - prepare_tensor("GENERATION_LOGITS", generation_logits) - ] - - return inputs - - -def encountered_stop_words(input_ids, stop_words_ids): - for stop_word_ids in stop_words_ids: - if np.array_equal(input_ids[-len(stop_word_ids):], stop_word_ids): - return True - return False - - -def run_speculative_inference( - client_draft, client_target, prompt, output_len, in_num_draft_tokens, - request_id, repetition_penalty, presence_penalty, frequency_penalty, - temperature, stop_words, bad_words, end_id, pad_id, beam_width, - preprocessor_model_name, draft_tensorrt_llm_model_name, - target_tensorrt_llm_model_name, postprocessor_model_name, - return_draft_model_draft_logits, - return_target_model_accepted_token_logits, verbose): - - # Call the preprocessor - preprocessor_inputs = get_preprocessor_inputs(prompt, output_len, - bad_words, stop_words, - end_id, pad_id) - preprocessor_result = client_draft.infer(preprocessor_model_name, - preprocessor_inputs, - request_id=request_id) - check_result(preprocessor_result, preprocessor_model_name) - prompt_input_ids, bad_words_ids, stop_words_ids, end_id, pad_id = extract_preprocessor_outputs( - preprocessor_result) - - input_ids = prompt_input_ids - last_input_ids = None - draft_output_ids = None - - while True: - - num_draft_tokens = min( - in_num_draft_tokens, - len(prompt_input_ids) + output_len - len(input_ids) - 1) - - if num_draft_tokens > 0: - - if verbose: - print("Draft model input ids:") - print(input_ids.tolist()) - - #Generate up to num_draft_tokens with draft model - draft_inputs = get_trtllm_inputs( - input_ids, - len(input_ids), - num_draft_tokens, - None, - beam_width, - temperature, - repetition_penalty, - presence_penalty, - frequency_penalty, - bad_words_ids, - stop_words_ids, - end_id, - pad_id, - return_draft_model_draft_logits=return_draft_model_draft_logits - ) - - draft_result = client_draft.infer(draft_tensorrt_llm_model_name, - draft_inputs, - request_id=request_id) - check_result(draft_result, draft_tensorrt_llm_model_name) - draft_output_ids, draft_seq_len, cum_log_probs, output_log_probs, context_logits, generation_logits = extract_trtllm_outputs( - draft_result) - - if verbose: - print("Draft model output ids:") - print(draft_output_ids.tolist()) - print("draft_sequence_length") - print(draft_seq_len) - - # Set the draft token and call the target model to generate up to num_draft_tokens + 1 - draft_tokens = draft_output_ids[len(input_ids):draft_seq_len] - - if verbose: - print("draft_tokens") - print(draft_tokens.tolist()) - if return_draft_model_draft_logits: - draft_model_draft_token_logits = generation_logits.squeeze( - 0) # [beam_width, num_draft_tokens, vocab_size] - print( - f"draft model draft tokens' logits: shape: {draft_model_draft_token_logits.shape}, value: {draft_model_draft_token_logits}" - ) - - if verbose: - print("Target model input ids") - print(input_ids.tolist()) - - # Generate up to len(draft_tokens) + 1 with target model - target_inputs = get_trtllm_inputs( - input_ids, - len(input_ids), - len(draft_tokens) + 1 if num_draft_tokens > 0 else 1, - draft_tokens if num_draft_tokens > 0 else None, - beam_width, - temperature, - repetition_penalty, - presence_penalty, - frequency_penalty, - bad_words_ids, - stop_words_ids, - end_id, - pad_id, - return_target_model_accepted_token_logits= - 
return_target_model_accepted_token_logits) - - target_result = client_target.infer(target_tensorrt_llm_model_name, - target_inputs, - request_id=request_id) - check_result(target_result, target_tensorrt_llm_model_name) - target_output_ids, seq_length, cum_log_probs, output_log_probs, context_logits, generation_logits = extract_trtllm_outputs( - target_result) - - if verbose: - print("Target model output_ids") - print(target_output_ids.tolist()) - print("target seq_length") - print(seq_length) - if return_target_model_accepted_token_logits: - target_model_accept_token_logits = generation_logits.squeeze( - 0).squeeze(0) # [num_accepted_tokens, vocab_size] - print( - f"target model accepted tokens' logits: shape: {target_model_accept_token_logits.shape}, value: {target_model_accept_token_logits}" - ) - - # Store the last iteration input_ids to check if EOS was encountered - last_input_ids = input_ids - # Update the input ids with new output_ids - input_ids = target_output_ids - - # Evaluate criteria to stop generation loop. - # If we've hit or exceeded the max output length, should stop - length_stop = (len(input_ids) >= len(prompt_input_ids) + output_len) - # If draft and target have same outputs, should stop. Normally target should return 1 more token. - # If they are the same length, they should differ at the last token - target_draft_equal = draft_output_ids is not None and np.array_equal( - draft_output_ids, target_output_ids) - # If tokens no longer change, should stop, means we have hit early stopping - last_current_equal = np.array_equal(last_input_ids, input_ids) - # Need to check if stop words was encountered - hit_stop_words = encountered_stop_words(input_ids, stop_words_ids[0]) - - if verbose: - print("length_stop:", length_stop) - print("target_draft_equal:", target_draft_equal) - print("last_current_equal:", last_current_equal) - print("hit_stop_words:", hit_stop_words) - - if (length_stop or target_draft_equal or last_current_equal - or hit_stop_words): - break - - # Call the postprocessor - postprocessor_inputs = get_postprocessor_inputs(input_ids, cum_log_probs, - output_log_probs, - context_logits, - generation_logits) - postprocessor_result = client_target.infer(postprocessor_model_name, - postprocessor_inputs, - request_id=request_id) - check_result(postprocessor_result, postprocessor_model_name) - output = postprocessor_result.as_numpy("OUTPUT") - return output[0].decode("utf8") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - - parser.add_argument('--url-target', - type=str, - required=True, - help='Inference server URL for the target model') - - parser.add_argument('--url-draft', - type=str, - required=False, - help='Inference server URL for the draft model') - - parser.add_argument( - '--preprocessor-model-name', - type=str, - required=False, - default="preprocessing", - help='Name of the preprocessor model (should be hosted at url-draft)') - - parser.add_argument( - '--postprocessor-model-name', - type=str, - required=False, - default="postprocessing", - help='Name of the postprocessor model (should be hosted at url-target)' - ) - - parser.add_argument( - '--draft-tensorrt-llm-model-name', - type=str, - required=False, - default="tensorrt_llm", - help='Name of the tensorrt_llm draft model (hosted at url-draft)') - - parser.add_argument( - '--target-tensorrt-llm-model-name', - type=str, - 
required=False, - default="tensorrt_llm", - help='Name of the tensorrt_llm draft model (hosted at url-target)') - - parser.add_argument('-p', - '--prompt', - type=str, - required=True, - help='Input prompt.') - - parser.add_argument( - "-b", - "--beam-width", - required=False, - type=int, - default=1, - help="Beam width value", - ) - - parser.add_argument( - "--temperature", - type=float, - required=False, - default=1.0, - help="temperature value", - ) - - parser.add_argument( - "--repetition-penalty", - type=float, - required=False, - default=None, - help="The repetition penalty value", - ) - - parser.add_argument( - "--presence-penalty", - type=float, - required=False, - default=None, - help="The presence penalty value", - ) - - parser.add_argument( - "--frequency-penalty", - type=float, - required=False, - default=None, - help="The frequency penalty value", - ) - - parser.add_argument('-o', - '--output-len', - type=int, - default=100, - required=False, - help='Specify output length') - - parser.add_argument( - '--num-draft-tokens', - type=int, - default=5, - required=False, - help= - 'Specify the number of speculative tokens for the draft model to generate per lookahead.' - ) - - parser.add_argument('--end-id', - type=int, - default=None, - required=False, - help='The end if token') - - parser.add_argument('--pad-id', - type=int, - default=None, - required=False, - help='The pad if token') - - parser.add_argument('--request-id', - type=str, - default='1', - required=False, - help='The request_id for the stop request') - - parser.add_argument('--stop-words', - nargs='+', - default=[], - help='The stop words') - - parser.add_argument('--bad-words', - nargs='+', - default=[], - help='The bad words') - - parser.add_argument( - "--return-draft-model-draft-logits", - action="http://github.com/store_true", - required=False, - default=False, - help= - "Return draft model's draft tokens' logits, require to enable `gather_generation_logits` when build engine" - ) - - parser.add_argument( - "--return-target-model-accepted-token-logits", - action="http://github.com/store_true", - required=False, - default=False, - help= - "Return target model's accepted token logits, require to enable `gather_generation_logits` when build engine", - ) - - FLAGS = parser.parse_args() - if not FLAGS.url_target: - FLAGS.url_target = "localhost:8001" - - if not FLAGS.url_draft: - FLAGS.url_draft = FLAGS.url_target - - try: - client_target = grpcclient.InferenceServerClient(url=FLAGS.url_target) - client_draft = grpcclient.InferenceServerClient( - url=FLAGS.url_draft) if ( - FLAGS.url_target != FLAGS.url_draft) else client_target - except Exception as e: - print("client creation failed: " + str(e)) - sys.exit(1) - - if (FLAGS.beam_width > 1): - raise Exception( - 'Beam width > 1 is not yet supported with speculative decoding') - - output_text = run_speculative_inference( - client_draft, client_target, FLAGS.prompt, FLAGS.output_len, - FLAGS.num_draft_tokens, FLAGS.request_id, FLAGS.repetition_penalty, - FLAGS.presence_penalty, FLAGS.frequency_penalty, FLAGS.temperature, - FLAGS.stop_words, FLAGS.bad_words, FLAGS.end_id, FLAGS.pad_id, - FLAGS.beam_width, FLAGS.preprocessor_model_name, - FLAGS.draft_tensorrt_llm_model_name, - FLAGS.target_tensorrt_llm_model_name, FLAGS.postprocessor_model_name, - FLAGS.return_draft_model_draft_logits, - FLAGS.return_target_model_accepted_token_logits, FLAGS.verbose) - - # Print the final text - print("Final text:\n", output_text) diff --git 
a/inflight_batcher_llm/client/end_to_end_grpc_client.py b/inflight_batcher_llm/client/end_to_end_grpc_client.py deleted file mode 100644 index 4a0240cb..00000000 --- a/inflight_batcher_llm/client/end_to_end_grpc_client.py +++ /dev/null @@ -1,380 +0,0 @@ -#!/usr/bin/python - -import os -import sys -from functools import partial - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) - -import argparse -import queue -import sys - -import numpy as np -import tritonclient.grpc as grpcclient -from tritonclient.utils import InferenceServerException, np_to_triton_dtype - - -def prepare_tensor(name, input): - t = grpcclient.InferInput(name, input.shape, - np_to_triton_dtype(input.dtype)) - t.set_data_from_numpy(input) - return t - - -class UserData: - - def __init__(self): - self._completed_requests = queue.Queue() - - -def callback(user_data, result, error): - if error: - user_data._completed_requests.put(error) - else: - user_data._completed_requests.put(result) - - -def run_inference(triton_client, - prompt, - output_len, - request_id, - repetition_penalty, - presence_penalty, - frequency_penalty, - temperature, - stop_words, - bad_words, - embedding_bias_words, - embedding_bias_weights, - model_name, - streaming, - beam_width, - overwrite_output_text, - return_context_logits_data, - return_generation_logits_data, - end_id, - pad_id, - verbose, - num_draft_tokens=0, - use_draft_logits=None): - - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.int32) * output_len - streaming_data = np.array([[streaming]], dtype=bool) - beam_width_data = np.array([[beam_width]], dtype=np.int32) - temperature_data = np.array([[temperature]], dtype=np.float32) - - inputs = [ - prepare_tensor("text_input", input0_data), - prepare_tensor("max_tokens", output0_len), - prepare_tensor("stream", streaming_data), - prepare_tensor("beam_width", beam_width_data), - prepare_tensor("temperature", temperature_data), - ] - - if num_draft_tokens > 0: - inputs.append( - prepare_tensor("num_draft_tokens", - np.array([[num_draft_tokens]], dtype=np.int32))) - if use_draft_logits is not None: - inputs.append( - prepare_tensor("use_draft_logits", - np.array([[use_draft_logits]], dtype=bool))) - - if bad_words: - bad_words_list = np.array([bad_words], dtype=object) - inputs += [prepare_tensor("bad_words", bad_words_list)] - - if stop_words: - stop_words_list = np.array([stop_words], dtype=object) - inputs += [prepare_tensor("stop_words", stop_words_list)] - - if repetition_penalty is not None: - repetition_penalty = [[repetition_penalty]] - repetition_penalty_data = np.array(repetition_penalty, - dtype=np.float32) - inputs += [ - prepare_tensor("repetition_penalty", repetition_penalty_data) - ] - - if presence_penalty is not None: - presence_penalty = [[presence_penalty]] - presence_penalty_data = np.array(presence_penalty, dtype=np.float32) - inputs += [prepare_tensor("presence_penalty", presence_penalty_data)] - - if frequency_penalty is not None: - frequency_penalty = [[frequency_penalty]] - frequency_penalty_data = np.array(frequency_penalty, dtype=np.float32) - inputs += [prepare_tensor("frequency_penalty", frequency_penalty_data)] - - if return_context_logits_data is not None: - inputs += [ - prepare_tensor("return_context_logits", - return_context_logits_data), - ] - - if return_generation_logits_data is not None: - inputs += [ - prepare_tensor("return_generation_logits", - return_generation_logits_data), - ] - - if (embedding_bias_words 
is not None and embedding_bias_weights is None - ) or (embedding_bias_words is None - and embedding_bias_weights is not None): - assert 0, "Both embedding bias words and weights must be specified" - - if (embedding_bias_words is not None - and embedding_bias_weights is not None): - assert len(embedding_bias_words) == len( - embedding_bias_weights - ), "Embedding bias weights and words must have same length" - embedding_bias_words_data = np.array([embedding_bias_words], - dtype=object) - embedding_bias_weights_data = np.array([embedding_bias_weights], - dtype=np.float32) - inputs.append( - prepare_tensor("embedding_bias_words", embedding_bias_words_data)) - inputs.append( - prepare_tensor("embedding_bias_weights", - embedding_bias_weights_data)) - if end_id is not None: - end_id_data = np.array([[end_id]], dtype=np.int32) - inputs += [prepare_tensor("end_id", end_id_data)] - - if pad_id is not None: - pad_id_data = np.array([[pad_id]], dtype=np.int32) - inputs += [prepare_tensor("pad_id", pad_id_data)] - - user_data = UserData() - # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) - # Send request - triton_client.async_stream_infer(model_name, inputs, request_id=request_id) - - #Wait for server to close the stream - triton_client.stop_stream() - - # Parse the responses - output_text = "" - while True: - try: - result = user_data._completed_requests.get(block=False) - except Exception: - break - - if type(result) == InferenceServerException: - print("Received an error from server:") - print(result) - else: - output = result.as_numpy('text_output') - if streaming and beam_width == 1: - new_output = output[0].decode("utf-8") - if overwrite_output_text: - output_text = new_output - else: - output_text += new_output - else: - output_text = output[0].decode("utf-8") - if verbose: - print(output, flush=True) - - if return_context_logits_data is not None: - context_logits = result.as_numpy('context_logits') - if verbose: - print(f"context_logits.shape: {context_logits.shape}") - print(f"context_logits: {context_logits}") - if return_generation_logits_data is not None: - generation_logits = result.as_numpy('generation_logits') - if verbose: - print( - f"generation_logits.shape: {generation_logits.shape}") - print(f"generation_logits: {generation_logits}") - - if streaming and beam_width == 1: - if verbose: - print(output_text) - - return output_text - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - - parser.add_argument('-p', - '--prompt', - type=str, - required=True, - help='Input prompt.') - - parser.add_argument('--model-name', - type=str, - required=False, - default="ensemble", - choices=["ensemble", "tensorrt_llm_bls"], - help='Name of the Triton model to send request to') - - parser.add_argument( - "-S", - "--streaming", - action="http://github.com/store_true", - required=False, - default=False, - help="Enable streaming mode. 
Default is False.", - ) - - parser.add_argument( - "-b", - "--beam-width", - required=False, - type=int, - default=1, - help="Beam width value", - ) - - parser.add_argument( - "--temperature", - type=float, - required=False, - default=1.0, - help="temperature value", - ) - - parser.add_argument( - "--repetition-penalty", - type=float, - required=False, - default=None, - help="The repetition penalty value", - ) - - parser.add_argument( - "--presence-penalty", - type=float, - required=False, - default=None, - help="The presence penalty value", - ) - - parser.add_argument( - "--frequency-penalty", - type=float, - required=False, - default=None, - help="The frequency penalty value", - ) - - parser.add_argument('-o', - '--output-len', - type=int, - default=100, - required=False, - help='Specify output length') - - parser.add_argument('--request-id', - type=str, - default='', - required=False, - help='The request_id for the stop request') - - parser.add_argument('--stop-words', - nargs='+', - default=[], - help='The stop words') - - parser.add_argument('--bad-words', - nargs='+', - default=[], - help='The bad words') - - parser.add_argument('--embedding-bias-words', - nargs='+', - default=[], - help='The biased words') - - parser.add_argument('--embedding-bias-weights', - nargs='+', - default=[], - help='The biased words weights') - - parser.add_argument( - '--overwrite-output-text', - action="http://github.com/store_true", - required=False, - default=False, - help= - 'In streaming mode, overwrite previously received output text instead of appending to it' - ) - - parser.add_argument( - "--return-context-logits", - action="http://github.com/store_true", - required=False, - default=False, - help= - "Return context logits, the engine must be built with gather_context_logits or gather_all_token_logits", - ) - - parser.add_argument( - "--return-generation-logits", - action="http://github.com/store_true", - required=False, - default=False, - help= - "Return generation logits, the engine must be built with gather_ generation_logits or gather_all_token_logits", - ) - - parser.add_argument('--end-id', - type=int, - required=False, - help='The token id for end token.') - - parser.add_argument('--pad-id', - type=int, - required=False, - help='The token id for pad token.') - - FLAGS = parser.parse_args() - if FLAGS.url is None: - FLAGS.url = "localhost:8001" - - embedding_bias_words = FLAGS.embedding_bias_words if FLAGS.embedding_bias_words else None - embedding_bias_weights = FLAGS.embedding_bias_weights if FLAGS.embedding_bias_weights else None - - try: - client = grpcclient.InferenceServerClient(url=FLAGS.url) - except Exception as e: - print("client creation failed: " + str(e)) - sys.exit(1) - - return_context_logits_data = None - if FLAGS.return_context_logits: - return_context_logits_data = np.array([[FLAGS.return_context_logits]], - dtype=bool) - - return_generation_logits_data = None - if FLAGS.return_generation_logits: - return_generation_logits_data = np.array( - [[FLAGS.return_generation_logits]], dtype=bool) - - output_text = run_inference( - client, FLAGS.prompt, FLAGS.output_len, FLAGS.request_id, - FLAGS.repetition_penalty, FLAGS.presence_penalty, - FLAGS.frequency_penalty, FLAGS.temperature, FLAGS.stop_words, - FLAGS.bad_words, embedding_bias_words, embedding_bias_weights, - FLAGS.model_name, FLAGS.streaming, FLAGS.beam_width, - FLAGS.overwrite_output_text, return_context_logits_data, - return_generation_logits_data, FLAGS.end_id, FLAGS.pad_id, True) diff --git 
a/inflight_batcher_llm/client/inflight_batcher_llm_client.py b/inflight_batcher_llm/client/inflight_batcher_llm_client.py deleted file mode 100755 index f0e3a837..00000000 --- a/inflight_batcher_llm/client/inflight_batcher_llm_client.py +++ /dev/null @@ -1,874 +0,0 @@ -#!/usr/bin/env python -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
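
The client removed below talks to the tensorrt_llm model directly with token IDs rather than text, which requires the decoupled, inflight-batching configuration described in its header comment. The following is a compressed sketch of the tokenize, stream, and detokenize round trip it performs; the tokenizer name, server URL, and output length are assumptions, only a subset of the tensors the full client prepares is shown, and the per-response token layout is a simplification.

```python
import queue

import numpy as np
import tritonclient.grpc as grpcclient
from transformers import AutoTokenizer
from tritonclient.utils import np_to_triton_dtype

def prepare_tensor(name, array):
    t = grpcclient.InferInput(name, array.shape, np_to_triton_dtype(array.dtype))
    t.set_data_from_numpy(array)
    return t

responses = queue.Queue()
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # assumed tokenizer directory
input_ids = np.array(
    [tokenizer.encode("Born in north-east France, Soyer trained as a")], dtype=np.int32)

inputs = [
    prepare_tensor("input_ids", input_ids),
    prepare_tensor("input_lengths", np.array([[input_ids.shape[1]]], dtype=np.int32)),
    prepare_tensor("request_output_len", np.array([[16]], dtype=np.int32)),
    prepare_tensor("streaming", np.array([[True]], dtype=bool)),
]

with grpcclient.InferenceServerClient(url="localhost:8001") as client:
    client.start_stream(callback=lambda result, error: responses.put(error or result))
    client.async_stream_infer("tensorrt_llm", inputs, request_id="1")
    client.stop_stream()  # blocks until the server closes the stream

tokens = []
while not responses.empty():
    response = responses.get()
    if isinstance(response, Exception):
        raise response
    output_ids = response.as_numpy("output_ids")
    if output_ids is not None:
        # Each streamed response carries the token(s) produced for that step.
        tokens.extend(output_ids[0][0].tolist())
print(tokenizer.decode(tokens))
```
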
- -import argparse -import csv -import os -import queue -import sys -import time -from functools import partial - -import numpy as np -import tritonclient.grpc as grpcclient -from transformers import AutoTokenizer -from tritonclient.utils import InferenceServerException, np_to_triton_dtype - -# -# Simple streaming client for TRT-LLM inflight bacthing backend -# -# In order for this code to work properly, config.pbtxt must contain these values: -# -# model_transaction_policy { -# decoupled: True -# } -# -# parameters: { -# key: "gpt_model_type" -# value: { -# string_value: "inflight_batching" -# } -# } -# -# In order for gpt_model_type 'inflight_batching' to work, you must copy engine from -# -# tensorrt_llm/cpp/tests/resources/models/rt_engine/gpt2/fp16-inflight-batching-plugin/1-gpu/ -# - -np_bfloat16 = np.dtype('V2', metadata={"dtype": "bfloat16"}) - -_str_to_np_dict = dict( - float16=np.float16, - float32=np.float32, - int32=np.int32, - bfloat16=np_bfloat16, -) - - -def curate_log_output(token_sequence, - identifier="Input", - log_max_sequence_len=256): - if len(token_sequence) > log_max_sequence_len: - print(f"{identifier} sequence starts with: ", - token_sequence[:log_max_sequence_len]) - else: - print(f"{identifier} sequence: ", token_sequence) - - -def str_dtype_to_np(dtype): - ret = _str_to_np_dict.get(dtype) - assert ret is not None, f'Unsupported dtype: {dtype}' - return ret - - -def check_output_names(expected_outputs, infer_result): - if expected_outputs: - output_names = set([o.name for o in infer_result._result.outputs]) - if set(expected_outputs) != output_names: - raise Exception( - f"expected outputs do not match actual outputs {expected_outputs} != {output_names}" - ) - - -class UserData: - - def __init__(self): - self._completed_requests = queue.Queue() - - -def prepare_tensor(name, input): - t = grpcclient.InferInput(name, input.shape, - np_to_triton_dtype(input.dtype)) - t.set_data_from_numpy(input) - return t - - -def prepare_outputs(output_names): - - outputs = [] - for output_name in output_names: - outputs.append(grpcclient.InferRequestedOutput(output_name)) - return outputs - - -def prepare_inputs(input_ids_data, input_lengths_data, request_output_len_data, - beam_width_data, temperature_data, repetition_penalty_data, - presence_penalty_data, frequency_penalty_data, - streaming_data, end_id, pad_id, prompt_embedding_table_data, - prompt_vocab_size_data, lora_task_id_data, - lora_weights_data, lora_config_data, return_log_probs_data, - top_k_data, top_p_data, draft_ids_data, - return_context_logits_data, return_generation_logits_data, - decoder_input_ids_data): - inputs = [ - prepare_tensor("input_ids", input_ids_data), - prepare_tensor("input_lengths", input_lengths_data), - prepare_tensor("request_output_len", request_output_len_data), - prepare_tensor("beam_width", beam_width_data), - prepare_tensor("temperature", temperature_data), - prepare_tensor("streaming", streaming_data), - prepare_tensor("end_id", end_id), - prepare_tensor("pad_id", pad_id), - prepare_tensor("return_log_probs", return_log_probs_data), - prepare_tensor("runtime_top_k", top_k_data), - prepare_tensor("runtime_top_p", top_p_data), - ] - if prompt_embedding_table_data is not None: - inputs += [ - prepare_tensor("prompt_embedding_table", - prompt_embedding_table_data), - prepare_tensor("prompt_vocab_size", prompt_vocab_size_data) - ] - if lora_task_id_data is not None: - inputs += [prepare_tensor("lora_task_id", lora_task_id_data)] - if lora_weights_data is not None: - inputs += [ - 
prepare_tensor("lora_weights", lora_weights_data), - prepare_tensor("lora_config", lora_config_data), - ] - if repetition_penalty_data is not None: - inputs += [ - prepare_tensor("repetition_penalty", repetition_penalty_data), - ] - if presence_penalty_data is not None: - inputs += [ - prepare_tensor("presence_penalty", presence_penalty_data), - ] - if frequency_penalty_data is not None: - inputs += [ - prepare_tensor("frequency_penalty", frequency_penalty_data), - ] - if draft_ids_data is not None: - inputs += [ - prepare_tensor("draft_input_ids", draft_ids_data), - ] - if return_context_logits_data is not None: - inputs += [ - prepare_tensor("return_context_logits", - return_context_logits_data), - ] - if return_generation_logits_data is not None: - inputs += [ - prepare_tensor("return_generation_logits", - return_generation_logits_data), - ] - if decoder_input_ids_data is not None: - inputs += [ - prepare_tensor("decoder_input_ids", decoder_input_ids_data), - ] - return inputs - - -def prepare_stop_signals(): - - inputs = [ - grpcclient.InferInput('input_ids', [1, 1], "INT32"), - grpcclient.InferInput('input_lengths', [1, 1], "INT32"), - grpcclient.InferInput('request_output_len', [1, 1], "INT32"), - grpcclient.InferInput('stop', [1, 1], "BOOL"), - ] - - inputs[0].set_data_from_numpy(np.empty([1, 1], dtype=np.int32)) - inputs[1].set_data_from_numpy(np.zeros([1, 1], dtype=np.int32)) - inputs[2].set_data_from_numpy(np.array([[0]], dtype=np.int32)) - inputs[3].set_data_from_numpy(np.array([[True]], dtype='bool')) - - return inputs - - -# Define the callback function. Note the last two parameters should be -# result and error. InferenceServerClient would povide the results of an -# inference as grpcclient.InferResult in result. For successful -# inference, error will be None, otherwise it will be an object of -# tritonclientutils.InferenceServerException holding the error details -def callback(user_data, result, error): - if error: - user_data._completed_requests.put(error) - else: - user_data._completed_requests.put(result) - if (FLAGS.streaming): - if result.get_output('output_ids') is not None: - output_ids = result.as_numpy('output_ids') - seq_lens = result.as_numpy('sequence_length') - if seq_lens == None or seq_lens[0][0] > 0: - tokens = list(output_ids[0][0]) - print(tokens, flush=True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-v", - "--verbose", - action="http://github.com/store_true", - required=False, - default=False, - help="Enable verbose output", - ) - parser.add_argument( - "-u", - "--url", - type=str, - required=False, - default="localhost:8001", - help="Inference server URL. Default is localhost:8001.", - ) - parser.add_argument( - '--text', - type=str, - required=False, - default='Born in north-east France, Soyer trained as a', - help='Input text') - - parser.add_argument('--input-tokens-csv', - type=str, - required=False, - default='', - help='Path to csv file containing the input tokens') - - parser.add_argument('--draft-tokens-csv', - type=str, - required=False, - default='', - help='Path to csv file containing the draft tokens') - - parser.add_argument( - '--output-tokens-csv', - type=str, - required=False, - default='', - help='Path to csv file containing the expected output tokens') - - parser.add_argument( - '--end-id', - type=int, - required=False, - default=50256, - help='The token id for end token. Only needed if tokenizer is not used.' 
- ) - - parser.add_argument( - '--pad-id', - type=int, - required=False, - default=50256, - help='The token id for pad token. Only needed if tokenizer is not used.' - ) - - parser.add_argument( - "-s", - "--ssl", - action="http://github.com/store_true", - required=False, - default=False, - help="Enable SSL encrypted channel to the server", - ) - parser.add_argument( - "-t", - "--stream-timeout", - type=float, - required=False, - default=None, - help="Stream timeout in seconds. Default is None.", - ) - parser.add_argument( - "-r", - "--root-certificates", - type=str, - required=False, - default=None, - help="File holding PEM-encoded root certificates. Default is None.", - ) - parser.add_argument( - "-p", - "--private-key", - type=str, - required=False, - default=None, - help="File holding PEM-encoded private key. Default is None.", - ) - parser.add_argument( - "-x", - "--certificate-chain", - type=str, - required=False, - default=None, - help="File holding PEM-encoded certificate chain. Default is None.", - ) - parser.add_argument( - "-C", - "--grpc-compression-algorithm", - type=str, - required=False, - default=None, - help= - "The compression algorithm to be used when sending request to server. Default is None.", - ) - parser.add_argument( - "-S", - "--streaming", - action="http://github.com/store_true", - required=False, - default=False, - help="Enable streaming mode. Default is False.", - ) - parser.add_argument( - "-c", - "--check-output", - action="http://github.com/store_true", - required=False, - default=False, - help="Enable check of output ids for CI", - ) - - parser.add_argument( - "-b", - "--beam-width", - required=False, - type=int, - default=1, - help="Beam width value", - ) - parser.add_argument( - "--temperature", - type=float, - required=False, - default=1.0, - help="temperature value", - ) - parser.add_argument( - "--repetition-penalty", - type=float, - required=False, - default=None, - help="The repetition penalty value", - ) - parser.add_argument( - "--presence-penalty", - type=float, - required=False, - default=None, - help="The presence penalty value", - ) - parser.add_argument( - "--frequency-penalty", - type=float, - required=False, - default=None, - help="The frequency penalty value", - ) - - parser.add_argument( - "--request-output-len", - type=int, - required=False, - default=16, - help="Request output length", - ) - parser.add_argument( - '--stop-after-ms', - type=int, - required=False, - default=0, - help='Early stop the generation after a few milliseconds') - parser.add_argument( - "--stop-via-request-cancel", - action="http://github.com/store_true", - required=False, - default=False, - help="Early stop use request cancellation instead of stop request") - parser.add_argument('--tokenizer-dir', - type=str, - required=False, - default='', - help='Specify tokenizer directory') - parser.add_argument('--tokenizer-type', - type=str, - default='auto', - required=False, - choices=['auto', 't5', 'llama'], - help='Specify tokenizer type') - parser.add_argument('--request-id', - type=str, - default='', - required=False, - help='The request_id for the stop request') - - parser.add_argument('--prompt-embedding-table-path', - type=str, - default='', - required=False, - help='The prompt embedding table to use for ptuning') - parser.add_argument("--lora-path", - type=str, - default='', - required=False, - help="LoRA weights") - parser.add_argument("--lora-task-id", - type=int, - default=None, - required=False, - help="LoRA task id") - 
parser.add_argument( - "--exclude-input-in-output", - action="http://github.com/store_true", - required=False, - default=False, - help="Expect that output IDs do not contain input IDs", - ) - - parser.add_argument( - '--prompt-task-id', - type=int, - default=0, - required=False, - help='The prompt task id in the prompt embedding table') - - parser.add_argument('--dtype', - type=str, - default='float16', - choices=['float16', 'float32', 'bfloat16']) - - parser.add_argument( - "--return-log-probs", - action="http://github.com/store_true", - required=False, - default=False, - help="Enable computation of log probs", - ) - - parser.add_argument( - "--return-context-logits", - action="http://github.com/store_true", - required=False, - default=False, - help= - "Return context logits, the engine must be built with gather_context_logits or gather_all_token_logits", - ) - - parser.add_argument( - "--return-generation-logits", - action="http://github.com/store_true", - required=False, - default=False, - help= - "Return generation logits, the engine must be built with gather_ generation_logits or gather_all_token_logits", - ) - - parser.add_argument( - "--top-k", - type=int, - required=False, - default=1, - help="top k value", - ) - - parser.add_argument( - "--top-p", - type=float, - required=False, - default=0., - help="top p value", - ) - - parser.add_argument('--requested-outputs', - nargs='+', - default=[], - help='The requested output tensors') - - parser.add_argument('--model-name', - type=str, - required=False, - default='tensorrt_llm', - help='Specify model name') - - FLAGS = parser.parse_args() - - tokenizer = None - draft_ids = None - decoder_input_ids = None - if FLAGS.input_tokens_csv != "": - with open(FLAGS.input_tokens_csv) as csv_file: - csv_reader = csv.reader(csv_file, delimiter=",") - for row in csv_reader: - input_ids = [[int(val) for val in row]] - break - - curate_log_output(input_ids[0], "Input") - - if FLAGS.draft_tokens_csv != "": - with open(FLAGS.draft_tokens_csv) as csv_file: - csv_reader = csv.reader(csv_file, delimiter=",") - for row in csv_reader: - draft_ids = [[int(val) for val in row]] - break - - end_id = FLAGS.end_id - pad_id = FLAGS.pad_id - - else: - print('=========') - if (os.path.isdir(FLAGS.tokenizer_dir) - and not os.path.exists(FLAGS.tokenizer_dir)): - raise FileNotFoundError( - "Input tokens are not provided and tokenizer directory does" - f" not exist: {FLAGS.tokenizer_dir}", ) - - tokenizer = AutoTokenizer.from_pretrained(FLAGS.tokenizer_dir, - legacy=False, - padding_side='left', - trust_remote_code=True) - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - pad_id = tokenizer.encode(tokenizer.pad_token, - add_special_tokens=False)[0] - end_id = tokenizer.encode(tokenizer.eos_token, - add_special_tokens=False)[0] - print("Using pad_id: ", pad_id) - print("Using end_id: ", end_id) - - input_ids = [tokenizer.encode(FLAGS.text)] - curate_log_output(input_ids[0], "Input") - - end_id_data = np.array([[end_id]], dtype=np.int32) - pad_id_data = np.array([[pad_id]], dtype=np.int32) - - #Get the prompt embedding table for the task id - prompt_embedding_table_data = None - prompt_vocab_size_data = None - if (FLAGS.prompt_embedding_table_path != ""): - prompt_table = np.load(FLAGS.prompt_embedding_table_path) - prompt_table = prompt_table.astype(str_dtype_to_np(FLAGS.dtype)) - task_vocab_size = prompt_table.shape[1] - - # squeeze the first 2 dimensions - prompt_embedding_table_data = 
prompt_table[FLAGS.prompt_task_id] - prompt_embedding_table_data = np.expand_dims( - prompt_table[FLAGS.prompt_task_id], axis=0) - - prompt_vocab_size = [[task_vocab_size]] - prompt_vocab_size_data = np.array(prompt_vocab_size, dtype=np.int32) - - lora_weights_data = None - lora_config_data = None - if (FLAGS.lora_path != ""): - lora_weights_data = np.load( - os.path.join(FLAGS.lora_path, "model.lora_weights.npy")) - try: - lora_config_data = np.load( - os.path.join(FLAGS.lora_path, "model.lora_config.npy")) - except Exception: - lora_config_data = np.load( - os.path.join(FLAGS.lora_path, "model.lora_keys.npy")) - lora_task_id_data = None - if FLAGS.lora_task_id is not None and FLAGS.lora_task_id != 0: - lora_task_id_data = np.array([[FLAGS.lora_task_id]], dtype=np.uint64) - - input_ids_data = np.array(input_ids, dtype=np.int32) - input_lengths = [[len(ii)] for ii in input_ids] - input_lengths_data = np.array(input_lengths, dtype=np.int32) - request_output_len = [[FLAGS.request_output_len]] - request_output_len_data = np.array(request_output_len, dtype=np.int32) - beam_width = [[FLAGS.beam_width]] - beam_width_data = np.array(beam_width, dtype=np.int32) - top_k = [[FLAGS.top_k]] - top_k_data = np.array(top_k, dtype=np.int32) - top_p = [[FLAGS.top_p]] - top_p_data = np.array(top_p, dtype=np.float32) - temperature = [[FLAGS.temperature]] - temperature_data = np.array(temperature, dtype=np.float32) - return_log_probs = [[FLAGS.return_log_probs]] - return_log_probs_data = np.array(return_log_probs, dtype=bool) - - return_context_logits_data = None - if FLAGS.return_context_logits: - return_context_logits_data = np.array([[FLAGS.return_context_logits]], - dtype=bool) - - return_generation_logits_data = None - if FLAGS.return_generation_logits: - return_generation_logits_data = np.array( - [[FLAGS.return_generation_logits]], dtype=bool) - - repetition_penalty_data = None - if FLAGS.repetition_penalty is not None: - repetition_penalty = [[FLAGS.repetition_penalty]] - repetition_penalty_data = np.array(repetition_penalty, - dtype=np.float32) - presence_penalty_data = None - if FLAGS.presence_penalty is not None: - presence_penalty = [[FLAGS.presence_penalty]] - presence_penalty_data = np.array(presence_penalty, dtype=np.float32) - frequency_penalty_data = None - if FLAGS.frequency_penalty is not None: - frequency_penalty = [[FLAGS.frequency_penalty]] - frequency_penalty_data = np.array(frequency_penalty, dtype=np.float32) - streaming = [[FLAGS.streaming]] - streaming_data = np.array(streaming, dtype=bool) - - draft_ids_data = None - if draft_ids is not None: - draft_ids_data = np.array(draft_ids, dtype=np.int32) - - decoder_input_ids_data = None - if decoder_input_ids is not None: - decoder_input_ids_data = np.array(decoder_input_ids, dtype=np.int32) - - inputs = prepare_inputs( - input_ids_data, input_lengths_data, request_output_len_data, - beam_width_data, temperature_data, repetition_penalty_data, - presence_penalty_data, frequency_penalty_data, streaming_data, - end_id_data, pad_id_data, prompt_embedding_table_data, - prompt_vocab_size_data, lora_task_id_data, lora_weights_data, - lora_config_data, return_log_probs_data, top_k_data, top_p_data, - draft_ids_data, return_context_logits_data, - return_generation_logits_data, decoder_input_ids_data) - - if FLAGS.requested_outputs: - # Must have at least output_ids in requested outputs - if "output_ids" not in FLAGS.requested_outputs: - raise Exception( - "requested outputs must at least have \"output_ids\"") - outputs = 
prepare_outputs(FLAGS.requested_outputs) - else: - outputs = None - - stop_inputs = None - if FLAGS.stop_after_ms > 0 and not FLAGS.stop_via_request_cancel: - stop_inputs = prepare_stop_signals() - - request_id = FLAGS.request_id - - if FLAGS.output_tokens_csv != "": - with open(FLAGS.output_tokens_csv) as csv_file: - csv_reader = csv.reader(csv_file, delimiter=",") - for row in csv_reader: - expected_output_ids = [int(val) for val in row] - break - else: - expected_output_ids = ([] if FLAGS.exclude_input_in_output else - input_ids[0]) + [ - 21221, 290, 257, 4255, 379, 262, 1957, 7072, - 11, 4689, 347, 2852, 2564, 494, 13, 679 - ] - - if FLAGS.streaming: - actual_output_ids = [ - [] if FLAGS.exclude_input_in_output else input_ids[0] - ] - else: - actual_output_ids = [] - - sequence_lengths = [] - cum_log_probs = None - output_log_probs = None - context_logits = None - generation_logits = None - - user_data = UserData() - with grpcclient.InferenceServerClient( - url=FLAGS.url, - verbose=FLAGS.verbose, - ssl=FLAGS.ssl, - root_certificates=FLAGS.root_certificates, - private_key=FLAGS.private_key, - certificate_chain=FLAGS.certificate_chain, - ) as triton_client: - try: - - if FLAGS.streaming: - - # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), - stream_timeout=FLAGS.stream_timeout, - ) - # Send request - triton_client.async_stream_infer( - FLAGS.model_name, - inputs, - outputs=outputs, - request_id=request_id, - ) - - if FLAGS.stop_after_ms > 0: - time.sleep(FLAGS.stop_after_ms / 1000.0) - - if not FLAGS.stop_via_request_cancel: - triton_client.async_stream_infer( - FLAGS.model_name, - stop_inputs, - request_id=request_id, - parameters={'Streaming': FLAGS.streaming}) - - # Close the grpc stream - cancel_requests = FLAGS.stop_after_ms > 0 and FLAGS.stop_via_request_cancel - triton_client.stop_stream(cancel_requests=cancel_requests) - - # Parse the responses - while True: - try: - result = user_data._completed_requests.get(block=False) - except Exception: - break - - if type(result) == InferenceServerException: - if result.status() == "StatusCode.CANCELLED": - print("Request is cancelled") - else: - print("Received an error from server:") - print(result) - raise result - else: - check_output_names(FLAGS.requested_outputs, result) - output_ids = result.as_numpy('output_ids') - sequence_lengths = result.as_numpy('sequence_length') - if output_ids is not None: - # Only one beam is supported - if sequence_lengths == None or sequence_lengths[0][ - 0] > 0: - tokens = list(output_ids[0][0]) - actual_output_ids[ - 0] = actual_output_ids[0] + tokens - else: - print("Got cancellation response from server") - else: - # Send request - infer_future = triton_client.async_infer( - FLAGS.model_name, - inputs, - outputs=outputs, - request_id=request_id, - callback=partial(callback, user_data), - parameters={'Streaming': FLAGS.streaming}) - - expected_responses = 1 - - if FLAGS.stop_after_ms > 0: - - time.sleep(FLAGS.stop_after_ms / 1000.0) - - if FLAGS.stop_via_request_cancel: - infer_future.cancel() - else: - triton_client.async_infer( - FLAGS.model_name, - stop_inputs, - request_id=request_id, - callback=partial(callback, user_data), - parameters={'Streaming': FLAGS.streaming}) - expected_responses += 1 - - processed_count = 0 - while processed_count < expected_responses: - try: - result = user_data._completed_requests.get() - print("Got completed request", flush=True) - except Exception: - break - - if type(result) == InferenceServerException: - if result.status() == 
"StatusCode.CANCELLED": - print("Request is cancelled") - else: - print("Received an error from server:") - print(result) - raise result - else: - check_output_names(FLAGS.requested_outputs, result) - output_ids = result.as_numpy('output_ids') - if FLAGS.return_log_probs: - cum_log_probs = result.as_numpy('cum_log_probs') - output_log_probs = result.as_numpy( - 'output_log_probs') - if FLAGS.return_context_logits: - context_logits = result.as_numpy('context_logits') - if FLAGS.return_generation_logits: - generation_logits = result.as_numpy( - 'generation_logits') - if output_ids is not None: - sequence_lengths = result.as_numpy( - 'sequence_length') - for beam_output_ids in output_ids[0]: - tokens = list(beam_output_ids) - actual_output_ids.append(tokens) - else: - print("Got cancellation response from server") - - processed_count = processed_count + 1 - except Exception as e: - err = "Encountered error: " + str(e) - print(err) - sys.exit(err) - - passed = True - - for beam in range(FLAGS.beam_width): - seq_len = sequence_lengths[0][beam] if ( - not FLAGS.streaming and len(sequence_lengths) > 0) else len( - actual_output_ids[beam]) - # These should be equal when input IDs are excluded from output - output_ids_w_prompt = actual_output_ids[beam][:seq_len] - output_ids_wo_prompt = ( - output_ids_w_prompt if FLAGS.exclude_input_in_output else - output_ids_w_prompt[input_ids_data.shape[1]:]) - if tokenizer != None: - output_text = tokenizer.decode(output_ids_wo_prompt) - print(f'Input: {FLAGS.text}') - print(f'Output beam {beam}: {output_text}') - - # If cancelled, the number of output tokens should be less than request output length. - if FLAGS.stop_after_ms > 0 and len( - output_ids_wo_prompt) >= FLAGS.request_output_len: - raise AssertionError("expect less than " + - str(FLAGS.request_output_len) + - " output tokens, got " + - str(len(output_ids_wo_prompt))) - - curate_log_output(output_ids_w_prompt, "Output") - - if (FLAGS.check_output and beam == 0): - passed = (output_ids_w_prompt == expected_output_ids) - print("expected_output_ids = ", expected_output_ids) - print("\n=====") - print("PASS!" if passed else "FAIL!") - print("=====") - - if FLAGS.return_log_probs: - print(cum_log_probs) - print(output_log_probs) - - if FLAGS.return_context_logits: - print(f"context_logits.shape: {context_logits.shape}") - print(f"context_logits: {context_logits}") - - if FLAGS.return_generation_logits: - print(f"generation_logits.shape: {generation_logits.shape}") - print(f"generation_logits: {generation_logits}") - - sys.exit(not passed) diff --git a/inflight_batcher_llm/cmake/TritonTensorRTLLMBackendConfig.cmake.in b/inflight_batcher_llm/cmake/TritonTensorRTLLMBackendConfig.cmake.in deleted file mode 100644 index 84239071..00000000 --- a/inflight_batcher_llm/cmake/TritonTensorRTLLMBackendConfig.cmake.in +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. 
-# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -include(CMakeFindDependencyMacro) - -get_filename_component( - TRITONTRTLLMBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH -) - -list(APPEND CMAKE_MODULE_PATH ${TRITONTRTLLMBACKEND_CMAKE_DIR }) - -if(NOT TARGET TritonTRTLLMBackend::triton-trtllm-backend) - include("${TRITONTRTLLMBACKEND_CMAKE_DIR }/TritonTRTLLMBackendTargets.cmake") -endif() - -set(TRITONTRTLLMBACKEND_LIBRARIES TritonTRTLLMBackend::triton-trtllm-backend) diff --git a/inflight_batcher_llm/cmake/modules/set_ifndef.cmake b/inflight_batcher_llm/cmake/modules/set_ifndef.cmake deleted file mode 100644 index bd8f0a3e..00000000 --- a/inflight_batcher_llm/cmake/modules/set_ifndef.cmake +++ /dev/null @@ -1,24 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & -# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not -# use this file except in compliance with the License. You may obtain a copy of -# the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations under -# the License. -# - -function(set_ifndef variable value) - if(NOT DEFINED ${variable}) - set(${variable} - ${value} - PARENT_SCOPE) - endif() -endfunction() diff --git a/inflight_batcher_llm/scripts/build.sh b/inflight_batcher_llm/scripts/build.sh deleted file mode 100644 index 22ac8bb6..00000000 --- a/inflight_batcher_llm/scripts/build.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -Help() -{ - # Display Help - echo "Syntax: build.sh [h|-t |u]" - echo "options:" - echo "h Print this Help." - echo "t Location of tensorrt library" - echo "u Option to build unit tests" - echo -} - -TRT_ROOT='/usr/local/tensorrt' -BUILD_UNIT_TESTS='false' - -# Get the options -while getopts ":ht:u" option; do - case $option in - h) # display Help - Help - exit;; - t) # Location of tensorrt - TRT_ROOT=$OPTARG;; - u) # Option to build unit tests - BUILD_UNIT_TESTS='true';; - \?) 
# Invalid option - echo "Error: Invalid option" - echo "" - Help - exit;; - esac -done - -echo "Using TRT_ROOT=${TRT_ROOT}" -echo "Using BUILD_UNIT_TESTS=${BUILD_UNIT_TESTS}" - -set -x -apt-get update -apt-get install -y --no-install-recommends rapidjson-dev - -BUILD_DIR=$(dirname $0)/../build -mkdir $BUILD_DIR -BUILD_DIR=$(cd -- "$BUILD_DIR" && pwd) -cd $BUILD_DIR - -export LD_LIBRARY_PATH="/usr/local/cuda/compat/lib.real:${LD_LIBRARY_PATH}" - -BUILD_TESTS_ARG="" -if [[ "$BUILD_UNIT_TESTS" == "true" ]]; then - BUILD_TESTS_ARG="-DBUILD_TESTS=ON" -fi - -cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ${BUILD_TESTS_ARG} .. -make install diff --git a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc b/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc deleted file mode 100644 index b1791b99..00000000 --- a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc +++ /dev/null @@ -1,222 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
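
The custom metrics reporter removed below publishes TRT-LLM runtime statistics as Triton gauge families such as nv_trt_llm_request_metrics and nv_trt_llm_kv_cache_block_metrics. Those gauges can be read back from Triton's Prometheus endpoint; here is a small sketch, assuming the server exposes metrics on the default port 8002.

```python
import urllib.request

# Scrape the Prometheus text endpoint and keep only the backend's custom gauges.
with urllib.request.urlopen("http://localhost:8002/metrics") as response:
    body = response.read().decode("utf-8")

for line in body.splitlines():
    if line.startswith("nv_trt_llm_"):
        print(line)
```
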
-#include "custom_metrics_reporter.h" -#include "triton/backend/backend_common.h" -#include -#include -#include - -using namespace ::triton::common; // TritonJson - -namespace triton::backend::inflight_batcher_llm::custom_metrics_reporter -{ - -const std::vector CustomMetricsReporter::request_keys_{ - "Active Request Count", "Max Request Count", "Scheduled Requests", "Context Requests"}; -const std::vector CustomMetricsReporter::request_labels_{"active", "max", "scheduled", "context"}; - -const std::vector CustomMetricsReporter::runtime_memory_keys_{ - "Runtime CPU Memory Usage", "Runtime GPU Memory Usage", "Runtime Pinned Memory Usage"}; -const std::vector CustomMetricsReporter::runtime_memory_labels_{"cpu", "gpu", "pinned"}; - -const std::vector CustomMetricsReporter::kv_cache_keys_{ - "Max KV cache blocks", "Free KV cache blocks", "Used KV cache blocks", "Tokens per KV cache block"}; -const std::vector CustomMetricsReporter::kv_cache_labels_{"max", "free", "used", "tokens_per"}; - -const std::vector CustomMetricsReporter::v1_specific_keys_{ - "Total Context Tokens", "Total Generation Tokens", "Empty Generation Slots"}; -const std::vector CustomMetricsReporter::v1_specific_labels_{ - "total_context_tokens", "total_generation_tokens", "empty_generation_slots"}; - -const std::vector CustomMetricsReporter::IFB_specific_keys_{ - "Total Context Tokens", "Generation Requests", "MicroBatch ID", "Paused Requests"}; -const std::vector CustomMetricsReporter::IFB_specific_labels_{ - "total_context_tokens", "generation_requests", "micro_batch_id", "paused_requests"}; - -const std::vector CustomMetricsReporter::general_metric_keys_{"Timestamp", "Iteration Counter"}; -const std::vector CustomMetricsReporter::general_metric_labels_{"timestamp", "iteration_counter"}; - -uint64_t convertTimestampToSeconds(std::string const& ts) -{ - std::tm tm = {}; - std::stringstream ss(ts); - ss >> std::get_time(&tm, "%m-%d-%Y %H:%M:%S"); - auto timestamp = std::chrono::system_clock::from_time_t(std::mktime(&tm)); - auto epoch = std::chrono::time_point_cast(timestamp).time_since_epoch(); - uint64_t time_in_seconds = std::chrono::duration_cast(epoch).count(); - return time_in_seconds; -} - -TritonMetricGroup::TritonMetricGroup(std::string const& metric_family_label, - std::string const& metric_family_description, std::string const& category_label, - std::vector const& json_keys, std::vector const& sub_labels) - : metric_family_label_(metric_family_label) - , metric_family_description_(metric_family_description) - , category_label_(category_label) - , json_keys_(json_keys) - , sub_labels_(sub_labels) -{ -} - -TRITONSERVER_Error* TritonMetricGroup::CreateGroup(std::string const& model_name, const uint64_t version) -{ - TRITONSERVER_MetricFamily* metric_family = nullptr; - RETURN_IF_ERROR(TRITONSERVER_MetricFamilyNew(&metric_family, TRITONSERVER_METRIC_KIND_GAUGE, - metric_family_label_.c_str(), metric_family_description_.c_str())); - metric_family_.reset(metric_family); - - std::vector labels; - std::unique_ptr model_label( - TRITONSERVER_ParameterNew("model", TRITONSERVER_PARAMETER_STRING, model_name.c_str())); - std::unique_ptr model_version( - TRITONSERVER_ParameterNew("version", TRITONSERVER_PARAMETER_STRING, std::to_string(version).c_str())); - labels.emplace_back(model_label.get()); - labels.emplace_back(model_version.get()); - - for (size_t i = 0; i < sub_labels_.size(); ++i) - { - TRITONSERVER_Metric* metric; - std::unique_ptr sub_label( - TRITONSERVER_ParameterNew(category_label_.c_str(), 
TRITONSERVER_PARAMETER_STRING, sub_labels_[i].c_str())); - labels.emplace_back(sub_label.get()); - RETURN_IF_ERROR(TRITONSERVER_MetricNew(&metric, metric_family_.get(), labels.data(), labels.size())); - std::unique_ptr unique_metric(metric); - metrics_.push_back(std::move(unique_metric)); - labels.pop_back(); - } - - return nullptr; // success -} - -TRITONSERVER_Error* TritonMetricGroup::UpdateGroup(std::vector& values) -{ - for (size_t i = 0; i < values.size(); ++i) - { - RETURN_IF_ERROR(TRITONSERVER_MetricSet(metrics_[i].get(), values[i])); - } - return nullptr; // success -} - -std::vector const& TritonMetricGroup::JsonKeys() const -{ - return json_keys_; -} - -TRITONSERVER_Error* CustomMetricsReporter::InitializeReporter( - std::string const& model_name, const uint64_t version, bool const is_v1_model) -{ - /* REQUEST METRIC GROUP */ - request_metric_family_ = std::make_unique( - "nv_trt_llm_request_metrics", "TRT LLM request metrics", "request_type", request_keys_, request_labels_); - - RETURN_IF_ERROR(request_metric_family_->CreateGroup(model_name, version)); - metric_groups_.push_back(std::move(request_metric_family_)); - - /* RUNTIME MEMORY METRIC GROUP */ - runtime_memory_metric_family_ = std::make_unique("nv_trt_llm_runtime_memory_metrics", - "TRT LLM runtime memory metrics", "memory_type", runtime_memory_keys_, runtime_memory_labels_); - - RETURN_IF_ERROR(runtime_memory_metric_family_->CreateGroup(model_name, version)); - metric_groups_.push_back(std::move(runtime_memory_metric_family_)); - - /* KV CACHE METRIC GROUP */ - kv_cache_metric_family_ = std::make_unique("nv_trt_llm_kv_cache_block_metrics", - "TRT LLM KV cache block metrics", "kv_cache_block_type", kv_cache_keys_, kv_cache_labels_); - - RETURN_IF_ERROR(kv_cache_metric_family_->CreateGroup(model_name, version)); - metric_groups_.push_back(std::move(kv_cache_metric_family_)); - - /* MODEL-TYPE METRIC GROUP (V1 / IFB) */ - std::string model = (is_v1_model) ? 
"v1" : "inflight_batcher"; - std::string model_metric_family_label = "nv_trt_llm_" + model + "_metrics"; - std::string model_metric_family_description = "TRT LLM " + model + "-specific metrics"; - std::string model_metric_family_category = model + "_specific_metric"; - - if (is_v1_model) - { - model_type_metric_family_ = std::make_unique(model_metric_family_label, - model_metric_family_description, model_metric_family_category, v1_specific_keys_, v1_specific_labels_); - } - else - { - model_type_metric_family_ = std::make_unique(model_metric_family_label, - model_metric_family_description, model_metric_family_category, IFB_specific_keys_, IFB_specific_labels_); - } - - RETURN_IF_ERROR(model_type_metric_family_->CreateGroup(model_name, version)); - metric_groups_.push_back(std::move(model_type_metric_family_)); - - /* GENERAL METRIC GROUP */ - general_metric_family_ = std::make_unique("nv_trt_llm_general_metrics", - "General TRT LLM metrics", "general_type", general_metric_keys_, general_metric_labels_); - - RETURN_IF_ERROR(general_metric_family_->CreateGroup(model_name, version)); - metric_groups_.push_back(std::move(general_metric_family_)); - - return nullptr; // success -} - -TRITONSERVER_Error* CustomMetricsReporter::UpdateCustomMetrics(std::string const& custom_metrics) -{ - triton::common::TritonJson::Value metrics; - std::vector members; - metrics.Parse(custom_metrics); - metrics.Members(&members); - - for (auto const& metric_group : metric_groups_) - { - std::vector metric_group_keys = metric_group->JsonKeys(); - std::vector metric_group_values; - for (auto const& key : metric_group_keys) - { - triton::common::TritonJson::Value value_json; - uint64_t value; - if (!metrics.Find(key.c_str(), &value_json)) - { - std::string errStr = std::string("Failed to find " + key + " in metrics."); - return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, errStr.c_str()); - } - if (key == "Timestamp") - { - std::string timestamp; - value_json.AsString(×tamp); - value = convertTimestampToSeconds(timestamp); - } - else - { - value_json.AsUInt(&value); - } - - metric_group_values.push_back(value); - } - - RETURN_IF_ERROR(metric_group->UpdateGroup(metric_group_values)); - } - - return nullptr; -} - -} // namespace triton::backend::inflight_batcher_llm::custom_metrics_reporter diff --git a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.h b/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.h deleted file mode 100644 index d0960178..00000000 --- a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.h +++ /dev/null @@ -1,179 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#pragma once - -#include "triton/core/tritonbackend.h" -#include "triton/core/tritonserver.h" -#include -#include -#include -#include -#include - -namespace triton::backend::inflight_batcher_llm::custom_metrics_reporter -{ - -/// TritonMetricGroups are handled by the CustomMetricsReporter class -/// and encapsulate the creation/update functionality for a -/// group of TRT LLM statistics to be reported as custom Triton metrics. -/// The statistics (or custom metrics) handled by this class should -/// not be confused with Triton base metrics. -class TritonMetricGroup -{ -public: - TritonMetricGroup(std::string const& metric_family_label, std::string const& metric_family_description, - std::string const& category_label, std::vector const& json_keys, - std::vector const& labels); - ~TritonMetricGroup(){}; - - /// Create a new Triton metric family with corresponding metric - /// pointers and parameters. - /// - /// \param model_name The name of the model to provide a metrics - /// group for. - /// \param version The version of the model to provide a metrics - /// group for. - /// \return a TRITONSERVER_Error indicating success or failure. - TRITONSERVER_Error* CreateGroup(std::string const& model_name, const uint64_t version); - - /// Update the Triton metrics associated with this group using - /// the parsed TRT LLM backend statistics values. - /// - /// \param values Values parsed from the TRT LLM backend - /// statistics output, filtered by this group's JSON keys. - /// \return a TRITONSERVER_Error indicating success or failure. - TRITONSERVER_Error* UpdateGroup(std::vector& values); - - /// Return a list of JSON keys that correspond to the TRT LLM - /// statistics handled by this metric group. - /// - /// \return A const reference to vector of strings corresponding - /// to the JSON keys associated with this group. 
- std::vector const& JsonKeys() const; - - /// Custom deleter for a unique TRITONSERVER_MetricFamily pointer - struct MetricFamilyDeleter - { - void operator()(TRITONSERVER_MetricFamily* family) - { - if (family != nullptr) - { - TRITONSERVER_MetricFamilyDelete(family); - } - } - }; - - /// Custom deleter for a unique TRITONSERVER_Metric pointer - struct MetricDeleter - { - void operator()(TRITONSERVER_Metric* metric) - { - if (metric != nullptr) - { - TRITONSERVER_MetricDelete(metric); - } - } - }; - - /// Custom deleter for a unique TRITONSERVER_Parameter pointer - struct ParameterDeleter - { - void operator()(TRITONSERVER_Parameter* parameter) - { - if (parameter != nullptr) - { - TRITONSERVER_ParameterDelete(parameter); - } - } - }; - -private: - std::unique_ptr metric_family_; - std::vector> metrics_; - std::string metric_family_label_; - std::string metric_family_description_; - std::string category_label_; - std::vector json_keys_; - std::vector sub_labels_; -}; - -/// CustomMetricsReporter is an interface class meant to facilitate the -/// connection between TRT LLM backend statistics and Triton custom metrics. -/// It functions by passing BatchManager statistics data from -/// the TRT LLM backend to the multiple TritonMetricsGroup objects -/// it handles. -class CustomMetricsReporter -{ -public: - CustomMetricsReporter(){}; - ~CustomMetricsReporter(){}; - - /// Initialize the various TritonMetricGroups handled by - /// by this class using the static key/label members below. - /// - /// \param model The name of the model to provide metrics for. - /// \param version The version of the model to provide metrics for. - /// \param is_v1_model Whether the model type is v1 or an inflight - /// batching model. - /// \return a TRITONSERVER_Error indicating success or failure. - TRITONSERVER_Error* InitializeReporter(std::string const& model, const uint64_t version, bool const is_v1_model); - - /// Updates the vector of TritonMetricGroup objects with a - /// JSON-formatted statistics string. - /// - /// \param statistics A JSON-formatted string of TRT LLM backend - /// statistics. - /// \return a TRITONSERVER_Error indicating success or failure. - TRITONSERVER_Error* UpdateCustomMetrics(std::string const& custom_metrics); - - static const std::vector request_keys_; - static const std::vector request_labels_; - - static const std::vector runtime_memory_keys_; - static const std::vector runtime_memory_labels_; - - static const std::vector kv_cache_keys_; - static const std::vector kv_cache_labels_; - - static const std::vector v1_specific_keys_; - static const std::vector v1_specific_labels_; - - static const std::vector IFB_specific_keys_; - static const std::vector IFB_specific_labels_; - - static const std::vector general_metric_keys_; - static const std::vector general_metric_labels_; - -private: - std::vector> metric_groups_; - std::unique_ptr request_metric_family_; - std::unique_ptr runtime_memory_metric_family_; - std::unique_ptr kv_cache_metric_family_; - std::unique_ptr model_type_metric_family_; - std::unique_ptr general_metric_family_; -}; - -} // namespace triton::backend::inflight_batcher_llm::custom_metrics_reporter diff --git a/inflight_batcher_llm/src/libtensorrtllm.cc b/inflight_batcher_llm/src/libtensorrtllm.cc deleted file mode 100644 index c6266dff..00000000 --- a/inflight_batcher_llm/src/libtensorrtllm.cc +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include -#include -#include -#include -#include -#include - -// Triton headers -#include "triton/backend/backend_common.h" -#include "triton/core/tritonbackend.h" -#include "triton/core/tritonserver.h" - -// trtllm backend headers -#include "model_instance_state.h" -#include "model_state.h" - -namespace triton::backend::inflight_batcher_llm -{ - -extern "C" -{ - // Triton calls TRITONBACKEND_ModelInitialize when a model is loaded - // to allow the backend to create any state associated with the model, - // and to also examine the model configuration to determine if the - // configuration is suitable for the backend. Any errors reported by - // this function will prevent the model from loading. - // - TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) - { - // Create a ModelState object and associate it with the - // TRITONBACKEND_Model. If anything goes wrong with initialization - // of the model state then an error is returned and Triton will fail - // to load the model. - char const* cname; - RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname)); - const std::string name(cname); - - uint64_t version; - RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version)); - - ModelState* model_state; - RETURN_IF_ERROR(ModelState::Create(model, name, version, &model_state)); - RETURN_IF_ERROR(TRITONBACKEND_ModelSetState(model, reinterpret_cast(model_state))); - - return nullptr; // success - } - - // Triton calls TRITONBACKEND_ModelFinalize when a model is no longer - // needed. The backend should cleanup any state associated with the - // model. This function will not be called until all model instances - // of the model have been finalized. 
- // - TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) - { - void* vstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); - ModelState* model_state = reinterpret_cast(vstate); - delete model_state; - - return nullptr; // success - } - - // Triton calls TRITONBACKEND_ModelInstanceInitialize when a model - // instance is created to allow the backend to initialize any state - // associated with the instance. - // - TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) - { - // Get the model state associated with this instance's model. - TRITONBACKEND_Model* model; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); - - void* vmodelstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); - ModelState* model_state = reinterpret_cast(vmodelstate); - - // Create a ModelInstanceState object and associate it with the - // TRITONBACKEND_ModelInstance. - ModelInstanceState* instance_state; - RETURN_IF_ERROR(ModelInstanceState::Create(model_state, instance, &instance_state)); - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(instance, reinterpret_cast(instance_state))); - - return nullptr; // success - } - - // Triton calls TRITONBACKEND_ModelInstanceFinalize when a model - // instance is no longer needed. The backend should cleanup any state - // associated with the model instance. - // - TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) - { - TRITONBACKEND_Model* model; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); - - void* vstate; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); - ModelInstanceState* instance_state = reinterpret_cast(vstate); - delete instance_state; - - return nullptr; // success - } - - // When Triton calls TRITONBACKEND_ModelInstanceExecute it is required - // that a backend create a response for each request in the batch. A - // response may be the output tensors required for that request or may - // be an error that is returned in the response. - // - TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute( - TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, const uint32_t request_count) - { - ModelInstanceState* instance_state; - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, reinterpret_cast(&instance_state))); - - instance_state->enqueue(requests, request_count); - - return nullptr; // success - } - -} // extern "C" - -} // namespace triton::backend::inflight_batcher_llm diff --git a/inflight_batcher_llm/src/model_instance_state.cc b/inflight_batcher_llm/src/model_instance_state.cc deleted file mode 100644 index e098ba66..00000000 --- a/inflight_batcher_llm/src/model_instance_state.cc +++ /dev/null @@ -1,998 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. 
-// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "model_instance_state.h" -#include "utils.h" - -#include - -using executor::SizeType32; - -namespace triton::backend::inflight_batcher_llm -{ - -TRITONSERVER_Error* ModelInstanceState::Create( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, ModelInstanceState** state) -{ - try - { - *state = new ModelInstanceState(model_state, triton_model_instance); - } - catch (std::exception const& ex) - { - std::string errStr = std::string("unexpected error when creating modelInstanceState: ") + ex.what(); - return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, errStr.c_str()); - } - - return nullptr; // success -} - -executor::BatchingType ModelInstanceState::getBatchingTypeFromParams() -{ - executor::BatchingType batchingType; - auto gpt_model_type = model_state_->GetParameter("gpt_model_type"); - - if (gpt_model_type == "V1" || gpt_model_type == "v1") - { - batchingType = executor::BatchingType::kSTATIC; - } - else if (gpt_model_type == "inflight_batching" || gpt_model_type == "inflight_fused_batching") - { - batchingType = executor::BatchingType::kINFLIGHT; - } - else - { - throw std::runtime_error( - "Invalid gpt_model_type. 
Must be " - "v1/inflight_batching/inflight_fused_batching."); - } - return batchingType; -} - -executor::KvCacheConfig ModelInstanceState::getKvCacheConfigFromParams() -{ - std::optional maxTokensInPagedKvCache = std::nullopt; - try - { - maxTokensInPagedKvCache = model_state_->GetParameter("max_tokens_in_paged_kv_cache"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING( - "max_tokens_in_paged_kv_cache is not specified, will " - "use default value"); - } - - std::optional kvCacheFreeGpuMemFraction = std::nullopt; - try - { - kvCacheFreeGpuMemFraction = model_state_->GetParameter("kv_cache_free_gpu_mem_fraction"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING( - "kv_cache_free_gpu_mem_fraction is not specified, will use default value of 0.9 or " - "max_tokens_in_paged_kv_cache"); - } - - std::optional kvCacheHostCacheSize = std::nullopt; - try - { - kvCacheHostCacheSize = model_state_->GetParameter("kv_cache_host_memory_bytes"); - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING("kv_cache_host_memory_bytes not set, defaulting to 0"); - } - - bool kvCacheOnboardBlocks = true; - try - { - kvCacheOnboardBlocks = model_state_->GetParameter("kv_cache_onboard_blocks"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("kv_cache_onboard_blocks not set, defaulting to true"); - } - - std::optional maxAttentionWindow = std::nullopt; - try - { - maxAttentionWindow = model_state_->GetParameter("max_attention_window_size"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING( - "max_attention_window_size is not specified, will " - "use default value (i.e. 
max_sequence_length)"); - } - - std::optional sinkTokenLength = std::nullopt; - try - { - sinkTokenLength = model_state_->GetParameter("sink_token_length"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING( - "sink_token_length is not specified, will " - "use default value"); - } - - bool enableKVCacheReuse = false; - try - { - enableKVCacheReuse = model_state_->GetParameter("enable_kv_cache_reuse"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("enable_kv_cache_reuse is not specified, will be set to false"); - } - - std::optional maxAttentionWindowSizeType = std::nullopt; - if (maxAttentionWindow.has_value()) - { - maxAttentionWindowSizeType = static_cast(maxAttentionWindow.value()); - } - - return executor::KvCacheConfig(enableKVCacheReuse, maxTokensInPagedKvCache, maxAttentionWindowSizeType, - sinkTokenLength, kvCacheFreeGpuMemFraction, kvCacheHostCacheSize, kvCacheOnboardBlocks); -} - -executor::ParallelConfig ModelInstanceState::getParallelConfigFromParams() -{ - executor::ParallelConfig parallelConfig; - auto const gpuDeviceIds = model_state_->GetDeviceIds(); - if (gpuDeviceIds.has_value()) - { - parallelConfig.setDeviceIds(gpuDeviceIds.value()); - } - - char const* str = std::getenv("TRTLLM_ORCHESTRATOR"); - if (str && std::atoi(str) != 0) - { - parallelConfig.setCommunicationMode(executor::CommunicationMode::kORCHESTRATOR); - auto workerExecutablePath = model_state_->GetExecutorWorkerPath(); - auto orchestratorConfig = executor::OrchestratorConfig(true, workerExecutablePath); - parallelConfig.setOrchestratorConfig(orchestratorConfig); - } - return parallelConfig; -} - -executor::PeftCacheConfig ModelInstanceState::getPeftCacheConfigFromParams() -{ - // parse LoRA / Peft cache parameters - // lora_cache_max_adapter_size - // lora_cache_optimal_adapter_size - // lora_cache_gpu_memory_fraction - // lora_cache_host_memory_bytes - - SizeType32 maxAdapterSize = 64; - SizeType32 optimalAdapterSize = 8; - std::optional hostCacheSize = std::nullopt; - std::optional deviceCachePercent = std::nullopt; - - std::string fieldName = "lora_cache_max_adapter_size"; - try - { - maxAdapterSize = model_state_->GetParameter(fieldName); - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING(fieldName + " not set, defaulting to 64"); - } - - fieldName = "lora_cache_optimal_adapter_size"; - try - { - optimalAdapterSize = model_state_->GetParameter(fieldName); - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING(fieldName + " not set, defaulting to 8"); - } - fieldName = "lora_cache_gpu_memory_fraction"; - try - { - deviceCachePercent = model_state_->GetParameter(fieldName); - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING(fieldName + " not set, defaulting to 0.05"); - } - fieldName = "lora_cache_host_memory_bytes"; - try - { - hostCacheSize = model_state_->GetParameter(fieldName); - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING(fieldName + " not set, defaulting to 1GB"); - } - - return executor::PeftCacheConfig(0, 0, optimalAdapterSize, maxAdapterSize, - ModelInstanceState::kPeftCacheNumPutWorkers, ModelInstanceState::kPeftCacheNumEnsureWorkers, - ModelInstanceState::kPeftCacheNumCopyStreams, 24, 8, deviceCachePercent, hostCacheSize); -} - -executor::SchedulerConfig ModelInstanceState::getSchedulerConfigFromParams(bool enableChunkedContext) -{ - using executor::CapacitySchedulerPolicy; - auto schedulerPolicy = 
CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT; - try - { - std::string schedulerPolicyStr = model_state_->GetParameter("batch_scheduler_policy"); - if (schedulerPolicyStr == "max_utilization") - { - schedulerPolicy = CapacitySchedulerPolicy::kMAX_UTILIZATION; - } - else if (schedulerPolicyStr == "guaranteed_no_evict") - { - schedulerPolicy = CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT; - } - else - { - throw std::runtime_error( - "batch_scheduler_policy parameter was not found or is invalid " - "(must be max_utilization or guaranteed_no_evict)"); - } - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING(e.what()); - } - - if (isDecoupled() && schedulerPolicy != CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT) - { - if (!enableChunkedContext) - { - TLLM_LOG_WARNING( - "Decoupled mode with a batch scheduler policy other than guaranteed_no_evict " - "requires building the model with use_paged_context_fmha and setting " - "enable_chunked_context to true. " - "The batch scheduler policy will be set to guaranteed_no_evict " - "since enable_chunked_context is false."); - schedulerPolicy = CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT; - } - } - return executor::SchedulerConfig(schedulerPolicy); -} - -executor::ExecutorConfig ModelInstanceState::getExecutorConfigFromParams() -{ - auto batchingType = getBatchingTypeFromParams(); - - int32_t maxBeamWidth = 1; - try - { - maxBeamWidth = model_state_->GetParameter("max_beam_width"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("max_beam_width is not specified, will use default value of 1"); - } - - int32_t iterStatsMaxIterations = executor::kDefaultIterStatsMaxIterations; - try - { - iterStatsMaxIterations = model_state_->GetParameter("iter_stats_max_iterations"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("iter_stats_max_iterations is not specified, will use default value of " - + std::to_string(iterStatsMaxIterations)); - } - - int32_t requestStatsMaxIterations = executor::kDefaultRequestStatsMaxIterations; - try - { - requestStatsMaxIterations = model_state_->GetParameter("request_stats_max_iterations"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("request_stats_max_iterations is not specified, will use default value of " - + std::to_string(requestStatsMaxIterations)); - } - - try - { - model_state_->GetParameter("enable_trt_overlap"); - TLLM_LOG_WARNING("enable_trt_overlap is deprecated and will be ignored"); - } - catch (std::exception const& e) - { - } - - bool normalizeLogProbs = true; - try - { - normalizeLogProbs = model_state_->GetParameter("normalize_log_probs"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("normalize_log_probs is not specified, will be set to true"); - } - - executor::ExecutorConfig executorConfig; - - auto kvCacheConfig = getKvCacheConfigFromParams(); - - bool enableChunkedContext = false; - try - { - enableChunkedContext = model_state_->GetParameter("enable_chunked_context"); - if (enableChunkedContext) - { - TLLM_LOG_WARNING( - "enable_chunked_context is set to true, will use context chunking " - "(requires building the model with use_paged_context_fmha)."); - } - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("enable_chunked_context is not specified, will be set to false."); - } - - auto 
schedulerConfig = getSchedulerConfigFromParams(enableChunkedContext); - - auto peftCacheConfig = getPeftCacheConfigFromParams(); - - auto parallelConfig = getParallelConfigFromParams(); - - std::optional decodingMode = std::nullopt; - try - { - std::string decodingModeStr = model_state_->GetParameter("decoding_mode"); - if (decodingModeStr == "top_k") - { - decodingMode = executor::DecodingMode::TopK(); - } - else if (decodingModeStr == "top_p") - { - decodingMode = executor::DecodingMode::TopP(); - } - else if (decodingModeStr == "top_k_top_p") - { - decodingMode = executor::DecodingMode::TopKTopP(); - } - else if (decodingModeStr == "beam_search") - { - decodingMode = executor::DecodingMode::BeamSearch(); - } - else if (decodingModeStr == "medusa") - { - decodingMode = executor::DecodingMode::Medusa(); - } - else - { - throw std::runtime_error(""); - } - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING( - "decoding_mode parameter is invalid or not specified" - "(must be one of the {top_k, top_p, top_k_top_p, beam_search, medusa})." - "Using default: top_k_top_p if max_beam_width == 1, beam_search otherwise"); - } - - executor::DecodingConfig decodingConfig(decodingMode); - - try - { - auto medusaChoices = model_state_->GetParameter("medusa_choices"); - decodingConfig.setMedusaChoices(medusaChoices); - } - catch (std::exception const& e) - { - if (decodingMode && decodingMode->isMedusa()) - { - TLLM_LOG_WARNING( - "medusa_choices parameter is not specified. " - "Will be using default mc_sim_7b_63 choices instead."); - } - } - - float gpuWeightsPercent = 1.0f; - try - { - gpuWeightsPercent = model_state_->GetParameter("gpu_weights_percent"); - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING("gpu_weights_percent parameter is not specified, will use default value of 1.0"); - } - - return executor::ExecutorConfig(maxBeamWidth, schedulerConfig, kvCacheConfig, enableChunkedContext, - normalizeLogProbs, iterStatsMaxIterations, requestStatsMaxIterations, batchingType, std::nullopt, - parallelConfig, peftCacheConfig, std::nullopt, std::nullopt, decodingConfig, gpuWeightsPercent); -} - -ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) - : model_state_(model_state) - , modelInstance_(triton_model_instance) -{ - - auto executorConfig = getExecutorConfigFromParams(); - -#ifdef TRITON_ENABLE_METRICS - custom_metrics_reporter_ = std::make_unique(); - custom_metrics_reporter_->InitializeReporter(model_state->GetModelName(), model_state->GetModelVersion(), - (executorConfig.getBatchingType() == executor::BatchingType::kSTATIC)); -#endif - - std::string decoderModelPath; - try - { - decoderModelPath = model_state_->GetParameter("gpt_model_path"); - TLLM_CHECK_WITH_INFO(std::filesystem::exists(decoderModelPath), - "Decoder (GPT) model path at %s does not exist.", decoderModelPath.c_str()); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("gpt_model_path is not specified, will be left empty"); - decoderModelPath = ""; - } - - std::string encoderModelPath; - try - { - encoderModelPath = model_state_->GetParameter("encoder_model_path"); - TLLM_CHECK_WITH_INFO(std::filesystem::exists(encoderModelPath), "Encoder model path at %s does not exist.", - encoderModelPath.c_str()); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("encoder_model_path is not specified, will be left empty"); - encoderModelPath = ""; - 
} - - TLLM_CHECK_WITH_INFO( - !decoderModelPath.empty() || !encoderModelPath.empty(), "Both encoder and decoder model paths are empty"); - - if (!decoderModelPath.empty()) - { - // Encoder-decoder model - if (!encoderModelPath.empty()) - { - mModelType = executor::ModelType::kENCODER_DECODER; - mExecutor - = std::make_unique(encoderModelPath, decoderModelPath, mModelType, executorConfig); - } - // Decoder only model - else - { - mModelType = executor::ModelType::kDECODER_ONLY; - mExecutor = std::make_unique(decoderModelPath, mModelType, executorConfig); - } - } - // Encoder only - else - { - mModelType = executor::ModelType::kENCODER_ONLY; - mExecutor = std::make_unique(encoderModelPath, mModelType, executorConfig); - } - - bool excludeInputInOutput = false; - try - { - excludeInputInOutput = model_state_->GetParameter("exclude_input_in_output"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("exclude_input_in_output is not specified, will be set to false"); - } - mInstanceSpecificConfig.excludeInputFromOutput = excludeInputInOutput; - - int cancellationCheckPeriodMs = 100; - try - { - cancellationCheckPeriodMs = model_state_->GetParameter("cancellation_check_period_ms"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("cancellation_check_period_ms is not specified, will be set to 100 (ms)"); - } - mInstanceSpecificConfig.cancellationCheckPeriodMs = cancellationCheckPeriodMs; - - int statsCheckPeriodMs = 100; - try - { - statsCheckPeriodMs = model_state_->GetParameter("stats_check_period_ms"); - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("stats_check_period_ms is not specified, will be set to 100 (ms)"); - } - mInstanceSpecificConfig.statsCheckPeriodMs = statsCheckPeriodMs; - - if (mExecutor->canEnqueueRequests()) - { - mStopWaitForResponse = false; - mWaitForResponseThread = std::thread(&ModelInstanceState::WaitForResponse, this); - - mStopWaitForStats = false; - mWaitForStatsThread = std::thread(&ModelInstanceState::WaitForStats, this); - - mStopWaitForCancel = false; - mWaitForCancelThread = std::thread(&ModelInstanceState::WaitForCancel, this); - } - else - { - // Shutdown the worker ranks which will cause them to wait for leader/orchestrator to terminate - mExecutor->shutdown(); - } -} - -void ModelInstanceState::sendEnqueueResponse(TRITONBACKEND_Request* request, TRITONSERVER_Error* error) -{ - TRITONBACKEND_ResponseFactory* factory; - LOG_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory, request), "failed to create triton response factory"); - TRITONBACKEND_Response* tritonResponse; - LOG_IF_ERROR(TRITONBACKEND_ResponseNewFromFactory(&tritonResponse, factory), "Failed to create response"); - LOG_IF_ERROR(TRITONBACKEND_ResponseSend(tritonResponse, TRITONSERVER_RESPONSE_COMPLETE_FINAL, error), - "Cannot send response"); - LOG_IF_ERROR(TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), "Cannot release request"); -} - -bool ModelInstanceState::handleStopRequest(TRITONBACKEND_Request* request, std::string const& tritonRequestId) -{ - bool stopRequest = utils::getRequestBooleanInputTensor(request, kStopInputTensorName); - if (!stopRequest) - { - return false; - } - - TRITONSERVER_Error* error = nullptr; - - try - { - if (tritonRequestId == "") - { - throw std::runtime_error("Trying to stop a request but request ID is not provided"); - } - std::lock_guard 
lock(mRequestIdToRequestDataMutex); - if (mTritonRequestIdToRequestId.count(tritonRequestId)) - { - auto requestId = mTritonRequestIdToRequestId[tritonRequestId]; - mExecutor->cancelRequest(requestId); - } - } - catch (std::exception const& e) - { - error = TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, e.what()); - } - // mTritonRequestIdToRequestId.count(tritonRequestId) == false doesn't necessary mean an error since the - // request to cancel may already be completed. - // Send an empty response to indicate the request has been successfully cancelled - sendEnqueueResponse(request, error); - return true; -} - -executor::Request ModelInstanceState::createExecutorRequest( - TRITONBACKEND_Request* request, bool excludeInputFromOutput, bool isDecoupled, executor::ModelType modelType) -{ - auto inputsTensors = utils::readInputsTensors(request); - bool streaming = utils::getRequestBooleanInputTensor(request, kStreamingInputTensorName); - return utils::createRequestFromInputTensors( - inputsTensors, excludeInputFromOutput, isDecoupled, streaming, modelType); -} - -void ModelInstanceState::enqueue(TRITONBACKEND_Request** requests, uint32_t const request_count) -{ - - uint64_t exec_start_ns{0}; - SET_TIMESTAMP(exec_start_ns); - - for (uint32_t i = 0; i < request_count; ++i) - { - TRITONBACKEND_Request* request = requests[i]; - - try - { - char const* charRequestId = nullptr; - TRITONBACKEND_RequestId(request, &charRequestId); - std::string tritonRequestId; - if (charRequestId != nullptr) - { - tritonRequestId = charRequestId; - } - - if (handleStopRequest(request, tritonRequestId)) - { - continue; - } - - auto executorRequest = createExecutorRequest( - request, mInstanceSpecificConfig.excludeInputFromOutput, isDecoupled(), mModelType); - - int64_t inputTokensSize = executorRequest.getInputTokenIds().size(); - executor::SizeType32 beamWidthCopy = executorRequest.getSamplingConfig().getBeamWidth(); - std::lock_guard lock(mRequestIdToRequestDataMutex); - uint64_t compute_start_ns{0}; - SET_TIMESTAMP(compute_start_ns); - auto requestId = mExecutor->enqueueRequest(executorRequest); - if (mRequestIdToRequestData.count(requestId)) - { - TLLM_LOG_ERROR( - "Executor returns a request ID that already exists. This shouldn't happen unless there is " - "something " - "wrong in TRT-LLM runtime."); - } - TRITONBACKEND_ResponseFactory* factory; - LOG_IF_ERROR( - TRITONBACKEND_ResponseFactoryNew(&factory, request), "failed to create triton response factory"); - - auto requestOutputNames = utils::getRequestOutputNames(request); - mRequestIdToRequestData.emplace(requestId, - RequestData{factory, request, tritonRequestId, inputTokensSize, beamWidthCopy, - std::move(requestOutputNames), {exec_start_ns, compute_start_ns, 0, 0}}); - if (tritonRequestId != "") - { - mTritonRequestIdToRequestId[tritonRequestId] = requestId; - } - } - catch (std::exception const& e) - { - sendEnqueueResponse(request, TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, e.what())); - } - } - return; -} - -TRITONSERVER_Error* ModelInstanceState::reportBaseMetrics(RequestData& requestData, TRITONSERVER_Error* error) -{ - auto& timestamps = requestData.timestamps; - SET_TIMESTAMP(timestamps.exec_end_ns); - - RETURN_IF_ERROR( - TRITONBACKEND_ModelInstanceReportStatistics(modelInstance_, requestData.tritonRequest, (error == nullptr), - timestamps.exec_start_ns, timestamps.compute_start_ns, timestamps.compute_end_ns, timestamps.exec_end_ns)); - - // For now we will assume a batch size of 1 for each request. 
This may change in the future but for - // now it seems that even when requests are dynamically batched together each workItem is associated - // with its own request object and is handled independently due to the nature of IFB. - RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceReportBatchStatistics(modelInstance_, 1 /* batch size */, - timestamps.exec_start_ns, timestamps.compute_start_ns, timestamps.compute_end_ns, timestamps.exec_end_ns)); - - return nullptr; // success -} - -std::tuple ModelInstanceState::fillTritonResponse( - TRITONBACKEND_ResponseFactory* factory, executor::Response const& response, RequestData const& requestData) -{ - TRITONBACKEND_Response* tritonResponse; - LOG_IF_ERROR(TRITONBACKEND_ResponseNewFromFactory(&tritonResponse, factory), "Failed to create response"); - - TRITONSERVER_Error* error = nullptr; - bool isFinal = false; - try - { - if (!response.hasError()) - { - auto const& result = response.getResult(); - isFinal = result.isFinal; - error = nullptr; - auto outputIds = result.outputTokenIds; - std::vector beamLength(outputIds.size()); - int32_t maxBeamLength = -1; - for (size_t i = 0; i < outputIds.size(); ++i) - { - beamLength[i] = outputIds[i].size(); - maxBeamLength = std::max(beamLength[i], maxBeamLength); - } - if (maxBeamLength == -1) - { - TLLM_LOG_ERROR("Output ids is empty"); - maxBeamLength = 0; - } - for (auto& vec : outputIds) - { - vec.resize(maxBeamLength, -1); - } - - if (requestData.outputNames.count(OutputFieldsNames::outputIds) > 0) - { - std::vector outputIdsShape{1, static_cast(outputIds.size()), maxBeamLength}; - auto outputIdsType = TRITONSERVER_TYPE_INT32; - auto outputIdsBuffer = utils::getResponseBuffer( - tritonResponse, outputIdsShape, outputIdsType, OutputFieldsNames::outputIds); - utils::flatten(outputIds, outputIdsBuffer, outputIdsShape); - } - else - { - TLLM_THROW("%s tensor must be present in list of output tensors", OutputFieldsNames::outputIds); - } - - if (requestData.outputNames.count(OutputFieldsNames::sequenceLength) > 0) - { - std::vector sequenceLengthShape{1, static_cast(outputIds.size())}; - auto sequenceLengthType = TRITONSERVER_TYPE_INT32; - auto sequenceLengthBuffer = utils::getResponseBuffer( - tritonResponse, sequenceLengthShape, sequenceLengthType, OutputFieldsNames::sequenceLength); - utils::flatten(beamLength, sequenceLengthBuffer, sequenceLengthShape); - } - else - { - TLLM_THROW("%s tensor must be present in list of output tensors", OutputFieldsNames::sequenceLength); - } - - if (requestData.outputNames.count(OutputFieldsNames::contextLogits) > 0) - { - if (result.contextLogits.has_value()) - { - auto contextLogitsShapeOriginal = result.contextLogits.value().getShape(); - std::vector contextLogitsShape{ - 1, contextLogitsShapeOriginal[0], contextLogitsShapeOriginal[1]}; - auto contextLogitsType = TRITONSERVER_TYPE_FP32; - auto contextLogitsBuffer = utils::getResponseBuffer( - tritonResponse, contextLogitsShape, contextLogitsType, OutputFieldsNames::contextLogits); - utils::flatten(result.contextLogits.value(), contextLogitsBuffer, contextLogitsShape); - } - else - { - std::vector contextLogitsShape{1, 1, 1}; - auto contextLogitsType = TRITONSERVER_TYPE_FP32; - auto contextLogitsBuffer = utils::getResponseBuffer( - tritonResponse, contextLogitsShape, contextLogitsType, OutputFieldsNames::contextLogits); - utils::flatten(std::vector{0}, contextLogitsBuffer, contextLogitsShape); - } - } - - if (requestData.outputNames.count(OutputFieldsNames::generationLogits) > 0) - { - if (result.generationLogits.has_value()) - 
{ - auto generationLogitsShapeOriginal = result.generationLogits.value().getShape(); - std::vector generationLogitsShape{1, generationLogitsShapeOriginal[0], - generationLogitsShapeOriginal[1], generationLogitsShapeOriginal[2]}; - auto generationLogitsType = TRITONSERVER_TYPE_FP32; - auto generationLogitsBuffer = utils::getResponseBuffer(tritonResponse, generationLogitsShape, - generationLogitsType, OutputFieldsNames::generationLogits); - utils::flatten( - result.generationLogits.value(), generationLogitsBuffer, generationLogitsShape); - } - else - { - std::vector generationLogitsShape{1, 1, 1, 1}; - auto generationLogitsType = TRITONSERVER_TYPE_FP32; - auto generationLogitsBuffer = utils::getResponseBuffer(tritonResponse, generationLogitsShape, - generationLogitsType, OutputFieldsNames::generationLogits); - utils::flatten(std::vector{0}, generationLogitsBuffer, generationLogitsShape); - } - } - - if (requestData.outputNames.count(OutputFieldsNames::outputLogProbs) > 0) - { - if (result.logProbs.has_value()) - { - std::vector outputLogProbsShape{1, static_cast(result.logProbs.value().size()), - static_cast(result.logProbs.value()[0].size())}; - auto outputLogProbsType = TRITONSERVER_TYPE_FP32; - auto outputLogProbsBuffer = utils::getResponseBuffer( - tritonResponse, outputLogProbsShape, outputLogProbsType, OutputFieldsNames::outputLogProbs); - utils::flatten(result.logProbs.value(), outputLogProbsBuffer, outputLogProbsShape); - } - else - { - std::vector outputLogProbsShape{1, 1, requestData.inputTokensSize}; - auto outputLogProbsType = TRITONSERVER_TYPE_FP32; - auto outputLogProbsBuffer = utils::getResponseBuffer( - tritonResponse, outputLogProbsShape, outputLogProbsType, OutputFieldsNames::outputLogProbs); - utils::flatten( - std::vector(requestData.inputTokensSize), outputLogProbsBuffer, outputLogProbsShape); - } - } - - if (requestData.outputNames.count(OutputFieldsNames::cumLogProbs) > 0) - { - if (result.cumLogProbs.has_value()) - { - std::vector cumLogProbsShape{1, static_cast(result.cumLogProbs.value().size())}; - auto cumLogProbsType = TRITONSERVER_TYPE_FP32; - auto cumLogProbsBuffer = utils::getResponseBuffer( - tritonResponse, cumLogProbsShape, cumLogProbsType, OutputFieldsNames::cumLogProbs); - utils::flatten(result.cumLogProbs.value(), cumLogProbsBuffer, cumLogProbsShape); - } - else - { - std::vector cumLogProbsShape{1, 1}; - auto cumLogProbsType = TRITONSERVER_TYPE_FP32; - auto cumLogProbsBuffer = utils::getResponseBuffer( - tritonResponse, cumLogProbsShape, cumLogProbsType, OutputFieldsNames::cumLogProbs); - utils::flatten(std::vector{0}, cumLogProbsBuffer, cumLogProbsShape); - } - } - } - else - { - isFinal = true; - std::string errMsg = "Executor failed process requestId " + std::to_string(response.getRequestId()) - + " due to the following error: " + response.getErrorMsg(); - error = TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, errMsg.c_str()); - } - } - catch (std::exception const& e) - { - // In case of error while processing response, return response with error - isFinal = true; - std::string errMsg = "Error encountered while populating response: " + std::string(e.what()); - error = TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, errMsg.c_str()); - } - - return {tritonResponse, isFinal, error}; -} - -void ModelInstanceState::WaitForResponse() -{ - while (!mStopWaitForResponse) - { - std::chrono::milliseconds waitTime(1); - auto responses = mExecutor->awaitResponses(waitTime); - uint64_t compute_end_ns{0}; - SET_TIMESTAMP(compute_end_ns); - - for (auto const& 
response : responses) - { - auto requestId = response.getRequestId(); - RequestData requestData; - { - std::lock_guard lock(mRequestIdToRequestDataMutex); - if (!mRequestIdToRequestData.count(requestId)) - { - TLLM_LOG_ERROR("Unexpected response for a request ID that is not active"); - continue; - } - requestData = mRequestIdToRequestData[requestId]; - } - - auto factory = requestData.factory; - - auto [tritonResponse, isFinal, error] = fillTritonResponse(factory, response, requestData); - - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend(tritonResponse, isFinal ? TRITONSERVER_RESPONSE_COMPLETE_FINAL : 0, error), - "Cannot send response"); - - if (isFinal) - { - std::lock_guard lock(mRequestIdToRequestDataMutex); - if (requestData.tritonRequestId != "") - { - mTritonRequestIdToRequestId.erase(requestData.tritonRequestId); - } - - requestData.timestamps.compute_end_ns = compute_end_ns; - LOG_IF_ERROR(reportBaseMetrics(requestData, error), "Error reporting metrics"); - - LOG_IF_ERROR(TRITONBACKEND_RequestRelease(requestData.tritonRequest, TRITONSERVER_REQUEST_RELEASE_ALL), - "Cannot release request"); - LOG_IF_ERROR(TRITONBACKEND_ResponseFactoryDelete(factory), "Cannot delete response factory"); - mRequestIdToRequestData.erase(requestId); - } - } - } -} - -void ModelInstanceState::WaitForStats() -{ - while (!mStopWaitForStats) - { - std::this_thread::sleep_for(std::chrono::milliseconds(mInstanceSpecificConfig.statsCheckPeriodMs)); - auto stats = mExecutor->getLatestIterationStats(); - for (auto const& stat : stats) - { - std::string statJson = "{"; - statJson.append("\"Active Request Count\":" + std::to_string(stat.numActiveRequests) + ","); - statJson.append("\"Iteration Counter\":" + std::to_string(stat.iter) + ","); - statJson.append("\"Max Request Count\":" + std::to_string(stat.maxNumActiveRequests) + ","); - statJson.append("\"Runtime CPU Memory Usage\":" + std::to_string(stat.cpuMemUsage) + ","); - statJson.append("\"Runtime GPU Memory Usage\":" + std::to_string(stat.gpuMemUsage) + ","); - statJson.append("\"Runtime Pinned Memory Usage\":" + std::to_string(stat.pinnedMemUsage) + ","); - statJson.append("\"Timestamp\":" + ("\"" + stat.timestamp + "\"") + ","); - - if (stat.inflightBatchingStats.has_value()) - { - auto const& modelStats = stat.inflightBatchingStats.value(); - statJson.append("\"Context Requests\":" + std::to_string(modelStats.numContextRequests) + ","); - statJson.append("\"Generation Requests\":" + std::to_string(modelStats.numGenRequests) + ","); - statJson.append("\"MicroBatch ID\":" + std::to_string(modelStats.microBatchId) + ","); - statJson.append("\"Paused Requests\":" + std::to_string(modelStats.numPausedRequests) + ","); - statJson.append("\"Scheduled Requests\":" + std::to_string(modelStats.numScheduledRequests) + ","); - statJson.append("\"Total Context Tokens\":" + std::to_string(modelStats.numCtxTokens) + ","); - } - else if (stat.staticBatchingStats.has_value()) - { - auto const& modelStats = stat.staticBatchingStats.value(); - statJson.append("\"Context Requests\":" + std::to_string(modelStats.numContextRequests) + ","); - statJson.append("\"Scheduled Requests\":" + std::to_string(modelStats.numScheduledRequests) + ","); - statJson.append("\"Total Context Tokens\":" + std::to_string(modelStats.numCtxTokens) + ","); - statJson.append("\"Total Generation Tokens\":" + std::to_string(modelStats.numGenTokens) + ","); - statJson.append("\"Empty Generation Slots\":" + std::to_string(modelStats.emptyGenSlots) + ","); - } - else - { - TLLM_LOG_ERROR("Missing stats"); - 
continue; - } - - if (stat.kvCacheStats.has_value()) - { - auto const& kvStats = stat.kvCacheStats.value(); - statJson.append("\"Free KV cache blocks\":" + std::to_string(kvStats.freeNumBlocks) + ","); - statJson.append("\"Max KV cache blocks\":" + std::to_string(kvStats.maxNumBlocks) + ","); - statJson.append("\"Tokens per KV cache block\":" + std::to_string(kvStats.tokensPerBlock) + ","); - statJson.append("\"Used KV cache blocks\":" + std::to_string(kvStats.usedNumBlocks) + ","); - } - - statJson.back() = '}'; - - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, statJson.c_str()); -#ifdef TRITON_ENABLE_METRICS - LOG_IF_ERROR(custom_metrics_reporter_->UpdateCustomMetrics(statJson), "Failed updating TRT LLM statistics"); -#endif - } - } -} - -void ModelInstanceState::WaitForCancel() -{ - while (!mStopWaitForCancel) - { - std::this_thread::sleep_for(std::chrono::milliseconds(mInstanceSpecificConfig.cancellationCheckPeriodMs)); - std::lock_guard lock(mRequestIdToRequestDataMutex); - for (auto const& pair : mRequestIdToRequestData) - { - auto const& requestId = pair.first; - auto const& requestData = pair.second; - bool isCancelled = false; - LOG_IF_ERROR(TRITONBACKEND_ResponseFactoryIsCancelled(requestData.factory, &isCancelled), - "Failed to query factory status"); - if (isCancelled) - { - mExecutor->cancelRequest(requestId); - } - } - } -} - -} // namespace triton::backend::inflight_batcher_llm diff --git a/inflight_batcher_llm/src/model_instance_state.h b/inflight_batcher_llm/src/model_instance_state.h deleted file mode 100644 index cc630a61..00000000 --- a/inflight_batcher_llm/src/model_instance_state.h +++ /dev/null @@ -1,220 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
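The deleted model_instance_state.cc above drives three background polling loops (WaitForResponse, WaitForStats, WaitForCancel), each gated by a stop flag that the destructor flips before joining the thread. A minimal standalone sketch of that pattern, assuming only the C++ standard library and hypothetical names (PollingWorker, poll_fn); it is not the backend's actual class:

#include <atomic>
#include <chrono>
#include <cstdio>
#include <functional>
#include <thread>

class PollingWorker
{
public:
    // Start a thread that invokes poll_fn once per `period` until stopped.
    PollingWorker(std::function<void()> poll_fn, std::chrono::milliseconds period)
        : mStop(false)
        , mThread(
              [this, poll_fn = std::move(poll_fn), period]
              {
                  while (!mStop)
                  {
                      std::this_thread::sleep_for(period);
                      poll_fn();
                  }
              })
    {
    }

    // Mirrors the deleted destructor: flip the stop flag, then join the thread.
    ~PollingWorker()
    {
        mStop = true;
        mThread.join();
    }

private:
    std::atomic<bool> mStop;
    std::thread mThread;
};

int main()
{
    // Stand-in for WaitForStats(): poll every 100 ms until the worker is destroyed.
    PollingWorker statsWorker([] { std::puts("polling stats"); }, std::chrono::milliseconds(100));
    std::this_thread::sleep_for(std::chrono::milliseconds(350));
    return 0; // ~PollingWorker stops and joins the thread
}

The deleted code checks plain bool flags once per period; the sketch uses std::atomic<bool> so the flag flip is well-defined across threads, but the shutdown behavior is otherwise the same.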
- -#pragma once - -#include "triton/backend/backend_common.h" -#include "triton/core/tritonbackend.h" -#include "triton/core/tritonserver.h" - -#include "tensorrt_llm/batch_manager/callbacks.h" -#include "tensorrt_llm/batch_manager/kvCacheConfig.h" -#include "tensorrt_llm/batch_manager/namedTensor.h" -#include "tensorrt_llm/executor/types.h" - -#include "model_state.h" - -#ifdef TRITON_ENABLE_METRICS -#include "custom_metrics_reporter/custom_metrics_reporter.h" -#endif - -#include -#include -#include - -using namespace tensorrt_llm; -using namespace tensorrt_llm::batch_manager; - -namespace triton::backend::inflight_batcher_llm -{ - -/// @brief Struct to hold configs that is will be used later when creating the executor requests -struct InstanceSpecificConfig -{ - bool excludeInputFromOutput; - int cancellationCheckPeriodMs; - int statsCheckPeriodMs; -}; - -/// @brief Timestamps for each request, used to report Triton metrics -struct Timestamps -{ - uint64_t exec_start_ns = 0; - uint64_t compute_start_ns = 0; - uint64_t compute_end_ns = 0; - uint64_t exec_end_ns = 0; - - void Reset() - { - exec_start_ns = 0; - compute_start_ns = 0; - compute_end_ns = 0; - exec_end_ns = 0; - } -}; - -/// @brief Per-request data stored for handling requests -struct RequestData -{ - TRITONBACKEND_ResponseFactory* factory; - TRITONBACKEND_Request* tritonRequest; - std::string tritonRequestId; - int64_t inputTokensSize; - executor::SizeType32 beamWidth; - std::unordered_set outputNames; - Timestamps timestamps; -}; - -// -// ModelInstanceState -// State associated with a model instance. An object of this class is -// created and associated with each -// TRITONBACKEND_ModelInstance. ModelInstanceState is derived from -// -class ModelInstanceState -{ - using InferenceRequest = tensorrt_llm::batch_manager::InferenceRequest; - using NamedTensor = tensorrt_llm::batch_manager::NamedTensor; - -public: - // number of cpu workers used to move weights host cache to gpu cache - static constexpr executor::SizeType32 kPeftCacheNumEnsureWorkers = 4; - // number of cuda streams used for H2D copies of peft cache pages - static constexpr executor::SizeType32 kPeftCacheNumCopyStreams = 4; - // number of cpu workers used to load weight into host cache - static constexpr executor::SizeType32 kPeftCacheNumPutWorkers = 4; - - /// @brief Create a ModelInstanceObject - static TRITONSERVER_Error* Create( - ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, ModelInstanceState** state); - - virtual ~ModelInstanceState() - { - mStopWaitForResponse = true; - mWaitForResponseThread.join(); - - mStopWaitForStats = true; - mWaitForStatsThread.join(); - - mStopWaitForCancel = true; - mWaitForCancelThread.join(); - } - - // Get the state of the model that corresponds to this instance. 
- ModelState* StateForModel() const - { - return model_state_; - } - - bool isDecoupled() const - { - return model_state_->IsDecoupled(); - } - - /// @brief Add the request to the executor - void enqueue(TRITONBACKEND_Request** requests, uint32_t const request_count); - -private: - /// @brief Get batching type - executor::BatchingType getBatchingTypeFromParams(); - - /// @brief Get kv cache config - executor::KvCacheConfig getKvCacheConfigFromParams(); - - /// @brief Get scheduler config - executor::SchedulerConfig getSchedulerConfigFromParams(bool enableChunkedContext); - - /// @brief Get peft config - executor::PeftCacheConfig getPeftCacheConfigFromParams(); - - /// @brief Get parallel config - executor::ParallelConfig getParallelConfigFromParams(); - - /// @brief Get executor config - executor::ExecutorConfig getExecutorConfigFromParams(); - - /// @brief Constructor - ModelInstanceState(ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance); - - ModelState* model_state_; - TRITONBACKEND_ModelInstance* modelInstance_; - - /// @brief Send a response during enqueue - void sendEnqueueResponse(TRITONBACKEND_Request* request, TRITONSERVER_Error* error); - - /// @brief Cancel a request - bool handleStopRequest(TRITONBACKEND_Request* request, std::string const& tritonRequestId); - - /// @brief Create an executor::Request from input tensors - static executor::Request createExecutorRequest( - TRITONBACKEND_Request* request, bool excludeInputFromOutput, bool isDecoupled, executor::ModelType modelType); - - /// @brief Fill in a triton response based on executor response - std::tuple fillTritonResponse( - TRITONBACKEND_ResponseFactory* factory, executor::Response const& response, RequestData const& requestData); - - /// @brief TRT-LLM Executor that handles requests - std::unique_ptr mExecutor; - /// @brief Config to be used when sending requests to executor - InstanceSpecificConfig mInstanceSpecificConfig; - - /// @brief Report Triton base metrics for a given request - TRITONSERVER_Error* reportBaseMetrics(RequestData& requestData, TRITONSERVER_Error* error); - - /// @brief Retrieve responses from the executor - void WaitForResponse(); - /// @brief The thread for WaitForResponse() to run - std::thread mWaitForResponseThread; - /// @brief Flag to stop the WaitForResponse thread when the model instance is being destroyed - bool mStopWaitForResponse; - - /// @brief Retrieve stats from the executor - void WaitForStats(); - /// @brief The thread for WaitForStats() to run - std::thread mWaitForStatsThread; - /// @brief Flag to stop the WaitForStats thread when the model instance is being destroyed - bool mStopWaitForStats; - - /// @brief Cancel a request for executor if it is marked as cancelled by Triton backend - void WaitForCancel(); - /// @brief The thread for WaitForCancel() to run - std::thread mWaitForCancelThread; - /// @brief Flag to stop the WaitForCancel thread when the model instance is being destroyed - bool mStopWaitForCancel; - - std::unordered_map mRequestIdToRequestData; - std::unordered_map mTritonRequestIdToRequestId; - std::mutex mRequestIdToRequestDataMutex; - - // The type of model (encoder-only, decoder-only, encoder-decoder) - executor::ModelType mModelType; - -#ifdef TRITON_ENABLE_METRICS - std::unique_ptr custom_metrics_reporter_; -#endif -}; - -} // namespace triton::backend::inflight_batcher_llm diff --git a/inflight_batcher_llm/src/model_state.cc b/inflight_batcher_llm/src/model_state.cc deleted file mode 100644 index d0539311..00000000 --- 
a/inflight_batcher_llm/src/model_state.cc +++ /dev/null @@ -1,283 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "model_state.h" - -#include "utils.h" - -#include - -namespace triton::backend::inflight_batcher_llm -{ - -TRITONSERVER_Error* ModelState::Create( - TRITONBACKEND_Model* triton_model, std::string const& name, uint64_t const version, ModelState** state) -{ - TRITONSERVER_Message* config_message; - RETURN_IF_ERROR(TRITONBACKEND_ModelConfig(triton_model, 1 /* config_version */, &config_message)); - // We can get the model configuration as a json string from - // config_message, parse it with our favorite json parser to create - // DOM that we can access when we need to example the - // configuration. We use TritonJson, which is a wrapper that returns - // nice errors (currently the underlying implementation is - // rapidjson... but others could be added). You can use any json - // parser you prefer. 
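Throughout the deleted model_state.cc and model_instance_state.cc, options are read via GetParameter<T>() inside a try/catch that logs a warning and falls back to a default when the option is absent. A simplified, hypothetical sketch of that idiom, with a plain string map standing in for the parsed model-config parameters (getParameterOr and ParameterMap are illustrative names, not the backend's API):

#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

using ParameterMap = std::map<std::string, std::string>;

std::string getRawParameter(ParameterMap const& params, std::string const& name)
{
    auto it = params.find(name);
    if (it == params.end())
    {
        throw std::runtime_error("Cannot find parameter with name: " + name);
    }
    return it->second;
}

// Typed accessor with a default: mirrors the try/catch + warning idiom above.
template <typename T, typename Convert>
T getParameterOr(ParameterMap const& params, std::string const& name, T fallback, Convert convert)
{
    try
    {
        return convert(getRawParameter(params, name));
    }
    catch (std::exception const& e)
    {
        std::fprintf(stderr, "%s is not specified, using default value\n", name.c_str());
        return fallback;
    }
}

int main()
{
    ParameterMap params{{"max_beam_width", "4"}};

    int maxBeamWidth = getParameterOr<int>(
        params, "max_beam_width", 1, [](std::string const& s) { return std::stoi(s); });
    float kvCacheFraction = getParameterOr<float>(
        params, "kv_cache_free_gpu_mem_fraction", 0.9f, [](std::string const& s) { return std::stof(s); });

    std::printf("max_beam_width=%d kv_cache_free_gpu_mem_fraction=%.2f\n", maxBeamWidth, kvCacheFraction);
    return 0;
}

The parameter names and defaults shown (max_beam_width defaulting to 1, kv_cache_free_gpu_mem_fraction defaulting to 0.9) come from the deleted sources; everything else in the sketch is illustrative.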
- char const* buffer; - size_t byte_size; - RETURN_IF_ERROR(TRITONSERVER_MessageSerializeToJson(config_message, &buffer, &byte_size)); - - common::TritonJson::Value model_config; - TRITONSERVER_Error* err = model_config.Parse(buffer, byte_size); - RETURN_IF_ERROR(TRITONSERVER_MessageDelete(config_message)); - RETURN_IF_ERROR(err); - - try - { - *state = new ModelState(triton_model, name, version, std::move(model_config)); - } - catch (std::exception const& ex) - { - std::string errStr = std::string("unexpected error when creating modelState: ") + ex.what(); - return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, errStr.c_str()); - } - - return nullptr; // success -} - -void ModelState::LoadParameters() -{ - // Check if model is in decoupled mode: - triton::common::TritonJson::Value transaction_policy; - model_config_.MemberAsObject("model_transaction_policy", &transaction_policy); - transaction_policy.MemberAsBool("decoupled", &is_decoupled_); - - try - { - gpu_device_ids_ = GetParameter>("gpu_device_ids"); - - if (gpu_device_ids_) - { - std::string deviceIdInfo("Using GPU device ids: "); - for (auto const& deviceId : gpu_device_ids_.value()) - { - deviceIdInfo += std::to_string(deviceId) + " "; - } - TLLM_LOG_INFO(deviceIdInfo); - } - } - catch (std::exception const& e) - { - // If parameter is not specified, just ignore - TLLM_LOG_WARNING("gpu_device_ids is not specified, will be automatically set"); - } -} - -common::TritonJson::Value& ModelState::GetModelConfig() -{ - return model_config_; -} - -std::string const& ModelState::GetModelName() const -{ - return model_name_; -} - -uint64_t ModelState::GetModelVersion() const -{ - return model_version_; -} - -std::string const ModelState::GetExecutorWorkerPath() -{ - - // Check if worker_path is specified, if so throw an error - try - { - auto workerPath = GetParameter("worker_path"); - TLLM_THROW( - "worker_path parameter is specified, but this is no longer supported. 
Please specify executor_worker_path " - "instead to specify the location of the trtllmExecutorWorker executable."); - } - catch (std::exception const& e) - { - } - - std::string executorWorkerPath = "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"; - try - { - executorWorkerPath = GetParameter("executor_worker_path"); - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING("executor_worker_path is not specified, will use default value"); - } - - return executorWorkerPath; -} - -std::vector ModelState::serialize() const -{ - // model name - // model version - // model config - size_t totalSize = 3; - - int nameSize = (model_name_.size() + sizeof(int64_t)) / sizeof(int64_t); - totalSize += nameSize; - - TritonJson::WriteBuffer buffer; - model_config_.Write(&buffer); - - totalSize += buffer.Size(); - - std::vector packed(totalSize); - int64_t* ptr = packed.data(); - - *ptr++ = model_name_.size(); - std::memcpy(ptr, model_name_.c_str(), model_name_.size()); - ptr += nameSize; - - *ptr++ = model_version_; - *ptr++ = buffer.Size(); - std::memcpy(ptr, buffer.Base(), buffer.Size()); - - return packed; -} - -ModelState ModelState::deserialize(int64_t const* packed_ptr) -{ - auto const nameSize = *packed_ptr++; - char const* cname = reinterpret_cast(packed_ptr); - packed_ptr += (nameSize + sizeof(int64_t)) / sizeof(int64_t); - - uint64_t const version = *packed_ptr++; - - auto const jsonSize = *packed_ptr++; - char const* jsonBuffer = reinterpret_cast(packed_ptr); - common::TritonJson::Value model_config; - TRITONSERVER_Error* err = model_config.Parse(jsonBuffer, jsonSize); - if (err) - { - throw std::runtime_error("Failed to parse model config"); - } - - return ModelState{nullptr, cname, version, std::move(model_config)}; -} - -ModelState ModelState::deserialize(std::vector const& packed) -{ - return ModelState::deserialize(packed.data()); -} - -template <> -std::string ModelState::GetParameter(std::string const& name) -{ - TritonJson::Value parameters; - TRITONSERVER_Error* err = model_config_.MemberAsObject("parameters", ¶meters); - if (err != nullptr) - { - throw std::runtime_error("Model config doesn't have a parameters section"); - TRITONSERVER_ErrorDelete(err); - } - TritonJson::Value value; - std::string str_value; - err = parameters.MemberAsObject(name.c_str(), &value); - if (err != nullptr) - { - std::string errStr = "Cannot find parameter with name: " + name; - throw std::runtime_error(errStr); - TRITONSERVER_ErrorDelete(err); - } - value.MemberAsString("string_value", &str_value); - return str_value; -} - -template <> -int32_t ModelState::GetParameter(std::string const& name) -{ - return std::stoi(GetParameter(name)); -} - -template <> -std::vector ModelState::GetParameter>(std::string const& name) -{ - auto deviceIdsStr = GetParameter(name); - // Parse as comma delimited string - return utils::csvStrToVecInt(deviceIdsStr); -} - -template <> -uint32_t ModelState::GetParameter(std::string const& name) -{ - return (uint32_t) std::stoul(GetParameter(name)); -} - -template <> -int64_t ModelState::GetParameter(std::string const& name) -{ - return std::stoll(GetParameter(name)); -} - -template <> -uint64_t ModelState::GetParameter(std::string const& name) -{ - return std::stoull(GetParameter(name)); -} - -template <> -float ModelState::GetParameter(std::string const& name) -{ - return std::stof(GetParameter(name)); -} - -template <> -bool ModelState::GetParameter(std::string const& name) -{ - auto val = GetParameter(name); - if (val == "True" || val == "true" || val == "TRUE" 
|| val == "1") - { - return true; - } - else if (val == "False" || val == "false" || val == "FALSE" || val == "0") - { - return false; - } - else - { - std::string err = "Cannot convert " + val + " to a boolean."; - throw std::runtime_error(err); - } -} - -template <> -std::vector> ModelState::GetParameter>>(std::string const& name) -{ - auto str = GetParameter(name); - // Parse as comma delimited string and {} as array bounders - return utils::csvStrToVecVecInt(str); -} - -} // namespace triton::backend::inflight_batcher_llm diff --git a/inflight_batcher_llm/src/model_state.h b/inflight_batcher_llm/src/model_state.h deleted file mode 100644 index fdd68de9..00000000 --- a/inflight_batcher_llm/src/model_state.h +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#pragma once - -#include "tensorrt_llm/common/logger.h" -#include "tensorrt_llm/plugins/api/tllmPlugin.h" -#include "tensorrt_llm/runtime/tllmLogger.h" - -#include "triton/backend/backend_common.h" -#include "triton/core/tritonbackend.h" -#include "triton/core/tritonserver.h" - -#include - -using namespace ::triton::common; // TritonJson - -namespace triton::backend::inflight_batcher_llm -{ - -// ModelState -// -// State associated with a model that is using this backend. An object -// of this class is created and associated with each -// TRITONBACKEND_Model. 
- -class ModelState -{ -public: - static TRITONSERVER_Error* Create( - TRITONBACKEND_Model* triton_model, std::string const& name, uint64_t const version, ModelState** state); - - template - T GetParameter(std::string const& name) - { - assert(false); - auto dummy = T(); - return dummy; - } - - virtual ~ModelState() = default; - - common::TritonJson::Value& GetModelConfig(); - std::string const& GetModelName() const; - uint64_t GetModelVersion() const; - std::string const GetExecutorWorkerPath(); - - std::optional> GetDeviceIds() - { - return gpu_device_ids_; - } - - bool IsDecoupled() const - { - return is_decoupled_; - } - - [[nodiscard]] std::vector serialize() const; - - static ModelState deserialize(int64_t const* packed_ptr); - - static ModelState deserialize(std::vector const& packed); - -private: - std::string const model_name_; - uint64_t model_version_; - common::TritonJson::Value model_config_; - std::shared_ptr mTrtLogger{}; - - // model parameters - std::optional> gpu_device_ids_; - bool is_decoupled_ = false; - - void LoadParameters(); - -public: - ModelState( - TRITONBACKEND_Model* triton_model, std::string const& name, uint64_t version, TritonJson::Value&& model_config) - : model_name_(name) - , model_version_(version) - , model_config_(std::move(model_config)) - { - mTrtLogger = std::make_shared(); - initTrtLlmPlugins(mTrtLogger.get()); - - LoadParameters(); - } -}; - -template <> -std::string ModelState::GetParameter(std::string const& name); - -template <> -int32_t ModelState::GetParameter(std::string const& name); - -template <> -uint32_t ModelState::GetParameter(std::string const& name); - -template <> -int64_t ModelState::GetParameter(std::string const& name); - -template <> -uint64_t ModelState::GetParameter(std::string const& name); - -template <> -float ModelState::GetParameter(std::string const& name); - -template <> -bool ModelState::GetParameter(std::string const& name); - -template <> -std::vector ModelState::GetParameter>(std::string const& name); - -template <> -std::vector> ModelState::GetParameter>>(std::string const& name); - -} // namespace triton::backend::inflight_batcher_llm diff --git a/inflight_batcher_llm/src/utils.cc b/inflight_batcher_llm/src/utils.cc deleted file mode 100644 index bb611c5a..00000000 --- a/inflight_batcher_llm/src/utils.cc +++ /dev/null @@ -1,620 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "utils.h" - -using namespace tensorrt_llm::batch_manager; - -namespace triton::backend::inflight_batcher_llm::utils -{ - -nvinfer1::DataType to_trt_datatype(TRITONSERVER_DataType data_type) -{ - if (data_type == TRITONSERVER_TYPE_INVALID) - { - assert(false); - } - else if (data_type == TRITONSERVER_TYPE_BOOL) - { - return nvinfer1::DataType::kBOOL; - } - else if (data_type == TRITONSERVER_TYPE_UINT8) - { - return nvinfer1::DataType::kUINT8; - } - else if (data_type == TRITONSERVER_TYPE_UINT16) - { - assert(false); - } - else if (data_type == TRITONSERVER_TYPE_UINT32) - { - return nvinfer1::DataType::kINT32; - } - else if (data_type == TRITONSERVER_TYPE_UINT64) - { - return nvinfer1::DataType::kINT64; - } - else if (data_type == TRITONSERVER_TYPE_INT8) - { - return nvinfer1::DataType::kINT8; - } - else if (data_type == TRITONSERVER_TYPE_INT16) - { - assert(false); - } - else if (data_type == TRITONSERVER_TYPE_INT32) - { - return nvinfer1::DataType::kINT32; - } - else if (data_type == TRITONSERVER_TYPE_INT64) - { - return nvinfer1::DataType::kINT64; - } - else if (data_type == TRITONSERVER_TYPE_FP16) - { - return nvinfer1::DataType::kHALF; - } - else if (data_type == TRITONSERVER_TYPE_FP32) - { - return nvinfer1::DataType::kFLOAT; - } - else if (data_type == TRITONSERVER_TYPE_FP64) - { - assert(false); - } - else if (data_type == TRITONSERVER_TYPE_BYTES) - { - return nvinfer1::DataType::kINT8; - } - else if (data_type == TRITONSERVER_TYPE_BF16) - { - return nvinfer1::DataType::kBF16; - } - else - { - assert(false); - } - return nvinfer1::DataType(0); -} - -std::unordered_map readInputsTensors(TRITONBACKEND_Request* request) -{ - std::unordered_map inputsTensors; - uint32_t num_inputs; - LOG_IF_ERROR(TRITONBACKEND_RequestInputCount(request, &num_inputs), "Error getting input count"); - for (uint32_t idx = 0; idx < num_inputs; ++idx) - { - TRITONBACKEND_Input* input = nullptr; - LOG_IF_ERROR(TRITONBACKEND_RequestInputByIndex(request, idx, &input), "Error getting input index"); - - char const* input_name = nullptr; - TRITONSERVER_DataType data_type = TRITONSERVER_TYPE_INVALID; - int64_t const* shape = nullptr; - uint32_t dims_count = 0; - uint64_t byte_size = 0; - uint32_t buffer_count = 0; - LOG_IF_ERROR(TRITONBACKEND_InputProperties( - input, &input_name, &data_type, &shape, &dims_count, &byte_size, &buffer_count), - "Error getting input properties"); - - if (std::string(input_name) == "START" || std::string(input_name) == "CORRID" - || std::string(input_name) == "END" || std::string(input_name) == kStopInputTensorName - || std::string(input_name) == kStreamingInputTensorName) - { - continue; - } - - std::vector shapev; - for (uint32_t i = 0; i < dims_count; ++i) - { - shapev.push_back(shape[i]); - } - - NamedTensor t(utils::to_trt_datatype(data_type), shapev, input_name); - uint64_t buffer_offset = 0; - for (int64_t buffer_id = 0; buffer_id < buffer_count; ++buffer_id) - { - void const* buffer = nullptr; - uint64_t buffer_byte_size = 0; - 
TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU; - int64_t memory_type_id = 0; - LOG_IF_ERROR( - TRITONBACKEND_InputBuffer(input, buffer_id, &buffer, &buffer_byte_size, &memory_type, &memory_type_id), - "failed to get input buffer"); - assert((memory_type == TRITONSERVER_MEMORY_CPU) || (memory_type == TRITONSERVER_MEMORY_CPU_PINNED)); - std::memcpy(static_cast(t.tensor->data()) + buffer_offset, buffer, buffer_byte_size); - buffer_offset += buffer_byte_size; - } - - inputsTensors.insert(make_pair(t.name, std::move(t))); - } - return inputsTensors; -} - -uint64_t getRequestId(TRITONBACKEND_Request* request, std::unordered_map& requestIdStrMap) -{ - char const* charRequestId; - TRITONBACKEND_RequestId(request, &charRequestId); - uint64_t requestId = 0; - if (charRequestId != nullptr) - { - std::string strRequestId(charRequestId); - if (!strRequestId.empty()) - { - try - { - requestId = stoul(strRequestId); - } - catch (std::exception const& e) - { - std::hash hasher; - requestId = hasher(strRequestId); - - // Check for hash collisions - // If requestID already exists in the map with the same string, increment the ID and check again - for (auto it = requestIdStrMap.find(requestId); - it != requestIdStrMap.end() && it->second != strRequestId;) - { - requestId++; - } - } - requestIdStrMap.insert({requestId, strRequestId}); - } - } - - return requestId; -} - -std::unordered_set getRequestOutputNames(TRITONBACKEND_Request* request) -{ - std::unordered_set outputNames; - uint32_t outputCount; - LOG_IF_ERROR(TRITONBACKEND_RequestOutputCount(request, &outputCount), "Error getting request output count"); - for (size_t i = 0; i < outputCount; ++i) - { - char const* name; - LOG_IF_ERROR(TRITONBACKEND_RequestOutputName(request, i, &name), "Error getting request output name"); - std::string name_s(name); - outputNames.insert(std::move(name_s)); - } - return outputNames; -} - -bool getRequestBooleanInputTensor(TRITONBACKEND_Request* request, std::string const& inputTensorName) -{ - // Get stop signal from the request - TRITONBACKEND_Input* input; - TRITONSERVER_Error* error = TRITONBACKEND_RequestInput(request, inputTensorName.c_str(), &input); - if (error) - { - // If the user does not provide input "stop", then regard the request as - // unstopped - std::string msg - = "ModelInstanceState::getRequestBooleanInputTensor: user " - "did not not provide " - + inputTensorName + " input for the request"; - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, msg.c_str()); - TRITONSERVER_ErrorDelete(error); - return false; - } - - uint64_t input_byte_size = 0; - uint32_t buffer_count = 0; - TRITONBACKEND_InputProperties(input, nullptr, nullptr, nullptr, nullptr, &input_byte_size, &buffer_count); - - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, - ("ModelInstanceState::getRequestStopSignal: buffer_count = " + std::to_string(buffer_count)).c_str()); - - void const* buffer = 0L; - uint64_t buffer_byte_size = 0; - TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU; - int64_t memory_type_id = 0; - TRITONBACKEND_InputBuffer(input, 0, &buffer, &buffer_byte_size, &memory_type, &memory_type_id); - - assert((memory_type == TRITONSERVER_MEMORY_CPU) || (memory_type == TRITONSERVER_MEMORY_CPU_PINNED)); - - bool boolean = *reinterpret_cast(buffer); - - return boolean; -} - -std::string sparseListToStr(executor::VecTokens const& sparseList) -{ - std::string buffer; - for (auto v : sparseList) - { - buffer.append(std::to_string(v) + " "); - } - return buffer; -} - -std::list convertWordList(executor::VecTokens const& 
sparseList) -{ - std::list convertedList; - int32_t n = sparseList.size(); - TLLM_CHECK_WITH_INFO(n % 2 == 0, "Sparse list must not have odd length: " + sparseListToStr(sparseList)); - int32_t numTokens = n / 2; - int32_t currentIndex = 0; - for (auto i = numTokens; i < n; ++i) - { - if (sparseList[i] == -1) - { - for (auto j = i + 1; j < n; ++j) - { - TLLM_CHECK_WITH_INFO( - sparseList[j] == -1, "Sparse list must not have additional -1s: " + sparseListToStr(sparseList)); - } - break; - } - TLLM_CHECK_WITH_INFO(sparseList[i] <= numTokens, - "Sparse list must not have out-of-bound offsets: " + sparseListToStr(sparseList)); - if (i != numTokens) - { - TLLM_CHECK_WITH_INFO(sparseList[i] > sparseList[i - 1], - "Sparse list must not have non-increasing offsets: " + sparseListToStr(sparseList)); - } - executor::VecTokens currentWords; - while (currentIndex < sparseList[i]) - { - currentWords.push_back(sparseList[currentIndex]); - ++currentIndex; - } - convertedList.push_back(currentWords); - } - return convertedList; -} - -void squeezeTensor(std::shared_ptr const& tensor, int32_t expectedNumDims) -{ - auto shape = tensor->getShape(); - if (shape.nbDims == expectedNumDims) - { - return; - } - if (shape.nbDims == expectedNumDims + 1 && shape.d[0] == 1) - { - --shape.nbDims; - for (int32_t i = 0; i < expectedNumDims; ++i) - { - shape.d[i] = shape.d[i + 1]; - } - tensor->reshape(shape); - } - else - { - TLLM_LOG_ERROR("Unexpected prompt tensor shape"); - } -} - -std::vector csvStrToVecInt(std::string const& str) -{ - TLLM_CHECK_WITH_INFO(!str.empty(), "Cannot convert empty string to vector of vector of ints"); - - std::vector output; - std::stringstream ss(str); - while (ss.good()) - { - std::string substr; - ss >> std::ws; - getline(ss, substr, ','); - if (substr.empty()) - { - break; - } - output.push_back(std::stoi(substr)); - } - TLLM_CHECK_WITH_INFO(!output.empty(), "Empty vector"); - return output; -} - -std::vector> csvStrToVecVecInt(std::string const& str) -{ - TLLM_CHECK_WITH_INFO(!str.empty(), "Cannot convert empty string to vector of vector of ints"); - - std::vector> output; - std::stringstream ss(str); - - while (true) - { - std::string substr; - getline(ss, substr, '}'); - if (substr.empty() || ss.eof()) - { - break; - } - if (substr[0] == '{') - { - // Remove the opening bracket from the content - substr = substr.substr(1); - } - output.push_back(csvStrToVecInt(substr)); - // Ignore the comma and any whitespace - ss >> std::ws; - ss.ignore(std::numeric_limits::max(), ','); - ss >> std::ws; - } - TLLM_CHECK_WITH_INFO(!output.empty(), "Empty vector of vector"); - return output; -} - -int64_t numElements(std::vector const& shape) -{ - int64_t n = 1; - for (auto d : shape) - { - n *= d; - } - return n; -} - -executor::SamplingConfig getSamplingConfigFromTensors(InputTensors const& inputsTensors) -{ - int32_t beamWidth = 1; - // If beam_width is specified, set it from config.pbtxt - extractSingleton(inputsTensors, InputFieldsNames::beamWidth, beamWidth); - - std::optional topK{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::topK, topK); - - std::optional topP{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::topP, topP); - if (topP.has_value() && topP.value() <= 0.F) - { - topP.reset(); - } - - std::optional topPMin{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::topPMin, topPMin); - - std::optional topPDecay{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::topPDecay, topPDecay); - - 
std::optional topPResetIds{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::topPResetIds, topPResetIds); - - std::optional temperature{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::temperature, temperature); - - std::optional lengthPenalty{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::lengthPenalty, lengthPenalty); - - std::optional earlyStopping{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::earlyStopping, earlyStopping); - - std::optional repetitionPenalty{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::repetitionPenalty, repetitionPenalty); - - std::optional minLength{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::minLength, minLength); - - std::optional beamSearchDiversityRate{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::beamSearchDiversityRate, beamSearchDiversityRate); - - std::optional presencePenalty{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::presencePenalty, presencePenalty); - - std::optional frequencyPenalty{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::frequencyPenalty, frequencyPenalty); - - std::optional randomSeed{std::nullopt}; - extractOptionalSingleton(inputsTensors, InputFieldsNames::randomSeed, randomSeed); - - return executor::SamplingConfig(beamWidth, topK, topP, topPMin, topPResetIds, topPDecay, randomSeed, temperature, - minLength, beamSearchDiversityRate, repetitionPenalty, presencePenalty, frequencyPenalty, lengthPenalty, - earlyStopping); -} - -executor::OutputConfig getOutputConfigFromTensors(InputTensors const& inputsTensors) -{ - bool returnLogProbs{false}; - extractSingleton(inputsTensors, InputFieldsNames::returnLogProbs, returnLogProbs); - - bool returnGenerationLogits{false}; - extractSingleton(inputsTensors, InputFieldsNames::returnGenerationLogits, returnGenerationLogits); - - bool returnContextLogits{false}; - extractSingleton(inputsTensors, InputFieldsNames::returnContextLogits, returnContextLogits); - - // Note that currently excludeInputFromOutput is set from the backend parameters. 
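The bad_words_list / stop_words_list tensors decoded by convertWordList above use a sparse layout: the first half of the flattened tensor is the concatenation of all token ids, and the second half is the list of strictly increasing end offsets, padded with -1 (commonly shipped as a [2, num_tokens] tensor). A small Python sketch of that round trip, assuming well-formed word lists with at least one token per word; the helper names are illustrative, not part of the backend.

import numpy as np

def encode_word_list(words):
    # words: list of token-id lists, e.g. [[5, 6], [7]].
    flat = [token for word in words for token in word]
    offsets = np.cumsum([len(word) for word in words]).tolist()
    offsets += [-1] * (len(flat) - len(offsets))  # pad so both halves match
    return np.array([flat, offsets], dtype=np.int32)

def decode_word_list(sparse):
    # Inverse direction; follows the checks in utils::convertWordList.
    flat, offsets = sparse[0].tolist(), sparse[1].tolist()
    words, start = [], 0
    for end in offsets:
        if end == -1:
            break
        words.append(flat[start:end])
        start = end
    return words

assert decode_word_list(encode_word_list([[5, 6], [7]])) == [[5, 6], [7]]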
- return executor::OutputConfig(returnLogProbs, returnContextLogits, returnGenerationLogits); -} - -std::optional getExternalDraftTokensConfigFromTensors( - InputTensors const& inputsTensors) -{ - std::optional externalDraftTokensConfig = std::nullopt; - - if (inputsTensors.count(InputFieldsNames::draftInputs)) - { - executor::VecTokens draftInputs; - extractVector(inputsTensors, InputFieldsNames::draftInputs, draftInputs); - - std::optional draftLogits = std::nullopt; - if (inputsTensors.count(InputFieldsNames::draftLogits)) - { - std::shared_ptr originaldraftLogitsTensor - = inputsTensors.at(InputFieldsNames::draftLogits).tensor; - utils::squeezeTensor(originaldraftLogitsTensor, 2); - draftLogits = executor::detail::ofITensor(originaldraftLogitsTensor); - } - - std::optional draftAcceptanceThreshold{std::nullopt}; - utils::extractOptionalSingleton( - inputsTensors, InputFieldsNames::draftAcceptanceThreshold, draftAcceptanceThreshold); - - externalDraftTokensConfig - = executor::ExternalDraftTokensConfig(draftInputs, draftLogits, draftAcceptanceThreshold); - } - return externalDraftTokensConfig; -} - -std::optional getPromptTuningConfigFromTensors(InputTensors const& inputsTensors) -{ - std::optional pTuningConfig = std::nullopt; - if (inputsTensors.count(InputFieldsNames::promptEmbeddingTable)) - { - std::shared_ptr originalTensor - = inputsTensors.at(InputFieldsNames::promptEmbeddingTable).tensor; - utils::squeezeTensor(originalTensor, 2); - auto const& executorTensor = executor::detail::ofITensor(originalTensor); - pTuningConfig = executor::PromptTuningConfig(executorTensor); - } - return pTuningConfig; -} - -std::optional getLoraConfigFromTensors(InputTensors const& inputsTensors) -{ - std::optional loraConfig = std::nullopt; - if (inputsTensors.count(InputFieldsNames::loraTaskId)) - { - uint64_t taskId; - if (!utils::extractSingleton(inputsTensors, InputFieldsNames::loraTaskId, taskId)) - { - throw std::runtime_error("failed to extract lora task id"); - } - - std::optional loraConfigTensor{std::nullopt}; - if (inputsTensors.count(InputFieldsNames::loraConfig)) - { - std::shared_ptr originalLoraConfigTensor - = inputsTensors.at(InputFieldsNames::loraConfig).tensor; - utils::squeezeTensor(originalLoraConfigTensor, 2); - loraConfigTensor = executor::detail::ofITensor(originalLoraConfigTensor); - } - - std::optional loraWeightsTensor{std::nullopt}; - if (inputsTensors.count(InputFieldsNames::loraWeights)) - { - std::shared_ptr originalLoraWeightsTensor - = inputsTensors.at(InputFieldsNames::loraWeights).tensor; - utils::squeezeTensor(originalLoraWeightsTensor, 2); - loraWeightsTensor = executor::detail::ofITensor(originalLoraWeightsTensor); - } - - loraConfig = executor::LoraConfig(taskId, loraWeightsTensor, loraConfigTensor); - } - return loraConfig; -} - -executor::Request createRequestFromInputTensors(std::unordered_map const& inputsTensors, - bool excludeInputFromOutput, bool isDecoupled, bool streaming, executor::ModelType modelType) -{ - executor::OutputConfig outConfig = utils::getOutputConfigFromTensors(inputsTensors); - outConfig.excludeInputFromOutput = excludeInputFromOutput; - - executor::VecTokens inputTokens; - if (!utils::extractVector(inputsTensors, InputFieldsNames::inputTokens, inputTokens)) - { - TLLM_THROW("%s is not present in the request.", InputFieldsNames::inputTokens); - } - executor::SizeType32 maxNewTokens; - if (!utils::extractSingleton(inputsTensors, InputFieldsNames::maxNewTokens, maxNewTokens)) - { - throw std::runtime_error("request_output_len is not 
present in the request"); - } - - std::optional endId{std::nullopt}; - utils::extractOptionalSingleton(inputsTensors, InputFieldsNames::endId, endId); - - std::optional padId{std::nullopt}; - utils::extractOptionalSingleton(inputsTensors, InputFieldsNames::padId, padId); - - std::optional encoderInputTokens{std::nullopt}; - if (modelType == executor::ModelType::kENCODER_ONLY || modelType == executor::ModelType::kENCODER_DECODER) - { - encoderInputTokens = inputTokens; - - // If encoder-decoder, check if decoder tokens are specified - if (modelType == executor::ModelType::kENCODER_DECODER) - { - if (!utils::extractVector(inputsTensors, InputFieldsNames::decoderInputTokens, inputTokens)) - { - if (padId) - { - TLLM_LOG_WARNING( - "%s is not present in the request for encoder-decoder model. The decoder input tokens will be " - "set to " - "[padId]", - InputFieldsNames::decoderInputTokens); - inputTokens = {padId.value()}; - } - else - { - TLLM_LOG_WARNING("%s is not present in the request for encoder-decoder model", - InputFieldsNames::decoderInputTokens); - inputTokens.clear(); - } - } - } - } - - if (streaming && !isDecoupled) - { - throw std::runtime_error( - "Streaming is only supported if model is " - "deployed using decoupled mode."); - } - - auto samplingConfig = utils::getSamplingConfigFromTensors(inputsTensors); - - std::optional> badWords = std::nullopt; - executor::VecTokens badWordsRaw; - if (utils::extractVector(inputsTensors, InputFieldsNames::badWords, badWordsRaw)) - { - badWords = utils::convertWordList(badWordsRaw); - } - - std::optional> stopWords = std::nullopt; - executor::VecTokens stopWordsRaw; - if (utils::extractVector(inputsTensors, InputFieldsNames::stopWords, stopWordsRaw)) - { - stopWords = utils::convertWordList(stopWordsRaw); - } - - std::optional embeddingBias{std::nullopt}; - if (inputsTensors.count(InputFieldsNames::embeddingBias)) - { - std::shared_ptr originalTensor = inputsTensors.at(InputFieldsNames::embeddingBias).tensor; - utils::squeezeTensor(originalTensor, 1); - auto newShape = originalTensor->getShape(); - if (!(newShape.nbDims == 1 && newShape.d[0] == 0)) - { - embeddingBias = executor::detail::ofITensor(originalTensor); - } - } - - auto pTuningConfig = utils::getPromptTuningConfigFromTensors(inputsTensors); - - auto loraConfig = utils::getLoraConfigFromTensors(inputsTensors); - - auto externalDraftTokensConfig = utils::getExternalDraftTokensConfigFromTensors(inputsTensors); - - return executor::Request(inputTokens, maxNewTokens, streaming, samplingConfig, outConfig, endId, padId, badWords, - stopWords, embeddingBias, externalDraftTokensConfig, pTuningConfig, loraConfig, std::nullopt, - encoderInputTokens); -} - -} // namespace triton::backend::inflight_batcher_llm::utils diff --git a/inflight_batcher_llm/src/utils.h b/inflight_batcher_llm/src/utils.h deleted file mode 100644 index 6d7ea384..00000000 --- a/inflight_batcher_llm/src/utils.h +++ /dev/null @@ -1,287 +0,0 @@ -// Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. 
-// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#pragma once - -#include "NvInfer.h" -#include "tensorrt_llm/batch_manager/inferenceRequest.h" -#include "tensorrt_llm/common/logger.h" -#include "tensorrt_llm/runtime/tllmLogger.h" -#include "triton/backend/backend_common.h" -#include "triton/core/tritonbackend.h" -#include "triton/core/tritonserver.h" -#include -#include -#include - -using namespace tensorrt_llm; - -namespace triton::backend::inflight_batcher_llm -{ - -/// @brief Names of input fields -struct InputFieldsNames -{ - static constexpr char const* inputTokens = "input_ids"; - static constexpr char const* decoderInputTokens = "decoder_input_ids"; - static constexpr char const* maxNewTokens = "request_output_len"; - static constexpr char const* endId = "end_id"; - static constexpr char const* padId = "pad_id"; - static constexpr char const* badWords = "bad_words_list"; - static constexpr char const* stopWords = "stop_words_list"; - static constexpr char const* embeddingBias = "embedding_bias"; - - // OutputConfig - static constexpr char const* returnLogProbs = "return_log_probs"; - static constexpr char const* returnGenerationLogits = "return_generation_logits"; - static constexpr char const* returnContextLogits = "return_context_logits"; - - // SamplingConfig - static constexpr char const* beamWidth = "beam_width"; - static constexpr char const* topK = "runtime_top_k"; - static constexpr char const* topP = "runtime_top_p"; - static constexpr char const* topPMin = "runtime_top_k_min"; - static constexpr char const* topPDecay = "runtime_top_p_decay"; - static constexpr char const* topPResetIds = "runtime_top_p_reset_ids"; - static constexpr char const* temperature = "temperature"; - static constexpr char const* lengthPenalty = "len_penalty"; - static constexpr char const* earlyStopping = "early_stopping"; - static constexpr char const* repetitionPenalty = "repetition_penalty"; - static constexpr char const* minLength = "min_length"; - static constexpr char const* beamSearchDiversityRate = "beam_search_diversity_rate"; - static constexpr char const* presencePenalty = "presence_penalty"; - static constexpr char const* frequencyPenalty = "frequency_penalty"; - static constexpr char const* randomSeed = "random_seed"; - - // PromptTuningConfig - static constexpr char const* promptEmbeddingTable = "prompt_embedding_table"; - - // LoraConfig - static constexpr char const* loraTaskId = "lora_task_id"; - static constexpr char const* loraWeights = "lora_weights"; - static constexpr char const* loraConfig = "lora_config"; - - // ExternalDraftTokensConfig - static 
constexpr char const* draftInputs = "draft_input_ids"; - static constexpr char const* draftLogits = "draft_logits"; - static constexpr char const* draftAcceptanceThreshold = "draft_acceptance_threshold"; -}; - -/// @brief Names of output fields -struct OutputFieldsNames -{ - static constexpr char const* outputIds = "output_ids"; - static constexpr char const* sequenceLength = "sequence_length"; - static constexpr char const* contextLogits = "context_logits"; - static constexpr char const* generationLogits = "generation_logits"; - static constexpr char const* outputLogProbs = "output_log_probs"; - static constexpr char const* cumLogProbs = "cum_log_probs"; -}; - -inline static std::string const kStopInputTensorName = "stop"; -inline static std::string const kStreamingInputTensorName = "streaming"; - -namespace utils -{ - -/// @brief Convert Triton datatype to TRT datatype -nvinfer1::DataType to_trt_datatype(TRITONSERVER_DataType data_type); - -using InputTensors = std::unordered_map; - -/// @brief Gather input tenors in a Triton request -/// @return An unordered map with key being input name and value being input tensor -InputTensors readInputsTensors(TRITONBACKEND_Request* request); - -/// @brief Construct executor::SampleConfig from input tensors -executor::SamplingConfig getSamplingConfigFromTensors(InputTensors const& inputsTensors); - -/// @brief Construct executor::OutputConfig from input tensors -executor::OutputConfig getOutputConfigFromTensors(InputTensors const& inputsTensors); - -/// @brief Construct executor::ExternalDraftTokensConfig from input tensors -std::optional getExternalDraftTokensConfigFromTensors( - InputTensors const& inputsTensors); - -/// @brief Construct executor::PromptTuningConfig from input tensors -std::optional getPromptTuningConfigFromTensors(InputTensors const& inputsTensors); - -/// @brief Construct executor::LoraConfig from input tensors -std::optional getLoraConfigFromTensors(InputTensors const& inputsTensors); - -/// @brief Construct executor::Request from input tensors -executor::Request createRequestFromInputTensors( - std::unordered_map const& inputsTensors, - bool excludeInputFromOutput, bool isDecoupled, bool streaming, executor::ModelType modelType); - -/// @brief get the requestId of the request and update requestIdStrMap -/// @return Returns 0 if not specified. Throws an error if request_id cannot be convert to uint64_t -uint64_t getRequestId(TRITONBACKEND_Request* request, std::unordered_map& requestIdStrMap); - -/// @brief Get the requested output names -std::unordered_set getRequestOutputNames(TRITONBACKEND_Request* request); - -/// @brief Get the value of a boolean tensor -bool getRequestBooleanInputTensor(TRITONBACKEND_Request* request, std::string const& inputTensorName); - -/// @brief Get a single value tensor from the input tensors -/// @return true if the value is found else false -template -bool extractSingleton(std::unordered_map const& params, - std::string const& name, Value& value) -{ - if (!params.count(name)) - { - return false; - } - auto const& tensor = params.at(name); - TLLM_CHECK_WITH_INFO(tensor.tensor->getSize() == 1, "Invalid size for tensor " + name); - value = *(static_cast(tensor.tensor->data())); - return true; -} - -/// @brief Get a single value tensor from the input tensors and put it into an optional. Set to std::nullopt if it's not -/// found. 
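The extractSingleton / extractVector helpers defined below encode the per-request convention used throughout utils.cc: each optional field listed in InputFieldsNames arrives as its own small input tensor, a missing tensor leaves the corresponding executor option unset, and a present scalar tensor must hold exactly one element. A minimal Python sketch of the same convention, assuming numpy arrays keyed by the tensor names above; the function names here are illustrative, not part of the backend.

import numpy as np

def extract_singleton(inputs, name, default=None):
    # Counterpart of utils::extractSingleton / extractOptionalSingleton:
    # the tensor must hold exactly one element; a missing name means "unset".
    if name not in inputs:
        return default
    tensor = inputs[name]
    assert tensor.size == 1, "Invalid size for tensor " + name
    return tensor.reshape(-1)[0].item()

def extract_vector(inputs, name):
    # Counterpart of utils::extractVector: flatten to a 1-D Python list.
    return inputs[name].reshape(-1).tolist() if name in inputs else None

request = {
    "input_ids": np.array([[1, 2, 3, 4]], dtype=np.int32),
    "request_output_len": np.array([[64]], dtype=np.int32),
    "temperature": np.array([[0.8]], dtype=np.float32),
}
assert extract_vector(request, "input_ids") == [1, 2, 3, 4]
assert extract_singleton(request, "request_output_len") == 64
assert extract_singleton(request, "runtime_top_k") is None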
-template -void extractOptionalSingleton(std::unordered_map const& params, - std::string const& name, std::optional& optionalValue) -{ - Value value; - if (extractSingleton(params, name, value)) - { - optionalValue = value; - } - else - { - optionalValue = std::nullopt; - } -} - -/// @brief Get a 1d tensor from the input tensors -/// @return true if the tensor is found else false -template -bool extractVector(std::unordered_map const& params, - std::string const& name, std::vector& value) -{ - if (!params.count(name)) - { - return false; - } - auto const& tensor = params.at(name); - int64_t n = tensor.tensor->getSize(); - value.resize(n); - for (int64_t i = 0; i < n; ++i) - { - value[i] = static_cast(tensor.tensor->data())[i]; - } - return true; -} - -int64_t numElements(std::vector const& shape); - -/// @brief Flatten the vector and copy into the buffer -template -void flatten(std::vector const& vec, void* buffer, std::vector const& expectedShape) -{ - TLLM_CHECK_WITH_INFO(static_cast(vec.size()) == numElements(expectedShape), - "Trying to flatten a tensor with unexpected size"); - T* typedBuffer = static_cast(buffer); - std::copy(vec.begin(), vec.end(), typedBuffer); -} - -/// @brief Flatten the vector of vector and copy into the buffer -template -void flatten(std::vector> const& vec, void* buffer, std::vector const& expectedShape) -{ - T* typedBuffer = static_cast(buffer); - int64_t copiedSize = 0; - for (auto const& innerVec : vec) - { - TLLM_CHECK_WITH_INFO(innerVec.size() == vec.at(0).size(), - "The vector of vector to be flattened has mismatched sizes in its inner vectors"); - copiedSize += innerVec.size(); - typedBuffer = std::copy(innerVec.begin(), innerVec.end(), typedBuffer); - } - TLLM_CHECK_WITH_INFO(copiedSize == numElements(expectedShape), "Trying to flatten a tensor with unexpected size"); -} - -/// @brief Flatten the tensor and copy into the buffer -template -void flatten(tensorrt_llm::executor::Tensor const& tensor, void* buffer, std::vector const& expectedShape) -{ - TLLM_CHECK_WITH_INFO(static_cast(tensor.getSize()) == numElements(expectedShape), - "Trying to flatten a tensor with unexpected size"); - Value* typedBuffer = static_cast(buffer); - Value const* ptr = static_cast(tensor.getData()); - std::copy(ptr, ptr + tensor.getSize(), typedBuffer); -} - -/// @brief Query Triton for a buffer that can be used to pass the output tensors -template -void* getResponseBuffer(TRITONBACKEND_Response* tritonResponse, std::vector const& shape, - TRITONSERVER_DataType dtype, std::string const& name) -{ - TRITONBACKEND_Output* output; - TRITONSERVER_Error* err{nullptr}; - err = TRITONBACKEND_ResponseOutput(tritonResponse, &output, name.c_str(), dtype, shape.data(), shape.size()); - if (err != nullptr) - { - auto errMsg = TRITONSERVER_ErrorMessage(err); - TLLM_THROW("Could not get response output for output tensor %s: %s", name.c_str(), errMsg); - } - - TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU; - int64_t memory_type_id = 0; - uint64_t size = 1; - for (auto s : shape) - { - size *= s; - } - auto buffersize = size * sizeof(T); - void* tritonBuffer = 0L; - err = TRITONBACKEND_OutputBuffer(output, &tritonBuffer, buffersize, &memory_type, &memory_type_id); - if (err != nullptr) - { - auto errMsg = TRITONSERVER_ErrorMessage(err); - TLLM_THROW("Could not get output buffer for output tensor %s: %s", name.c_str(), errMsg); - } - return tritonBuffer; -} - -/// @brief Convert a sparse tensor to a list of VecTokens -std::list convertWordList(executor::VecTokens const& 
sparseList); - -/// @brief Remove the additional size 1 dimension for tensor -void squeezeTensor(std::shared_ptr const& tensor, int32_t expectedNumDims); - -/// Helper functions to parse a csv delimited string to a vector ints -std::vector csvStrToVecInt(std::string const& str); - -/// Helper functions to parse a csv delimited string to a vector of vector ints -std::vector> csvStrToVecVecInt(std::string const& str); - -} // namespace utils -} // namespace triton::backend::inflight_batcher_llm diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index de1735f5..00000000 --- a/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -regex -fire -tritonclient[all] -transformers==4.36.1 -pandas -tabulate diff --git a/scripts/launch_triton_server.py b/scripts/launch_triton_server.py deleted file mode 100644 index e0dcc2ef..00000000 --- a/scripts/launch_triton_server.py +++ /dev/null @@ -1,114 +0,0 @@ -import argparse -import os -import subprocess -import sys -from pathlib import Path - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument('--world_size', - type=int, - default=1, - help='world size, only support tensor parallelism now') - parser.add_argument( - '--tritonserver', - type=str, - help='path to the tritonserver exe', - default='/opt/tritonserver/bin/tritonserver', - ) - parser.add_argument( - '--grpc_port', - type=str, - help='tritonserver grpc port', - default='8001', - ) - parser.add_argument( - '--http_port', - type=str, - help='tritonserver http port', - default='8000', - ) - parser.add_argument( - '--metrics_port', - type=str, - help='tritonserver metrics port', - default='8002', - ) - parser.add_argument( - '--force', - '-f', - action='http://github.com/store_true', - help='launch tritonserver regardless of other instances running') - parser.add_argument('--log', - action='http://github.com/store_true', - help='log triton server stats into log_file') - parser.add_argument( - '--log-file', - type=str, - help='path to triton log gile', - default='triton_log.txt', - ) - - path = str(Path(__file__).parent.absolute()) + '/../all_models/gpt' - parser.add_argument('--model_repo', type=str, default=path) - - parser.add_argument( - '--tensorrt_llm_model_name', - type=str, - help= - 'Name(s) of the tensorrt_llm Triton model in the repo. 
Use comma to separate if multiple model names', - default='tensorrt_llm', - ) - - parser.add_argument( - '--multi-model', - action='http://github.com/store_true', - help= - 'Enable support for multiple TRT-LLM models in the Triton model repository' - ) - - return parser.parse_args() - - -def get_cmd(world_size, tritonserver, grpc_port, http_port, metrics_port, - model_repo, log, log_file, tensorrt_llm_model_name): - cmd = ['mpirun', '--allow-run-as-root'] - for i in range(world_size): - cmd += ['-n', '1', tritonserver, f'--model-repository={model_repo}'] - if log and (i == 0): - cmd += ['--log-verbose=3', f'--log-file={log_file}'] - # If rank is not 0, skip loading of models other than `tensorrt_llm_model_name` - if (i != 0): - cmd += ['--model-control-mode=explicit'] - model_names = tensorrt_llm_model_name.split(',') - for name in model_names: - cmd += [f'--load-model={name}'] - cmd += [ - f'--grpc-port={grpc_port}', f'--http-port={http_port}', - f'--metrics-port={metrics_port}', '--disable-auto-complete-config', - f'--backend-config=python,shm-region-prefix-name=prefix{i}_', ':' - ] - return cmd - - -if __name__ == '__main__': - args = parse_arguments() - res = subprocess.run(['pgrep', '-r', 'R', 'tritonserver'], - capture_output=True, - encoding='utf-8') - if res.stdout: - pids = res.stdout.replace('\n', ' ').rstrip() - msg = f'tritonserver process(es) already found with PID(s): {pids}.\n\tUse `kill {pids}` to stop them.' - if args.force: - print(msg, file=sys.stderr) - else: - raise RuntimeError(msg + ' Or use --force.') - cmd = get_cmd(int(args.world_size), args.tritonserver, args.grpc_port, - args.http_port, args.metrics_port, args.model_repo, args.log, - args.log_file, args.tensorrt_llm_model_name) - env = os.environ.copy() - if args.multi_model: - assert args.world_size == 1, 'World size must be 1 when using multi-model. Processes will be spawned automatically to run the multi-GPU models' - env['TRTLLM_ORCHESTRATOR'] = '1' - subprocess.Popen(cmd, env=env) diff --git a/tensorrt_llm b/tensorrt_llm index 9691e12b..31116825 160000 --- a/tensorrt_llm +++ b/tensorrt_llm @@ -1 +1 @@ -Subproject commit 9691e12bce7ae1c126c435a049eb516eb119486c +Subproject commit 31116825b39f4e6a6a1e127001f5204b73d1dc32 diff --git a/tools/__init__.py b/tools/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tools/fill_template.py b/tools/fill_template.py deleted file mode 100644 index 0524f9ef..00000000 --- a/tools/fill_template.py +++ /dev/null @@ -1,38 +0,0 @@ -#! /usr/bin/env python3 -from argparse import ArgumentParser -from string import Template - - -def main(file_path, substitutions, in_place): - with open(file_path) as f: - pbtxt = Template(f.read()) - - sub_dict = {} - for sub in substitutions.split(","): - key, value = sub.split(":") - sub_dict[key] = value - - pbtxt = pbtxt.safe_substitute(sub_dict) - - if in_place: - with open(file_path, "w") as f: - f.write(pbtxt) - else: - print(pbtxt) - - -if __name__ == "__main__": - parser = ArgumentParser() - parser.add_argument("file_path", help="path of the .pbtxt to modify") - parser.add_argument( - "substitutions", - help= - "substitutions to perform, in the format variable_name_1:value_1,variable_name_2:value_2..." 
- ) - parser.add_argument("--in_place", - "-i", - action="http://github.com/store_true", - help="do the operation in-place") - args = parser.parse_args() - - main(**vars(args)) diff --git a/tools/gpt/benchmark_core_model.py b/tools/gpt/benchmark_core_model.py deleted file mode 100644 index fdb4e93a..00000000 --- a/tools/gpt/benchmark_core_model.py +++ /dev/null @@ -1,178 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import argparse -import statistics as s -from builtins import range -from datetime import datetime - -import numpy as np -from utils import utils - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') - parser.add_argument('-w', - '--warm_up', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable warm_up before benchmark') - parser.add_argument('-c', - '--concurrency', - type=int, - default=1, - required=False, - help='Specify concurrency') - parser.add_argument('-p', - '--request_parallelism', - type=int, - default=10, - required=False, - help='Specify request parallelism') - parser.add_argument('-m', - '--mode', - type=str, - required=False, - default='sync', - help='Mode ("sync"/"async").') - parser.add_argument('-b', - '--batch_size', - type=int, - default=8, - required=False, - help='Specify batch size') - parser.add_argument('-beam', - '--beam_width', - type=int, - default=1, - required=False, - help='Specify beam width') - parser.add_argument('-topk', - '--topk', - type=int, - default=1, - required=False, - help='topk for sampling') - parser.add_argument('-topp', - '--topp', - type=float, - default=0.0, - required=False, - help='topp for sampling') - parser.add_argument('-s', - '--start_len', - type=int, - default=8, - required=False, - help='Specify input length') - parser.add_argument('-o', - '--output_len', - type=int, - default=10, - required=False, - help='Specify output length') - parser.add_argument( - '-n', - '--num_runs', - type=int, - default=1, - required=False, - help="Spedifty number of runs to get the average latency") - - FLAGS = parser.parse_args() - if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print( - "unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) - exit(1) - - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - input_start_ids = np.random.randint(0, - 50255, - size=(FLAGS.batch_size, - FLAGS.start_len), - dtype=np.int32) - input_len = np.array([[input_start_ids.shape[1]] - for _ in range(input_start_ids.shape[0])], np.int32) - inputs = utils.prepare_inputs(input_start_ids, - input_len, - pad_id=0, - end_id=2, - flags=FLAGS) - - # warm up - if FLAGS.warm_up: - print("[INFO] sending requests to warm up") - with utils.create_inference_server_client( - FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - utils.send_requests('tensorrt_llm', - inputs, - client, - request_parallelism=2) - - latencies 
= [] - for i in range(FLAGS.num_runs): - start_time = datetime.now() - - with utils.create_inference_server_client( - FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - if FLAGS.mode == 'sync': - utils.send_requests('tensorrt_llm', inputs, client, - FLAGS.request_parallelism) - else: - if FLAGS.protocol == "http": - async_requests = utils.send_requests_async( - 'tensorrt_llm', inputs, client, FLAGS, - FLAGS.request_parallelism) - results = utils.get_http_results(async_requests) - else: - user_data = utils.send_requests_async( - 'tensorrt_llm', inputs, client, FLAGS, - FLAGS.request_parallelism) - results = utils.get_grpc_results(user_data, - FLAGS.request_parallelism) - - stop_time = datetime.now() - latencies.append((stop_time - start_time).total_seconds() * 1000.0 / - FLAGS.request_parallelism) - - if FLAGS.num_runs > 1: - latency = s.mean(latencies) - else: - latency = latencies[0] - latency = round(latency, 3) - throughput = round(1000 / latency * FLAGS.batch_size, 3) - print( - f"[INFO] Batch size: {FLAGS.batch_size}, Start len: {FLAGS.start_len}, Output len: {FLAGS.output_len}" - ) - print(f"[INFO] Latency: {latency} ms") - print(f"[INFO] Throughput: {throughput} sentences / sec") diff --git a/tools/gpt/client.py b/tools/gpt/client.py deleted file mode 100644 index 4c7973a7..00000000 --- a/tools/gpt/client.py +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import argparse -from datetime import datetime - -import numpy as np -from transformers import AutoTokenizer -from utils import utils - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. 
Default is "http".') - parser.add_argument( - '-t', - '--text', - type=str, - required=False, - default='Born in north-east France, Soyer trained as a', - help='Input text') - parser.add_argument('-c', - '--concurrency', - type=int, - default=1, - required=False, - help='Specify concurrency') - parser.add_argument('-beam', - '--beam_width', - type=int, - default=1, - required=False, - help='Specify beam width') - parser.add_argument('-topk', - '--topk', - type=int, - default=1, - required=False, - help='topk for sampling') - parser.add_argument('-topp', - '--topp', - type=float, - default=0.0, - required=False, - help='topp for sampling') - parser.add_argument('-o', - '--output_len', - type=int, - default=10, - required=False, - help='Specify output length') - parser.add_argument('--tokenizer_dir', - type=str, - required=True, - help='Specify tokenizer directory') - - FLAGS = parser.parse_args() - if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print( - "unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) - exit(1) - - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - - tokenizer = AutoTokenizer.from_pretrained(FLAGS.tokenizer_dir, - legacy=False, - padding_side='left') - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - pad_id = tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0] - end_id = tokenizer.encode(tokenizer.eos_token, add_special_tokens=False)[0] - - line = tokenizer.encode(FLAGS.text) - input_start_ids = np.array([line], np.int32) - input_len = np.array([[len(line)]], np.int32) - inputs = utils.prepare_inputs(input_start_ids, input_len, pad_id, end_id, - FLAGS) - - start_time = datetime.now() - - with utils.create_inference_server_client(FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - results = utils.send_requests('tensorrt_llm', - inputs, - client, - request_parallelism=1) - output_ids = results[0].as_numpy("output_ids") - - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - print(f"[INFO] Latency: {latency} ms") - - output_ids = output_ids.reshape( - (output_ids.size, )).tolist()[input_start_ids.shape[1]:] - output_text = tokenizer.decode(output_ids) - print(f'Input: {FLAGS.text}') - print(f'Output: {output_text}') diff --git a/tools/gpt/client_async.py b/tools/gpt/client_async.py deleted file mode 100644 index b2530f3a..00000000 --- a/tools/gpt/client_async.py +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import argparse -from datetime import datetime - -import numpy as np -import tritonclient.grpc as grpcclient -import tritonclient.http as httpclient -from transformers import AutoTokenizer -from utils import utils - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. 
Default is "http".') - parser.add_argument( - '-t', - '--text', - type=str, - required=False, - default='Born in north-east France, Soyer trained as a', - help='Input text') - parser.add_argument('-c', - '--concurrency', - type=int, - default=1, - required=False, - help='Specify concurrency') - parser.add_argument('-beam', - '--beam_width', - type=int, - default=1, - required=False, - help='Specify beam width') - parser.add_argument('-topk', - '--topk', - type=int, - default=1, - required=False, - help='topk for sampling') - parser.add_argument('-topp', - '--topp', - type=float, - default=0.0, - required=False, - help='topp for sampling') - parser.add_argument('-o', - '--output_len', - type=int, - default=10, - required=False, - help='Specify output length') - parser.add_argument('--tokenizer_dir', - type=str, - required=True, - help='Specify tokenizer directory') - - FLAGS = parser.parse_args() - if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print( - "unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) - exit(1) - - client_util = httpclient if FLAGS.protocol == "http" else grpcclient - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - - tokenizer = AutoTokenizer.from_pretrained(FLAGS.tokenizer_dir, - legacy=False, - padding_side='left') - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - pad_id = tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0] - end_id = tokenizer.encode(tokenizer.eos_token, add_special_tokens=False)[0] - - line = tokenizer.encode(FLAGS.text) - input_start_ids = np.array([line], np.int32) - input_len = np.array([[len(line)]], np.int32) - inputs = utils.prepare_inputs(input_start_ids, input_len, pad_id, end_id, - FLAGS) - - start_time = datetime.now() - - with utils.create_inference_server_client(FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - if FLAGS.protocol == "http": - async_requests = utils.send_requests_async('tensorrt_llm', - inputs, - client, - FLAGS, - request_parallelism=1) - results = utils.get_http_results(async_requests) - else: - user_data = utils.send_requests_async('tensorrt_llm', - inputs, - client, - FLAGS, - request_parallelism=1) - results = utils.get_grpc_results(user_data, request_parallelism=1) - output_ids = results[0].as_numpy("output_ids") - - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - print(f"[INFO] Latency: {latency} ms") - - output_ids = output_ids.reshape( - (output_ids.size, )).tolist()[input_start_ids.shape[1]:] - output_text = tokenizer.decode(output_ids) - print(f'Input: {FLAGS.text}') - print(f'Output: {output_text}') diff --git a/tools/gpt/end_to_end_test.py b/tools/gpt/end_to_end_test.py deleted file mode 100644 index c2e411bd..00000000 --- a/tools/gpt/end_to_end_test.py +++ /dev/null @@ -1,254 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) -import argparse - -import numpy as np -from transformers import AutoTokenizer -from utils import utils - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - 
parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') - parser.add_argument('-c', - '--concurrency', - type=int, - default=1, - required=False, - help='Specify concurrency') - parser.add_argument('-beam', - '--beam_width', - type=int, - default=1, - required=False, - help='Specify beam width') - parser.add_argument('-topk', - '--topk', - type=int, - default=1, - required=False, - help='topk for sampling') - parser.add_argument('-topp', - '--topp', - type=float, - default=0.0, - required=False, - help='topp for sampling') - parser.add_argument('-o', - '--output_len', - type=int, - default=10, - required=False, - help='Specify output length') - parser.add_argument('--tokenizer_dir', - type=str, - required=True, - help='Specify tokenizer directory') - - FLAGS = parser.parse_args() - if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print( - "unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) - exit(1) - - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - - tokenizer = AutoTokenizer.from_pretrained(FLAGS.tokenizer_dir, - legacy=False, - padding_side='left') - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - pad_id = tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0] - end_id = tokenizer.encode(tokenizer.eos_token, add_special_tokens=False)[0] - - model_name = 'preprocessing' - with utils.create_inference_server_client(FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - input0 = [["Blackhawks\n The 2015 Hilltoppers"], - ["Data sources you can use to make a decision:"], - ["\n if(angle = 0) { if(angle"], - ["GMs typically get 78% female enrollment, but the "], - ["Previous Chapter | Index | Next Chapter"], - ["Michael, an American Jew, called Jews"], - ["Born in north-east France, Soyer trained as a"], - ["Data sources you can use to make a comparison:"]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.int32) * FLAGS.output_len - bad_words_list = np.array( - [["Hawks, Hawks"], [""], [""], [""], [""], [""], [""], [""]], - dtype=object) - stop_words_list = np.array( - [[""], [""], [""], [""], [""], [""], [""], ["month, month"]], - dtype=object) - inputs = [ - utils.prepare_tensor("QUERY", input0_data, FLAGS.protocol), - utils.prepare_tensor("BAD_WORDS_DICT", bad_words_list, - FLAGS.protocol), - utils.prepare_tensor("STOP_WORDS_DICT", stop_words_list, - FLAGS.protocol), - utils.prepare_tensor("REQUEST_OUTPUT_LEN", output0_len, - FLAGS.protocol), - ] - - try: - result = client.infer(model_name, inputs) - output0 = result.as_numpy("INPUT_ID") - output1 = result.as_numpy("REQUEST_INPUT_LEN") - output2 = result.as_numpy("REQUEST_OUTPUT_LEN") - output3 = result.as_numpy("BAD_WORDS_IDS") - output4 = result.as_numpy("STOP_WORDS_IDS") - except Exception as e: - print(e) - - model_name = "tensorrt_llm" - with utils.create_inference_server_client(FLAGS.protocol, - FLAGS.url, - concurrency=1, - verbose=FLAGS.verbose) as client: - inputs = utils.prepare_inputs(output0, output1, pad_id, end_id, FLAGS) - - try: - result = client.infer(model_name, inputs) - output0 = result.as_numpy("output_ids") - except Exception as e: - print(e) - - model_name = "postprocessing" - with utils.create_inference_server_client(FLAGS.protocol, - 
FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - inputs = [ - utils.prepare_tensor("TOKENS_BATCH", output0, FLAGS.protocol) - ] - inputs[0].set_data_from_numpy(output0) - - try: - result = client.infer(model_name, inputs) - output0 = result.as_numpy("OUTPUT") - print("============After postprocessing============") - batch_size = len(input0) - output0 = output0.reshape([-1, batch_size]).T.tolist() - output0 = [[char.decode('UTF-8') for char in line] - for line in output0] - output0 = [''.join(line) for line in output0] - for line in output0: - print(f"{line}") - print("===========================================\n\n\n") - except Exception as e: - print(e) - - model_name = "ensemble" - with utils.create_inference_server_client(FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) as client: - input0 = [["Blackhawks\n The 2015 Hilltoppers"], - ["Data sources you can use to make a decision:"], - ["\n if(angle = 0) { if(angle"], - ["GMs typically get 78% female enrollment, but the "], - ["Previous Chapter | Index | Next Chapter"], - ["Michael, an American Jew, called Jews"], - ["Born in north-east France, Soyer trained as a"], - ["Data sources you can use to make a comparison:"]] - bad_words_list = np.array( - [["Hawks, Hawks"], [""], [""], [""], [""], [""], [""], [""]], - dtype=object) - stop_words_list = np.array( - [[""], [""], [""], [""], [""], [""], [""], ["month, month"]], - dtype=object) - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.int32) * FLAGS.output_len - runtime_top_k = (FLAGS.topk * - np.ones([input0_data.shape[0], 1])).astype(np.int32) - runtime_top_p = FLAGS.topp * np.ones([input0_data.shape[0], 1]).astype( - np.float32) - temperature = 1.0 * np.ones([input0_data.shape[0], 1]).astype( - np.float32) - len_penalty = 1.0 * np.ones([input0_data.shape[0], 1]).astype( - np.float32) - repetition_penalty = 1.0 * np.ones([input0_data.shape[0], 1]).astype( - np.float32) - random_seed = 0 * np.ones([input0_data.shape[0], 1]).astype(np.uint64) - output_log_probs = True * np.ones([input0_data.shape[0], 1 - ]).astype(bool) - beam_width = (FLAGS.beam_width * - np.ones([input0_data.shape[0], 1])).astype(np.int32) - pad_ids = pad_id * \ - np.ones([input0_data.shape[0], 1]).astype(np.int32) - end_ids = end_id * \ - np.ones([input0_data.shape[0], 1]).astype(np.int32) - min_length = 1 * \ - np.ones([input0_data.shape[0], 1]).astype(np.int32) - presence_penalty = 0.0 * \ - np.ones([input0_data.shape[0], 1]).astype(np.float32) - frequency_penalty = 0.0 * \ - np.ones([input0_data.shape[0], 1]).astype(np.float32) - inputs = [ - utils.prepare_tensor("text_input", input0_data, FLAGS.protocol), - utils.prepare_tensor("max_tokens", output0_len, FLAGS.protocol), - utils.prepare_tensor("bad_words", bad_words_list, FLAGS.protocol), - utils.prepare_tensor("stop_words", stop_words_list, - FLAGS.protocol), - utils.prepare_tensor("pad_id", pad_ids, FLAGS.protocol), - utils.prepare_tensor("end_id", end_ids, FLAGS.protocol), - utils.prepare_tensor("beam_width", beam_width, FLAGS.protocol), - utils.prepare_tensor("top_k", runtime_top_k, FLAGS.protocol), - utils.prepare_tensor("top_p", runtime_top_p, FLAGS.protocol), - utils.prepare_tensor("temperature", temperature, FLAGS.protocol), - utils.prepare_tensor("length_penalty", len_penalty, - FLAGS.protocol), - utils.prepare_tensor("repetition_penalty", repetition_penalty, - FLAGS.protocol), - utils.prepare_tensor("min_length", min_length, FLAGS.protocol), - 
utils.prepare_tensor("presence_penalty", presence_penalty, - FLAGS.protocol), - utils.prepare_tensor("frequency_penalty", frequency_penalty, - FLAGS.protocol), - utils.prepare_tensor("random_seed", random_seed, FLAGS.protocol), - utils.prepare_tensor("output_log_probs", output_log_probs, - FLAGS.protocol), - ] - - try: - result = client.infer(model_name, inputs) - ensemble_output0 = result.as_numpy("text_output") - print("============After ensemble============") - batch_size = len(input0) - ensemble_output0 = ensemble_output0.reshape([-1, batch_size - ]).T.tolist() - ensemble_output0 = [[char.decode('UTF-8') for char in line] - for line in ensemble_output0] - ensemble_output0 = [''.join(line) for line in ensemble_output0] - for line in ensemble_output0: - print(f"{line}") - except Exception as e: - print(e) - - assert output0 == ensemble_output0 diff --git a/tools/gpt/gen_input_data.py b/tools/gpt/gen_input_data.py deleted file mode 100644 index 809771a0..00000000 --- a/tools/gpt/gen_input_data.py +++ /dev/null @@ -1,108 +0,0 @@ -import argparse -import json - -import numpy as np - - -def add_sample(sample, name, array): - sample[name] = {'content': array.flatten().tolist(), 'shape': array.shape} - - -def main(args): - data = {'data': []} - input_start_ids = np.random.randint(0, - 50255, - size=(args.start_len), - dtype=np.int32) - input_len = np.array([input_start_ids.shape[0]], np.int32) - output_len = np.ones([1]).astype(np.int32) * args.output_len - runtime_top_k = (args.topk * np.ones([1])).astype(np.int32) - runtime_top_p = args.topp * np.ones([1]).astype(np.float32) - beam_search_diversity_rate = 0.0 * np.ones([1]).astype(np.float32) - temperature = 1.0 * np.ones([1]).astype(np.float32) - len_penalty = 1.0 * np.ones([1]).astype(np.float32) - repetition_penalty = 1.0 * np.ones([1]).astype(np.float32) - random_seed = 0 * np.ones([1]).astype(np.uint64) - # is_return_log_probs = True * np.ones([1]).astype(bool) - beam_width = (args.beam_width * np.ones([1])).astype(np.int32) - # start_ids = 50256 * np.ones([1]).astype(np.int32) - # end_ids = 50256 * np.ones([1]).astype(np.int32) - # bad_words_list = np.concatenate([ - # np.zeros([1, 1]).astype(np.int32), - # (-1 * np.ones([1, 1])).astype(np.int32) - # ], - # axis=1) - # stop_word_list = np.concatenate([ - # np.zeros([1, 1]).astype(np.int32), - # (-1 * np.ones([1, 1])).astype(np.int32) - # ], - # axis=1) - - for _ in range(args.num_samples): - sample = {} - add_sample(sample, 'input_ids', input_start_ids) - add_sample(sample, 'input_lengths', input_len) - add_sample(sample, 'request_output_len', output_len) - add_sample(sample, 'runtime_top_k', runtime_top_k) - add_sample(sample, 'runtime_top_p', runtime_top_p) - add_sample(sample, 'beam_search_diversity_rate', - beam_search_diversity_rate) - add_sample(sample, 'temperature', temperature) - add_sample(sample, 'len_penalty', len_penalty) - add_sample(sample, 'repetition_penalty', repetition_penalty) - add_sample(sample, 'random_seed', random_seed) - add_sample(sample, 'beam_width', beam_width) - # add_sample(sample, 'top_p_decay', top_p_decay) - # add_sample(sample, 'top_p_min', top_p_min) - # add_sample(sample, 'top_p_reset_ids', top_p_reset_ids) - data['data'].append(sample) - - with open('input_data.json', 'w') as f: - json.dump(data, f, indent=4) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-b', - '--batch_size', - type=int, - default=8, - required=False, - help='Specify batch size') - parser.add_argument('-beam', - '--beam_width', - 
type=int, - default=1, - required=False, - help='Specify beam width') - parser.add_argument('-topk', - '--topk', - type=int, - default=1, - required=False, - help='topk for sampling') - parser.add_argument('-topp', - '--topp', - type=float, - default=0.0, - required=False, - help='topp for sampling') - parser.add_argument('-s', - '--start_len', - type=int, - default=8, - required=False, - help='Specify input length') - parser.add_argument('-o', - '--output_len', - type=int, - default=10, - required=False, - help='Specify output length') - parser.add_argument('--num_samples', - type=int, - default=10000, - required=False, - help='Specify number of samples to generate') - args = parser.parse_args() - main(args) diff --git a/tools/inflight_batcher_llm/benchmark_core_model.py b/tools/inflight_batcher_llm/benchmark_core_model.py deleted file mode 100644 index 3aa53372..00000000 --- a/tools/inflight_batcher_llm/benchmark_core_model.py +++ /dev/null @@ -1,487 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) - -import argparse -import json -import sys -import time -from datetime import datetime -from functools import partial - -import numpy as np -from transformers import AutoTokenizer -from utils import utils - - -def callback(user_data, result, error): - user_data._completed_requests.put((result, error)) - if result is None: - # There was an error. - return - try: - # GRPC - req_id = result.get_response().id - except: - # HTTP - req_id = result.get_response()["id"] - start_time = user_data._start_time_dict[req_id] - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - user_data._latencies.append(latency) - user_data._latency_dict[req_id] = latency - user_data._stop_time_dict[req_id] = stop_time - - -def append_pad_id_to_tensors(pad_id, inputs): - if pad_id is not None: - pad_id_data = np.array([[pad_id]], dtype=np.int32) - else: - pad_id_data = np.ones_like([[1]]).astype(np.int32) * 0 - - inputs += [utils.prepare_tensor("pad_id", pad_id_data, FLAGS.protocol)] - - -def append_end_id_to_tensors(end_id, inputs): - if end_id is not None: - end_id_data = np.array([[end_id]], dtype=np.int32) - else: - end_id_data = np.ones_like([[1]]).astype(np.int32) * 1 - - inputs += [utils.prepare_tensor("end_id", end_id_data, FLAGS.protocol)] - - -def test_performance(client, - input_start_ids, - input_lens, - output_lens, - delays, - FLAGS, - pad_id=None, - end_id=None): - model_name = "tensorrt_llm" - - print(f"[INFO] Warm up for benchmarking.") - if FLAGS.decoupled: - client.start_stream(callback=lambda result, error: None, - stream_timeout=FLAGS.stream_timeout) - for i in range(10): - model_name = FLAGS.tensorrt_llm_model_name[i % len( - FLAGS.tensorrt_llm_model_name)] - output0_len = np.ones_like([[1]]).astype(np.int32) * 100 - inputs = [ - utils.prepare_tensor("input_ids", input_start_ids[0], - FLAGS.protocol), - utils.prepare_tensor("input_lengths", input_lens[0], - FLAGS.protocol), - utils.prepare_tensor("request_output_len", output0_len, - FLAGS.protocol), - ] - - append_pad_id_to_tensors(pad_id, inputs) - append_end_id_to_tensors(end_id, inputs) - if FLAGS.decoupled: - client.async_stream_infer(model_name, inputs, request_id=str(i)) - else: - client.infer(model_name, inputs, request_id=str(i)) - if FLAGS.decoupled: - client.stop_stream() - - print(f"[INFO] Start benchmarking on {len(input_start_ids)} prompts.") - latency = 0 - async_requests = [] - 
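# NOTE (illustrative, not part of the removed file): the benchmark loop below
# sleeps for delays[i] before issuing request i. A minimal, hedged sketch of how
# such inter-arrival delays can be drawn for a target request rate, mirroring
# utils.get_list_of_delays / get_exponential_dist_delays from tools/utils/utils.py
# further down in this diff; the function name here is hypothetical.
import numpy as np

def sketch_delays(request_rate, num_reqs, dist="exponential_dist"):
    """Return per-request sleep times in seconds for a given request rate."""
    if request_rate <= 0:          # -1 means offline/SOL: no pacing between requests
        return [0.0] * num_reqs
    mean_gap = 1.0 / request_rate  # mean time between consecutive requests
    if dist == "constant":
        return [mean_gap] * num_reqs
    np.random.seed(420)            # fixed seed for reproducible benchmark runs
    return np.random.exponential(mean_gap, num_reqs).tolist()

# Example: pace ~8 requests/sec with Poisson-like (exponential) gaps.
# delays = sketch_delays(8.0, num_reqs=100)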
start_time = datetime.now() - user_data = utils.UserData() - - if FLAGS.decoupled: - client.start_stream(callback=partial(callback, user_data), - stream_timeout=FLAGS.stream_timeout) - for i, ids in enumerate(input_start_ids): - model_name = FLAGS.tensorrt_llm_model_name[i % len( - FLAGS.tensorrt_llm_model_name)] - output0_len = np.ones_like([[1]]).astype(np.int32) * output_lens[i] - inputs = [ - utils.prepare_tensor("input_ids", ids, FLAGS.protocol), - utils.prepare_tensor("input_lengths", input_lens[i], - FLAGS.protocol), - utils.prepare_tensor("request_output_len", output0_len, - FLAGS.protocol), - ] - - append_pad_id_to_tensors(pad_id, inputs) - append_end_id_to_tensors(end_id, inputs) - - time.sleep(delays[i]) - - user_data._start_time_dict[str(i)] = datetime.now() - if FLAGS.protocol == "http": - async_requests.append( - client.async_infer(model_name, inputs, request_id=str(i))) - elif FLAGS.protocol == "grpc": - if FLAGS.decoupled: - client.async_stream_infer(model_name, - inputs, - request_id=str(i)) - else: - async_requests.append( - client.async_infer(model_name, - inputs, - callback=partial(callback, user_data), - request_id=str(i))) - if FLAGS.decoupled: - client.stop_stream() - try: - if FLAGS.protocol == "http": - utils.get_http_results(async_requests) - elif FLAGS.protocol == "grpc": - responses = utils.get_grpc_results(user_data, len(input_start_ids)) - else: - raise RuntimeError("Invalid protocol") - - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - print(f"[INFO] Total Latency: {latency} ms") - - # TODO(kaiyu): support `extract_print_stats` for http - data_dict = None - if FLAGS.protocol == "grpc": - request_latencies = 0.0 - for latency in user_data._latencies: - request_latencies += latency - print(f"[INFO] Total request latencies: {request_latencies} ms") - - ip_token_len_list = [] - for ip in input_lens: - ip_token_len_list.append( - ip[0][0]) #for some reason, two level nesting - - data_dict = utils.extract_print_stats(ip_token_len_list, responses, - user_data, FLAGS) - - if FLAGS.check_perf_json: - check_performance(data_dict, FLAGS) - - except Exception as e: - print("Failed receiving responses: " + str(e)) - sys.exit(1) - - -def check_performance(data_dict, FLAGS): - if not data_dict: - print( - "[ERROR] --check-perf-json was used, but no data was collected. Please use grpc protocol." - ) - ref = json.load(open(FLAGS.check_perf_json, "r")) - if FLAGS.check_perf_key not in ref or len(ref[FLAGS.check_perf_key]) == 0: - print( - f"[ERROR] There are no reference numbers for {FLAGS.check_perf_key}, so the performance is not checked. Please add an entry to {FLAGS.check_perf_json}." 
- ) - sys.exit(1) - for metric in ref[FLAGS.check_perf_key]: - if metric not in data_dict: - print(f"[ERROR] Data for '{metric}' was not found.") - np.testing.assert_allclose( - data_dict[metric], - ref[FLAGS.check_perf_key][metric], - rtol=FLAGS.check_perf_rtol, - atol=FLAGS.check_perf_atol, - err_msg= - f"'{metric}' check failed - did not match reference in '{FLAGS.check_perf_json}' for '{FLAGS.check_perf_key}'" - ) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - subparsers = parser.add_subparsers(dest='workload') - - parser_dataset = subparsers.add_parser('dataset') - parser_dataset.add_argument('--dataset', - type=str, - required=True, - help='Dataset path used for the test.') - parser_dataset.add_argument('--tokenizer-dir', - type=str, - required=True, - help='Specify tokenizer directory') - parser_dataset.add_argument('--tokenizer-type', - type=str, - default='auto', - required=False, - choices=['auto', 't5', 'llama'], - help='Specify tokenizer type') - parser_dataset.add_argument( - '--op-tokens-per-word', - type=float, - default=1.3, - required=False, - help= - 'Specify op tokens/word ratio. Useful to have model generate exactly as many tokens as needed by the dataset' - ) - - parser_token_norm_dist = subparsers.add_parser('token-norm-dist') - parser_token_norm_dist.add_argument( - '--input-mean', - type=int, - required=True, - help='normal dist mean for input tokens') - parser_token_norm_dist.add_argument( - '--input-stdev', - type=int, - required=True, - help='normal dist stdev for input tokens') - parser_token_norm_dist.add_argument( - '--output-mean', - type=int, - required=True, - help='normal dist mean for output tokens') - parser_token_norm_dist.add_argument( - '--output-stdev', - type=int, - required=True, - help='normal dist stdev for output tokens') - - parser_token_from_hist = subparsers.add_parser('token-from-histogram') - parser_token_from_hist.add_argument( - '--histogram-key', - type=str, - required=True, - help='key to retrieve histogram buckets,freqs defined in utils') - - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='http', - choices=['http', 'grpc'], - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') - parser.add_argument( - '--decoupled', - action="http://github.com/store_true", - required=False, - default=False, - help= - 'Uses async_stream_infer which allows decoupled backends (must use grpc protocol)' - ), - parser.add_argument( - "-t", - "--stream-timeout", - type=float, - required=False, - default=None, - help="Stream timeout in seconds. Default is None.", - ) - parser.add_argument( - "--tensorrt-llm-model-name", - type=str, - required=False, - default=["tensorrt_llm"], - action="http://github.com/append", - help= - "Specify the name of the TensorRT-LLM model. Can be specified multiple times to use multiple models." - ) - parser.add_argument('-c', - '--concurrency', - type=int, - default=128, - required=False, - help='Specify concurrency') - parser.add_argument('--max-input-len', - type=int, - required=True, - help='Specify max input length') - parser.add_argument('--request-rate', - type=float, - required=False, - help="# of reqs/sec. 
-1 indicates SOL/Offline", - default=-1.0) - parser.add_argument('--time-delay-dist', - type=str, - required=False, - choices=["constant", "exponential_dist"], - default="exponential_dist", - help="# of reqs/sec. -1 indicates SOL/Offline") - parser.add_argument( - '--dump-perfetto-trace', - action="http://github.com/store_true", - required=False, - default=False, - help= - 'Dumps trace of requests in a json (perfetto.json) to be visualized in perfetto' - ), - parser.add_argument('--op-stats-csv', - type=str, - default=None, - help='csv filename to dump stats'), - parser.add_argument( - "--exclude-input-in-output", - action="http://github.com/store_true", - required=False, - default=False, - help="Expect that output IDs do not contain input IDs", - ) - parser.add_argument( - '--num-requests', - type=int, - required=False, - default=30000, - help= - 'For dataset, requests = min(dataset, num_requests). number of requests to be generated by the client' - ) - parser.add_argument( - '--check-perf-json', - type=str, - required=False, - help= - 'If set, this will compare the latency to the value in this file under the key from --check-perf-key' - ) - parser.add_argument( - '--check-perf-key', - type=str, - required=False, - help= - 'Used with --check-perf-json to specify which entry in the file to compare with' - ) - parser.add_argument('--check-perf-atol', - type=float, - required=False, - help="Absolute tolerance for performance check", - default=50) - parser.add_argument('--check-perf-rtol', - type=float, - required=False, - help="Relative tolerance for performance check", - default=0.05) - - FLAGS = parser.parse_args() - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - if FLAGS.decoupled and FLAGS.protocol != 'grpc': - print("Protocol must be set to 'grpc' when using '--decoupled'.") - sys.exit(1) - - try: - client = utils.create_inference_server_client( - FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) - except Exception as e: - print("channel creation failed: " + str(e)) - sys.exit(1) - - if FLAGS.request_rate == -1: - mean_time_bet_reqs = 0 - else: - mean_time_bet_reqs = 1.0 / FLAGS.request_rate - - input_start_ids = [] - input_lens = [] - output_lens = [] - ratio = [] - - print(FLAGS.workload) - if FLAGS.workload == "dataset": - tokenizer = AutoTokenizer.from_pretrained(FLAGS.tokenizer_dir, - legacy=False, - padding_side='left') - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - pad_id = tokenizer.encode(tokenizer.pad_token, - add_special_tokens=False)[0] - end_id = tokenizer.encode(tokenizer.eos_token, - add_special_tokens=False)[0] - - prompt_cnt = 0 - - with open(FLAGS.dataset, 'r') as f: - data_dict = json.load(f) - for req in data_dict: - prompt = req['input'] + ' ' + req['instruction'] - output = req['output'] - line = tokenizer.encode(prompt) - if len(line) > FLAGS.max_input_len: - continue - - prompt_cnt += 1 - if prompt_cnt > FLAGS.num_requests: - break - - input_start_ids.append(np.array([line], np.int32)) - input_lens.append(np.array([[len(line)]], np.int32)) - output_lens.append( - int(len(output.split(' ')) * FLAGS.op_tokens_per_word)) - prompt_tokens = len(line) - prompt_words = len(prompt.split()) - ratio.append(prompt_tokens / prompt_words) - - print("Tokenizer: Tokens per word = ", round(np.mean(ratio), 3)) - num_reqs = len(input_lens) - delays = utils.get_list_of_delays(FLAGS.time_delay_dist, - mean_time_bet_reqs, num_reqs) - 
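# NOTE (illustrative, not part of the removed file): a hedged sketch of the
# dataset preprocessing performed above. Prompts whose estimated token count
# exceeds --max-input-len are skipped, and the requested output length is
# estimated from the reference output with the ~1.3 tokens-per-word heuristic
# used throughout these tools. A real run uses the HF tokenizer for the input
# length check; plain word splitting stands in here. Field names follow the
# dataset layout assumed by this script ("input", "instruction", "output").
import json

def sketch_load_dataset(path, max_input_len, tokens_per_word=1.3):
    prompts, output_lens = [], []
    with open(path) as f:
        for req in json.load(f):
            prompt = req["input"] + " " + req["instruction"]
            # Rough word -> token conversion; skip prompts that would overflow the engine.
            if int(len(prompt.split()) * tokens_per_word) > max_input_len:
                continue
            prompts.append(prompt)
            output_lens.append(int(len(req["output"].split()) * tokens_per_word))
    return prompts, output_lens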
test_performance(client, input_start_ids, input_lens, output_lens, - delays, FLAGS, pad_id, end_id) - - elif FLAGS.workload == "token-norm-dist": - input_lens = utils.get_norm_dist_tokens(FLAGS.input_mean, - FLAGS.input_stdev, - FLAGS.num_requests) - pruned_ip_list = [ - ip_len for ip_len in input_lens if ip_len <= FLAGS.max_input_len - ] - num_reqs = len(pruned_ip_list) - ip_lens_2d_array = [ - np.array([[ip_len]], np.int32) for ip_len in pruned_ip_list - ] - output_lens = utils.get_norm_dist_tokens(FLAGS.output_mean, - FLAGS.output_stdev, num_reqs) - delays = utils.get_list_of_delays(FLAGS.time_delay_dist, - mean_time_bet_reqs, num_reqs) - - input_start_ids = utils.gen_random_start_ids(pruned_ip_list) - test_performance(client, input_start_ids, ip_lens_2d_array, - output_lens, delays, FLAGS) - - elif FLAGS.workload == "token-from-histogram": - input_lens_orig = utils.get_token_list_from_histogram( - FLAGS.histogram_key + "_ip") - output_lens_orig = utils.get_token_list_from_histogram( - FLAGS.histogram_key + "_op") - - final_lens = min(len(input_lens_orig), len(output_lens_orig)) - input_lens = input_lens_orig[:final_lens] - output_lens = output_lens_orig[:final_lens] - - num_reqs = len(input_lens) - ip_lens_2d_array = [ - np.array([[ip_len]], np.int32) for ip_len in input_lens - ] - output_lens = utils.get_token_list_from_histogram(FLAGS.histogram_key + - "_op") - print(len(input_lens), len(output_lens)) - assert (len(input_lens) == len(output_lens)) - - delays = utils.get_list_of_delays(FLAGS.time_delay_dist, - mean_time_bet_reqs, num_reqs) - - input_start_ids = utils.gen_random_start_ids(input_lens) - test_performance(client, input_start_ids, ip_lens_2d_array, - output_lens, delays, FLAGS) diff --git a/tools/inflight_batcher_llm/end_to_end_test.py b/tools/inflight_batcher_llm/end_to_end_test.py deleted file mode 100644 index 9361de46..00000000 --- a/tools/inflight_batcher_llm/end_to_end_test.py +++ /dev/null @@ -1,453 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) - -import argparse -import json -import sys -from datetime import datetime -from functools import partial - -import numpy as np -from utils import utils - - -def callback(user_data, start_time, result, error): - user_data._completed_requests.put((result, error)) - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - user_data._latencies.append(latency) - - -def test_functionality(client, - prompts, - output_lens, - vocabSizePadded=50257, - return_context_logits=False, - return_generation_logits=False, - test_bls=False): - print(f"[INFO] Start testing on {len(prompts)} prompts.") - for i, prompt in enumerate(prompts): - - # 1. 
Ensemble models manually: preprocessing -> tensorrt_llm -> postprocessing - model_name = 'preprocessing' - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.int32) * output_lens[i] - bad_words_list = np.array([[""]], dtype=object) - stop_words_list = np.array([[""]], dtype=object) - - inputs = [ - utils.prepare_tensor("QUERY", input0_data, FLAGS.protocol), - utils.prepare_tensor("BAD_WORDS_DICT", bad_words_list, - FLAGS.protocol), - utils.prepare_tensor("STOP_WORDS_DICT", stop_words_list, - FLAGS.protocol), - utils.prepare_tensor("REQUEST_OUTPUT_LEN", output0_len, - FLAGS.protocol), - ] - result = client.infer(model_name, inputs, request_id=str(i)) - output0 = result.as_numpy("INPUT_ID") - output1 = result.as_numpy("REQUEST_INPUT_LEN") - output2 = result.as_numpy("REQUEST_OUTPUT_LEN") - decoder_input_id = result.as_numpy("DECODER_INPUT_ID") - output_end_id = result.as_numpy("OUT_END_ID") - output_pad_id = result.as_numpy("OUT_PAD_ID") - inputIds = output0 # Use to check context logits shape - - model_name = "tensorrt_llm" - inputs = [ - utils.prepare_tensor("input_ids", output0, FLAGS.protocol), - utils.prepare_tensor("decoder_input_ids", decoder_input_id, - FLAGS.protocol), - utils.prepare_tensor("input_lengths", output1, FLAGS.protocol), - utils.prepare_tensor("request_output_len", output2, - FLAGS.protocol), - utils.prepare_tensor("end_id", output_end_id, FLAGS.protocol), - utils.prepare_tensor("pad_id", output_pad_id, FLAGS.protocol), - ] - if return_context_logits: - return_context_logits_flag = np.array([[True]], dtype=bool) - inputs += [ - utils.prepare_tensor("return_context_logits", - return_context_logits_flag, - FLAGS.protocol), - ] - if return_generation_logits: - return_generation_logits_flag = np.array([[True]], dtype=bool) - inputs += [ - utils.prepare_tensor("return_generation_logits", - return_generation_logits_flag, - FLAGS.protocol), - ] - - result = client.infer(model_name, inputs, request_id=str(i)) - output0 = result.as_numpy("output_ids").astype(np.int32) - seq_lengths = result.as_numpy("sequence_length") - cum_log_probs = result.as_numpy("cum_log_probs").astype(np.float32) - output_log_probs = result.as_numpy("output_log_probs").astype( - np.float32) - context_logits = result.as_numpy("context_logits").astype(np.float32) - generation_logits = result.as_numpy("generation_logits").astype( - np.float32) - - print(f"context_logits.shape: {context_logits.shape}") - print(f"generation_logits.shape: {generation_logits.shape}") - - model_name = "postprocessing" - inputs = [ - utils.prepare_tensor("TOKENS_BATCH", output0, FLAGS.protocol), - utils.prepare_tensor("SEQUENCE_LENGTH", seq_lengths, - FLAGS.protocol), - utils.prepare_tensor("CUM_LOG_PROBS", cum_log_probs, - FLAGS.protocol), - utils.prepare_tensor("OUTPUT_LOG_PROBS", output_log_probs, - FLAGS.protocol), - utils.prepare_tensor("CONTEXT_LOGITS", context_logits, - FLAGS.protocol), - utils.prepare_tensor("GENERATION_LOGITS", generation_logits, - FLAGS.protocol) - ] - inputs[0].set_data_from_numpy(output0) - inputs[1].set_data_from_numpy(seq_lengths) - inputs[2].set_data_from_numpy(cum_log_probs) - inputs[3].set_data_from_numpy(output_log_probs) - inputs[4].set_data_from_numpy(context_logits) - inputs[5].set_data_from_numpy(generation_logits) - - result = client.infer(model_name, inputs, request_id=str(i)) - output0 = result.as_numpy("OUTPUT") - post_gen_logits = result.as_numpy("OUT_GENERATION_LOGITS") - assert (generation_logits == post_gen_logits).all() 
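# NOTE (illustrative, not part of the removed file): the checks that follow
# compare this manually chained preprocessing -> tensorrt_llm -> postprocessing
# run against the "ensemble" and "tensorrt_llm_bls" models, including the logits
# shapes. A hedged sketch of the shapes asserted below; when logits are not
# requested the backend returns 0-filled dummy tensors. The function name and
# the vocab_size_padded default (50257, GPT) are taken from this test's setup.
def sketch_expected_logit_shapes(return_context, return_generation,
                                 prompt_len, output_len,
                                 beam_width=1, vocab_size_padded=50257):
    context_shape = ((1, prompt_len, vocab_size_padded)
                     if return_context else (1, 1, 1))
    generation_shape = ((1, beam_width, output_len, vocab_size_padded)
                        if return_generation else (1, 1, 1, 1))
    return context_shape, generation_shape

# e.g. sketch_expected_logit_shapes(True, False, prompt_len=12, output_len=20)
# -> ((1, 12, 50257), (1, 1, 1, 1))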
- - # 2. Use ensemble model - model_name = "ensemble" - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.int32) * output_lens[i] - bad_words_list = np.array([[""]], dtype=object) - stop_words_list = np.array([[""]], dtype=object) - - inputs = [ - utils.prepare_tensor("text_input", input0_data, FLAGS.protocol), - utils.prepare_tensor("max_tokens", output0_len, FLAGS.protocol), - utils.prepare_tensor("bad_words", bad_words_list, FLAGS.protocol), - utils.prepare_tensor("stop_words", stop_words_list, - FLAGS.protocol), - ] - if return_context_logits: - return_context_logits_flag = np.array([[True]], dtype=bool) - inputs += [ - utils.prepare_tensor("return_context_logits", - return_context_logits_flag, - FLAGS.protocol), - ] - if return_generation_logits: - return_generation_logits_flag = np.array([[True]], dtype=bool) - inputs += [ - utils.prepare_tensor("return_generation_logits", - return_generation_logits_flag, - FLAGS.protocol), - ] - - result = client.infer(model_name, inputs, request_id=str(i)) - - # 3. Check the results between manually ensembled models and the ensemble model - ensemble_output = result.as_numpy('text_output') - ensemble_cum_log_probs = result.as_numpy('cum_log_probs') - ensemble_output_log_probs = result.as_numpy('output_log_probs') - ensemble_context_logits = result.as_numpy('context_logits') - ensemble_generation_logits = result.as_numpy('generation_logits') - - assert output0 == ensemble_output - assert cum_log_probs == ensemble_cum_log_probs - assert (output_log_probs == ensemble_output_log_probs).all() - assert (context_logits == ensemble_context_logits).all() - assert (generation_logits == ensemble_generation_logits).all() - - ensemble_context_logits_shape = ensemble_context_logits.shape - assert (len(ensemble_context_logits_shape) == 3) - if return_context_logits: - # Expect shape [1, prompt_length, vocabSizePadded] - assert (ensemble_context_logits_shape[0] == 1) # One request - assert (ensemble_context_logits_shape[1] == inputIds.size - ) # Prompt length - assert (ensemble_context_logits_shape[2] == vocabSizePadded - ) # VocabSizePadded - else: - # Expect shape [1, 1, 1] - assert (ensemble_context_logits_shape[0] == 1) - assert (ensemble_context_logits_shape[1] == 1) - assert (ensemble_context_logits_shape[2] == 1) - assert (ensemble_context_logits[0][0][0] == 0 - ) # Dummy tensor's value is 0 - - ensemble_generation_logits_shape = ensemble_generation_logits.shape - assert (len(ensemble_generation_logits_shape) == 4) - - if return_generation_logits: - # Expect shape [1, beam_width, output_length, vocabSizePadded] - assert (ensemble_generation_logits_shape[0] == 1) # One request - assert (ensemble_generation_logits_shape[1] == 1 - ) # Beam width (default) - assert (ensemble_generation_logits_shape[2] == output_lens[i] - ) # Output length - assert (ensemble_generation_logits_shape[3] == vocabSizePadded - ) # VocabSizePadded - else: - assert (ensemble_generation_logits_shape[0] == 1) - assert (ensemble_generation_logits_shape[1] == 1) - assert (ensemble_generation_logits_shape[2] == 1) - assert (ensemble_generation_logits_shape[3] == 1) - assert (ensemble_generation_logits[0][0][0][0] == 0 - ) # Dummy tensor's value is 0 - - if test_bls: - # 4. 
Use bls - model_name = "tensorrt_llm_bls" - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype( - np.int32) * output_lens[i] - bad_words_list = np.array([[""]], dtype=object) - stop_words_list = np.array([[""]], dtype=object) - - inputs = [ - utils.prepare_tensor("text_input", input0_data, - FLAGS.protocol), - utils.prepare_tensor("max_tokens", output0_len, - FLAGS.protocol), - utils.prepare_tensor("bad_words", bad_words_list, - FLAGS.protocol), - utils.prepare_tensor("stop_words", stop_words_list, - FLAGS.protocol), - ] - if return_context_logits: - return_context_logits_flag = np.array([[True]], dtype=bool) - inputs += [ - utils.prepare_tensor("return_context_logits", - return_context_logits_flag, - FLAGS.protocol), - ] - if return_generation_logits: - return_generation_logits_flag = np.array([[True]], dtype=bool) - inputs += [ - utils.prepare_tensor("return_generation_logits", - return_generation_logits_flag, - FLAGS.protocol), - ] - - result = client.infer(model_name, inputs, request_id=str(i)) - - # 5. Check the results between manually ensembled models and the bls model - bls_output = result.as_numpy('text_output') - bls_cum_log_probs = result.as_numpy('cum_log_probs') - bls_output_log_probs = result.as_numpy('output_log_probs') - bls_context_logits = result.as_numpy('context_logits') - bls_generation_logits = result.as_numpy('generation_logits') - continue - - assert output0 == bls_output - assert cum_log_probs == bls_cum_log_probs - assert (output_log_probs == bls_output_log_probs).all() - assert (context_logits == bls_context_logits).all() - assert (generation_logits == bls_generation_logits).all() - - bls_context_logits_shape = bls_context_logits.shape - assert (len(bls_context_logits_shape) == 3) - if return_context_logits: - # Expect shape [1, prompt_length, vocabSizePadded] - assert (bls_context_logits_shape[0] == 1) # One request - assert (bls_context_logits_shape[1] == inputIds.size - ) # Prompt length - assert (bls_context_logits_shape[2] == vocabSizePadded - ) # VocabSizePadded - else: - # Expect shape [1, 1, 1] - assert (bls_context_logits_shape[0] == 1) - assert (bls_context_logits_shape[1] == 1) - assert (bls_context_logits_shape[2] == 1) - assert (bls_context_logits[0][0][0] == 0 - ) # Dummy tensor's value is 0 - - bls_generation_logits_shape = bls_generation_logits.shape - assert (len(bls_generation_logits_shape) == 4) - - if return_generation_logits: - # Expect shape [1, beam_width, output_length, vocabSizePadded] - assert (bls_generation_logits_shape[0] == 1) # One request - assert (bls_generation_logits_shape[1] == 1 - ) # Beam width (default) - assert (bls_generation_logits_shape[2] == output_lens[i] - ) # Output length - assert (bls_generation_logits_shape[3] == vocabSizePadded - ) # VocabSizePadded - else: - assert (bls_generation_logits_shape[0] == 1) - assert (bls_generation_logits_shape[1] == 1) - assert (bls_generation_logits_shape[2] == 1) - assert (bls_generation_logits_shape[3] == 1) - assert (bls_generation_logits[0][0][0][0] == 0 - ) # Dummy tensor's value is 0 - - if FLAGS.verbose: - print('Response: {}'.format(result.get_response())) - print('Output: {}'.format(ensemble_output)) - print(f"[INFO] Functionality test succeed.") - - -def test_performance(client, prompts, output_lens): - model_name = "ensemble" - - print(f"[INFO] Warm up for benchmarking.") - for i in range(min(10, len(prompts))): - input0 = [[prompts[0]]] - input0_data = np.array(input0).astype(object) - output0_len = 
np.ones_like(input0).astype(np.int32) * output_lens[i] - bad_words_list = np.array([[""]], dtype=object) - stop_words_list = np.array([[""]], dtype=object) - - inputs = [ - utils.prepare_tensor("text_input", input0_data, FLAGS.protocol), - utils.prepare_tensor("max_tokens", output0_len, FLAGS.protocol), - utils.prepare_tensor("bad_words", bad_words_list, FLAGS.protocol), - utils.prepare_tensor("stop_words", stop_words_list, - FLAGS.protocol), - ] - - client.infer(model_name, inputs, request_id=str(i)) - - print(f"[INFO] Start benchmarking on {len(prompts)} prompts.") - latency = 0 - async_requests = [] - start_time = datetime.now() - user_data = utils.UserData() - for i, prompt in enumerate(prompts): - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.int32) * output_lens[i] - bad_words_list = np.array([[""]], dtype=object) - stop_words_list = np.array([[""]], dtype=object) - - inputs = [ - utils.prepare_tensor("text_input", input0_data, FLAGS.protocol), - utils.prepare_tensor("max_tokens", output0_len, FLAGS.protocol), - utils.prepare_tensor("bad_words", bad_words_list, FLAGS.protocol), - utils.prepare_tensor("stop_words", stop_words_list, - FLAGS.protocol), - ] - - if FLAGS.protocol == "http": - async_requests.append( - client.async_infer(model_name, inputs, request_id=str(i))) - elif FLAGS.protocol == "grpc": - async_requests.append( - client.async_infer(model_name, - inputs, - callback=partial(callback, user_data, - datetime.now()), - request_id=str(i))) - - if FLAGS.protocol == "http": - utils.get_http_results(async_requests) - elif FLAGS.protocol == "grpc": - utils.get_grpc_results(user_data, len(prompts)) - else: - raise RuntimeError("Invalid protocol") - - stop_time = datetime.now() - latency = (stop_time - start_time).total_seconds() * 1000.0 - latency = round(latency, 3) - print(f"[INFO] Total Latency: {latency} ms") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') - parser.add_argument( - '-i', - '--protocol', - type=str, - required=False, - default='http', - choices=['http', 'grpc'], - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. 
Default is "http".') - parser.add_argument('-c', - '--concurrency', - type=int, - default=128, - required=False, - help='Specify concurrency') - parser.add_argument('--max-input-len', - type=int, - required=True, - help='Specify max input length') - - parser.add_argument('--dataset', - type=str, - required=True, - help='Dataset path used for the test.') - - parser.add_argument('--return-context-logits', - action="http://github.com/store_true", - default=False, - help='Return context logits.') - - parser.add_argument('--return-generation-logits', - action="http://github.com/store_true", - default=False, - help='Return generation logits.') - - parser.add_argument('--test-bls', - action="http://github.com/store_true", - default=False, - help="test BLS model") - - FLAGS = parser.parse_args() - if FLAGS.url is None: - FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" - - try: - client = utils.create_inference_server_client( - FLAGS.protocol, - FLAGS.url, - concurrency=FLAGS.concurrency, - verbose=FLAGS.verbose) - except Exception as e: - print("Encountered error: " + str(e)) - sys.exit(1) - - prompts = [] - output_lens = [] - with open(FLAGS.dataset, 'r') as f: - data_dict = json.load(f) - for req in data_dict: - prompt = req['input'] + ' ' + req['instruction'] - output = req['output'] - # 1.3 is a magic number that converts number of words to number of tokens - if int(len(prompt.split(' ')) / 1.3) > FLAGS.max_input_len: - continue - prompts.append(prompt) - # 1.3 is a magic number that converts number of words to number of tokens - output_lens.append(int(len(output.split(' ')) * 1.3)) - - vocabSizePadded = 50257 # gpt - test_functionality(client, prompts, output_lens, vocabSizePadded, - FLAGS.return_context_logits, - FLAGS.return_generation_logits, FLAGS.test_bls) - test_performance(client, prompts, output_lens) diff --git a/tools/inflight_batcher_llm/speculative_decoding_test.py b/tools/inflight_batcher_llm/speculative_decoding_test.py deleted file mode 100644 index 22a99491..00000000 --- a/tools/inflight_batcher_llm/speculative_decoding_test.py +++ /dev/null @@ -1,321 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -utils_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -root_path = os.path.dirname(utils_path) -sys.path.append(utils_path) -sys.path.append(os.path.join(root_path, "inflight_batcher_llm")) - -import argparse -import json -import sys - -import numpy as np -import tritonclient.grpc as grpcclient -from client import e2e_grpc_speculative_decoding_client, end_to_end_grpc_client - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - - parser.add_argument('--url-target', - type=str, - required=True, - help='Inference server URL for the target model') - - parser.add_argument('--url-draft', - type=str, - required=True, - help='Inference server URL for the draft model') - - parser.add_argument('--max-input-len', - type=int, - required=True, - help='Max input length for input prompts') - - parser.add_argument( - '--preprocessor-model-name', - type=str, - required=False, - default="preprocessing", - help='Name of the preprocessor model (should be hosted at url-draft)') - - parser.add_argument( - '--postprocessor-model-name', - type=str, - required=False, - default="postprocessing", - help='Name of the postprocessor model (should be hosted 
at url-target)' - ) - - parser.add_argument( - '--draft-tensorrt-llm-model-name', - type=str, - required=False, - default="tensorrt_llm", - help='Name of the tensorrt_llm draft model (hosted at url-draft)') - - parser.add_argument( - '--target-tensorrt-llm-model-name', - type=str, - required=False, - default="tensorrt_llm", - help='Name of the tensorrt_llm target model (hosted at url-target)') - - parser.add_argument( - '--bls-speculative-tensorrt-llm-model-name', - type=str, - required=False, - default="tensorrt_llm_bls", - help= - 'Name of the tensorrt_llm bls model (only supports the case of url-target == url-draft)' - ) - - parser.add_argument( - '--execute-bls-speculative-decoding', - action='http://github.com/store_true', - help='Executes the BLS speculative decoding model if set') - - parser.add_argument( - "-b", - "--beam-width", - required=False, - type=int, - default=1, - help="Beam width value", - ) - - parser.add_argument( - "--temperature", - type=float, - required=False, - default=1.0, - help="temperature value", - ) - - parser.add_argument( - "--repetition-penalty", - type=float, - required=False, - default=None, - help="The repetition penalty value", - ) - - parser.add_argument( - "--presence-penalty", - type=float, - required=False, - default=None, - help="The presence penalty value", - ) - - parser.add_argument( - "--frequency-penalty", - type=float, - required=False, - default=None, - help="The frequency penalty value", - ) - - parser.add_argument('-o', - '--output-len', - type=int, - default=100, - required=False, - help='Specify output length') - - parser.add_argument( - '--num-draft-tokens', - type=int, - default=5, - required=False, - help= - 'Specify the number of speculative tokens for the draft model to generate per lookahead.' 
- ) - parser.add_argument( - '--use-draft-logits', - default=False, - required=False, - action='http://github.com/store_true', - help='Use logits from draft model when performing speculative decoding' - ) - parser.add_argument('--return-context-logits', - default=False, - required=False, - action='http://github.com/store_true', - help='Return context logits') - parser.add_argument('--return-generation-logits', - default=False, - required=False, - action='http://github.com/store_true', - help='Return generation logits') - - parser.add_argument('--end-id', - type=int, - default=None, - required=False, - help='The end if token') - - parser.add_argument('--pad-id', - type=int, - default=None, - required=False, - help='The pad if token') - - parser.add_argument('--stop-words', - nargs='+', - default=[], - help='The stop words') - - parser.add_argument('--bad-words', - nargs='+', - default=[], - help='The bad words') - - parser.add_argument('--dataset', - type=str, - required=True, - help='Dataset path used for the test.') - - parser.add_argument('--disable-output-comparison', - action='http://github.com/store_true', - required=False, - help='disable output check') - - parser.add_argument( - "--return-draft-model-draft-logits", - action="http://github.com/store_true", - required=False, - default=False, - help= - "Return draft model's draft tokens' logits, require to enable `gather_generation_logits` when build engine" - ) - - parser.add_argument( - "--return-target-model-accepted-token-logits", - action="http://github.com/store_true", - required=False, - default=False, - help= - "Return target model's accepted token logits, require to enable `gather_generation_logits` when build engine", - ) - - FLAGS = parser.parse_args() - if not FLAGS.url_target: - FLAGS.url_target = "localhost:8001" - - if not FLAGS.url_draft: - FLAGS.url_draft = FLAGS.url_target - - try: - client_target = grpcclient.InferenceServerClient(url=FLAGS.url_target) - client_draft = grpcclient.InferenceServerClient( - url=FLAGS.url_draft) if ( - FLAGS.url_target != FLAGS.url_draft) else client_target - except Exception as e: - print("client creation failed: " + str(e)) - sys.exit(1) - - if (FLAGS.beam_width > 1): - raise Exception( - 'Beam width > 1 is not yet supported with speculative decoding') - - request_id = 1 - total_count = 0 - failed_count = 0 - with open(FLAGS.dataset, 'r') as f: - data_dict = json.load(f) - for req in data_dict: - prompt = req['input'] + ' ' + req['instruction'] - output = req['output'] - # 1.3 is a magic number that converts number of words to number of tokens - if int(len(prompt.split(' ')) * 1.3) > FLAGS.max_input_len: - continue - # 1.3 is a magic number that converts number of words to number of tokens - output_len = int(len(output.split(' ')) * 1.3) - if FLAGS.verbose: - print(f"Prompt: {prompt}") - print(f"Output len: {output_len}") - - # Calling target model only - if FLAGS.verbose: - print(f"Calling target model", flush=True) - output_target = end_to_end_grpc_client.run_inference( - client_target, prompt, output_len, str(request_id), - FLAGS.repetition_penalty, FLAGS.presence_penalty, - FLAGS.frequency_penalty, FLAGS.temperature, FLAGS.stop_words, - FLAGS.bad_words, [], [], "ensemble", False, 1, False, None, - None, FLAGS.end_id, FLAGS.pad_id, FLAGS.verbose) - if FLAGS.verbose: - print(f"output_target: {output_target}", flush=True) - print(f"flags: {FLAGS}") - print(f"prompt: {prompt}") - print(f"output_len: {output_len}") - - # Calling BLS 
speculative decoding - if FLAGS.execute_bls_speculative_decoding: - if FLAGS.verbose: - print(f"Calling BLS speculative decoding model", - flush=True) - output_speculative = end_to_end_grpc_client.run_inference( - client_target, prompt, output_len, str(request_id), - FLAGS.repetition_penalty, FLAGS.presence_penalty, - FLAGS.frequency_penalty, FLAGS.temperature, - FLAGS.stop_words, FLAGS.bad_words, [], [], - "tensorrt_llm_bls", False, 1, False, None, - np.array([[FLAGS.return_generation_logits]], dtype=bool), - FLAGS.end_id, FLAGS.pad_id, FLAGS.verbose, - FLAGS.num_draft_tokens, FLAGS.use_draft_logits) - if FLAGS.verbose: - print(f"output_bls_speculative: {output_speculative}", - flush=True) - else: - # Calling client-side coordination of speculative decoding - if FLAGS.verbose: - print(f"Calling speculative client", flush=True) - output_speculative = e2e_grpc_speculative_decoding_client.run_speculative_inference( - client_draft, - client_target, prompt, output_len, FLAGS.num_draft_tokens, - str(request_id), FLAGS.repetition_penalty, - FLAGS.presence_penalty, FLAGS.frequency_penalty, - FLAGS.temperature, FLAGS.stop_words, FLAGS.bad_words, - FLAGS.end_id, FLAGS.pad_id, FLAGS.beam_width, - FLAGS.preprocessor_model_name, - FLAGS.draft_tensorrt_llm_model_name, - FLAGS.target_tensorrt_llm_model_name, - FLAGS.postprocessor_model_name, - FLAGS.return_draft_model_draft_logits, - FLAGS.return_target_model_accepted_token_logits, - FLAGS.verbose) - if FLAGS.verbose: - print(f"output_speculative: {output_speculative}", - flush=True) - - total_count = total_count + 1 - if not FLAGS.disable_output_comparison: - if (output_target != output_speculative): - failed_count = failed_count + 1 - print(f"{total_count}: Outputs don't match") - print(f"Prompt:") - print(f"{prompt}") - print(f"Output target:") - print(f"{output_target}") - print(f"Output speculative:") - print(f"{output_speculative}") - else: - print(f"{total_count}: Outputs match") - else: - print("Not checking output") - if output_speculative == "": - failed_count += 1 - request_id = request_id + 1 - - print(f"failed/total: {failed_count}/{total_count}") - sys.exit(failed_count > 0) diff --git a/tools/utils.sh b/tools/utils.sh deleted file mode 100644 index 27ef4cd1..00000000 --- a/tools/utils.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -# Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on -# success, 1 on failure - -function wait_for_server_ready() { - - local spid="$1"; - local wait_time_secs="${2:-30}"; - local triton_http_port="${3:-8000}" - WAIT_RET=0 - - local wait_secs=$wait_time_secs - until test $wait_secs -eq 0 ; do - if ! kill -0 $spid; then - echo "=== Server not running." - WAIT_RET=1 - return - fi - - sleep 1; - - set +e - code=`curl -s -w %{http_code} localhost:${triton_http_port}/v2/health/ready` - set -e - if [ "$code" == "200" ]; then - return - fi - - ((wait_secs--)); - done - - echo "=== Timeout $wait_time_secs secs. Server not ready." 
- WAIT_RET=1 -} diff --git a/tools/utils/__init__.py b/tools/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tools/utils/utils.py b/tools/utils/utils.py deleted file mode 100644 index 501aac32..00000000 --- a/tools/utils/utils.py +++ /dev/null @@ -1,449 +0,0 @@ -import csv -import json -import math -import queue -import random -from datetime import timedelta -from functools import partial - -import numpy as np -import pandas as pd -import tritonclient.grpc as grpcclient -import tritonclient.http as httpclient -from tabulate import tabulate -from tritonclient.utils import np_to_triton_dtype - - -class UserData: - - def __init__(self): - self._completed_requests = queue.Queue() - self._latencies = [] - self._latency_dict = {} - self._start_time_dict = {} - self._stop_time_dict = {} - - -# Callback function used for async_stream_infer() -def completion_callback(user_data, result, error): - # passing error raise and handling out - user_data._completed_requests.put((result, error)) - - -def prepare_tensor(name, input, protocol): - client_util = httpclient if protocol == "http" else grpcclient - t = client_util.InferInput(name, input.shape, - np_to_triton_dtype(input.dtype)) - t.set_data_from_numpy(input) - return t - - -def prepare_inputs(input_start_ids, input_len, pad_id, end_id, flags): - output_len = np.ones([input_start_ids.shape[0], 1]).astype( - np.int32) * flags.output_len - runtime_top_k = (flags.topk * - np.ones([input_start_ids.shape[0], 1])).astype(np.int32) - runtime_top_p = flags.topp * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.float32) - beam_search_diversity_rate = 0.0 * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.float32) - temperature = 1.0 * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.float32) - len_penalty = 1.0 * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.float32) - repetition_penalty = 1.0 * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.float32) - random_seed = 0 * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.uint64) - output_log_probs = True * \ - np.ones([input_start_ids.shape[0], 1]).astype(bool) - beam_width = (flags.beam_width * - np.ones([input_start_ids.shape[0], 1])).astype(np.int32) - pad_ids = pad_id * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.int32) - end_ids = end_id * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.int32) - min_length = 1 * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.int32) - presence_penalty = 0.0 * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.float32) - frequency_penalty = 0.0 * \ - np.ones([input_start_ids.shape[0], 1]).astype(np.float32) - bad_words_list = np.concatenate([ - np.zeros([input_start_ids.shape[0], 1, 1]).astype(np.int32), - (-1 * np.ones([input_start_ids.shape[0], 1, 1])).astype(np.int32) - ], - axis=1) - stop_word_list = np.concatenate([ - np.zeros([input_start_ids.shape[0], 1, 1]).astype(np.int32), - (-1 * np.ones([input_start_ids.shape[0], 1, 1])).astype(np.int32) - ], - axis=1) - inputs = [ - prepare_tensor("input_ids", input_start_ids, flags.protocol), - prepare_tensor("input_lengths", input_len, flags.protocol), - prepare_tensor("request_output_len", output_len, flags.protocol), - prepare_tensor("pad_id", pad_ids, flags.protocol), - prepare_tensor("end_id", end_ids, flags.protocol), - prepare_tensor("beam_width", beam_width, flags.protocol), - prepare_tensor("temperature", temperature, flags.protocol), - prepare_tensor("runtime_top_k", runtime_top_k, flags.protocol), - prepare_tensor("runtime_top_p", 
runtime_top_p, flags.protocol), - prepare_tensor("len_penalty", len_penalty, flags.protocol), - prepare_tensor("repetition_penalty", repetition_penalty, - flags.protocol), - prepare_tensor("min_length", min_length, flags.protocol), - prepare_tensor("presence_penalty", presence_penalty, flags.protocol), - prepare_tensor("frequency_penalty", frequency_penalty, flags.protocol), - prepare_tensor("random_seed", random_seed, flags.protocol), - prepare_tensor("output_log_probs", output_log_probs, flags.protocol), - # prepare_tensor("bad_words_list", bad_words_list, flags.protocol), - # prepare_tensor("stop_words_list", stop_word_list, flags.protocol), - ] - return inputs - - -def create_inference_server_client(protocol, url, concurrency, verbose): - client_util = httpclient if protocol == "http" else grpcclient - if protocol == "http": - return client_util.InferenceServerClient(url, - concurrency=concurrency, - verbose=verbose) - elif protocol == "grpc": - return client_util.InferenceServerClient(url, verbose=verbose) - - -def send_requests(model_name, inputs, client, request_parallelism): - results = [] - for _ in range(request_parallelism): - result = client.infer(model_name, inputs) - results.append(result) - return results - - -def send_requests_async(model_name, inputs, client, flags, - request_parallelism): - if flags.protocol == "http": - async_requests = [] - for _ in range(request_parallelism): - async_requests.append(client.async_infer(model_name, inputs)) - return async_requests - else: - user_data = UserData() - for _ in range(request_parallelism): - client.async_infer(model_name, inputs, - partial(completion_callback, user_data)) - return user_data - - -def get_http_results(async_requests): - results = [] - for async_request in async_requests: - results.append(async_request.get_result()) - return results - - -def get_grpc_results(user_data, request_parallelism): - results = [] - processed_count = 0 - while processed_count < request_parallelism: - (result, error) = user_data._completed_requests.get() - processed_count += 1 - if error is not None: - raise RuntimeError(error) - results.append(result) - return results - - -def append_start_and_end_ids(inputs, - batch_size, - flags, - start_id=None, - end_id=None): - if start_id is not None: - start_ids = start_id * np.ones([batch_size, 1]).astype(np.int32) - inputs.append(prepare_tensor("start_id", start_ids, flags.protocol)) - if end_id is not None: - end_ids = end_id * np.ones([batch_size, 1]).astype(np.int32) - inputs.append(prepare_tensor("end_id", end_ids, flags.protocol)) - - -def generate_histogram(range_buckets, frequencies): - histogram = [] - - for i in range(len(range_buckets)): - bucket = range_buckets[i] - frequency = frequencies[i] - - # Split the bucket range into min and max values - min_range, max_range = bucket - - # Generate 'frequency' random values within the specified range - random.seed(420) - random_values = [ - random.randint(min_range, max_range) for _ in range(frequency) - ] - - # Extend the histogram with the random values - histogram.extend(random_values) - - # Randomize the order of values in the histogram - random.shuffle(histogram) - - return histogram - - -def get_token_list_from_histogram(histogram_key): - - histogram_buckets = { - "example_ip": [(151, 175), (176, 200), (201, 225), (226, 250), - (251, 275)], - "example_op": [(6, 10), (11, 15), (16, 20), (21, 25), (26, 30)] - } - histogram_freq = { - "example_ip": [220, 225, 150, 150, 140], - "example_op": [76, 210, 174, 130, 152] - } - - range_buckets = 
histogram_buckets[histogram_key] - freqs = histogram_freq[histogram_key] - assert (len(range_buckets) == len(freqs)) - - return generate_histogram(range_buckets, freqs) - - -def get_list_of_delays(delay_dist, mean_time_bet_reqs, num_reqs): - if delay_dist == "constant": - delays = [mean_time_bet_reqs] * num_reqs - elif delay_dist == "exponential_dist": - delays = get_exponential_dist_delays(mean_time_bet_reqs, num_reqs) - - return delays - - -def get_exponential_dist_delays(mean_time_bet_reqs, num_reqs): - # set seed for determinism - np.random.seed(420) - return np.random.exponential(mean_time_bet_reqs, num_reqs).tolist() - - -def get_norm_dist_tokens(mean, stdev, num_reqs): - # set seed for determinism - np.random.seed(420) - numbers_list = np.random.normal(loc=mean, scale=stdev, - size=num_reqs).tolist() - return [max(1, math.ceil(x)) for x in numbers_list] - - -def gen_random_start_ids(ip_lens): - input_start_ids = [] - for ip_len in ip_lens: - start_ids = list( - np.random.randint(low=0, - high=np.iinfo(np.int32).max, - size=ip_len, - dtype=np.int32)) - input_start_ids.append(np.array([start_ids])) - - return input_start_ids - - -def get_list_of_delays(delay_dist, mean_time_bet_reqs, num_reqs): - if delay_dist == "constant": - delays = [mean_time_bet_reqs] * num_reqs - elif delay_dist == "exponential_dist": - delays = get_exponential_dist_delays(mean_time_bet_reqs, num_reqs) - - return delays - - -def get_exponential_dist_delays(mean_time_bet_reqs, num_reqs): - return np.random.exponential(mean_time_bet_reqs, num_reqs).tolist() - - -def get_norm_dist_tokens(mean, stdev, num_reqs): - numbers_list = np.random.normal(loc=mean, scale=stdev, - size=num_reqs).tolist() - return [max(1, math.ceil(x)) for x in numbers_list] - - -def get_inflight_reqs_profile(start_times, end_times, requests_per_sec): - """ - Receives start and end times of all requests, - divides total E2E time into equal intervals and assigns how many requests are in flight - in each interval. - """ - # Calculate min of start time and max of end time - min_start_time = min(start_times) - max_end_time = max(end_times) - - # need to have enough resolution intervals depending on avg. latency per request. 10 times smaller than request processing time - sec_per_request = 1.0 / requests_per_sec - NUM_INTERVALS = int((max_end_time - min_start_time) / - timedelta(seconds=(sec_per_request / 10))) - print(NUM_INTERVALS) - # Calculate interval length - interval_length = (max_end_time - min_start_time) / NUM_INTERVALS - - # Initialize a list to store the count of requests in each interval - interval_counts = [0] * NUM_INTERVALS - - # Iterate through the requests and update interval counts - for i in range(len(start_times)): - start = start_times[i] - end = end_times[i] - - # Calculate which interval the request falls into - interval_index = int((start - min_start_time) / interval_length) - - # Increment the count for that interval and subsequent intervals until end - while start < end and interval_index < NUM_INTERVALS: - interval_counts[interval_index] += 1 - interval_index += 1 - start += interval_length - - return interval_counts - - -def extract_print_stats(ip_token_len_list, responses, user_data, FLAGS): - - #### Gather info about requests - op_token_len_list = [] - op_token_len_ooo = {} - - for response in responses: - #JG: long sequence to extract output length from response json dict. 
Responses are out of order - op_token_len_ooo[response.get_response(as_json=True)['id']] = \ - int(response.get_response(as_json=True)['outputs'][0]['shape'][2]) - - op_token_len_list = [ - value for key, value in sorted(op_token_len_ooo.items()) - ] - - assert (len(op_token_len_list) == len(ip_token_len_list)) - if not FLAGS.exclude_input_in_output: - for i in range(len(op_token_len_list)): - op_token_len_list[i] = op_token_len_list[i] - ip_token_len_list[i] - - # Get latencies per request - # Order latencies based on issue order. - latency_list_in_order = [ - value for key, value in sorted(user_data._latency_dict.items()) - ] - start_time_list_in_order = [ - value for key, value in sorted(user_data._start_time_dict.items()) - ] - stop_time_list_in_order = [ - value for key, value in sorted(user_data._stop_time_dict.items()) - ] - - latency_sorted = np.sort(latency_list_in_order) - index_99 = math.ceil(len(latency_sorted) * 0.99) - index_90 = math.ceil(len(latency_sorted) * 0.90) - - data = { - 'latency': latency_list_in_order, - 'start_time': start_time_list_in_order, - 'stop_time': stop_time_list_in_order, - 'num_ip_tokens': ip_token_len_list, - 'num_op_tokens': op_token_len_list - } - - # Bundle everything in a single DF - df = pd.DataFrame(data) - - #stats - df['num_ip_tokens'].sum() - avg_ip_tokens = df['num_ip_tokens'].mean() - df['num_ip_tokens'].median() - df['num_ip_tokens'].std() - total_op_tokens = df['num_op_tokens'].sum() - avg_op_tokens = df['num_op_tokens'].mean() - df['num_op_tokens'].median() - df['num_op_tokens'].std() - - tend = max(df['stop_time'].tolist()) - t0 = min(df['start_time'].tolist()) - total_latency = (tend - t0).total_seconds() - requests_per_sec = len(responses) / total_latency - tokens_generated_per_sec = total_op_tokens / total_latency - - avg_in_flight_requests = 0 - - print_data_dict = {} - print_data_dict["Requests/Sec"] = requests_per_sec - print_data_dict["OP tokens/sec"] = tokens_generated_per_sec - print_data_dict["Avg. latency (ms)"] = np.mean(latency_list_in_order) - print_data_dict["P99 latency (ms)"] = latency_sorted[index_99 - 1] - print_data_dict["P90 latency (ms)"] = latency_sorted[index_90 - 1] - print_data_dict["Avg. Input tokens per request"] = avg_ip_tokens - print_data_dict["Avg. Output tokens per request"] = avg_op_tokens - print_data_dict["Avg. InFlight requests"] = avg_in_flight_requests - print_data_dict["Total latency (ms)"] = total_latency * 1000 - print_data_dict["Total requests"] = len(responses) - - print_data = [["Requests/Sec", requests_per_sec], - ["OP tokens/sec", tokens_generated_per_sec], - ["Avg. latency (ms)", - np.mean(latency_list_in_order)], - ["P99 latency (ms)", latency_sorted[index_99 - 1]], - ["P90 latency (ms)", latency_sorted[index_90 - 1]], - ["Avg. IP tokens per request", avg_ip_tokens], - ["Avg. OP tokens per request", avg_op_tokens], - ["Avg. 
InFlight requests", avg_in_flight_requests], - ["Total latency (ms)", total_latency * 1000], - ["Total requests", len(responses)]] - - # Format numerical values to 2 decimal places - formatted_data = [[item, f"{value:.2f}"] for item, value in print_data] - headers = ["Stat", "Value"] - table = tabulate(formatted_data, headers=headers, tablefmt="pretty") - - if FLAGS.op_stats_csv is not None: - with open(FLAGS.op_stats_csv, "a", newline="") as file: - filednames = print_data_dict.keys() - writer = csv.DictWriter(file, fieldnames=filednames) - - # Check if the file is empty, and write the header if needed - if file.tell() == 0: - writer.writeheader() - - # Write the dictionaries as new rows - writer.writerow(print_data_dict) - - print(table) - - if FLAGS.dump_perfetto_trace: - json_dict = [] - for i in range(len(op_token_len_list)): - req_dict = {} - req_dict['name'] = 'req_{}'.format(i) - req_dict["cat"] = "batch" - req_dict["ph"] = "X" - req_dict["ts"] = (start_time_list_in_order[i].timestamp() - - t0.timestamp()) * 1000000 #perfetto expects us - req_dict["dur"] = ( - stop_time_list_in_order[i] - - start_time_list_in_order[i]).total_seconds() * 1000000 - req_dict["pid"] = "1" - req_dict["args"] = { - "isl": int(ip_token_len_list[i]), - "osl": int(op_token_len_list[i]) - } - json_dict.append(req_dict) - - with open("prfetto_dump.json", "w") as file: - json.dump(json_dict, file, indent=4) - - return print_data_dict - - -def extract_string_from_nested_list(nested_list): - if isinstance(nested_list, str): - return nested_list - elif isinstance(nested_list, list): - for item in nested_list: - extracted_string = extract_string_from_nested_list(item) - if extracted_string: - return extracted_string - return "" diff --git a/tools/version.txt b/tools/version.txt deleted file mode 100644 index dfc4d6d2..00000000 --- a/tools/version.txt +++ /dev/null @@ -1 +0,0 @@ -73b896d12a81662027fa6746ab3ed99450150e18