From 2d0f250ac295d075a8776071e3923e5b0b7c6ecb Mon Sep 17 00:00:00 2001
From: Yingge He <157551214+yinggeh@users.noreply.github.com>
Date: Wed, 24 Sep 2025 15:07:44 -0700
Subject: [PATCH 1/2] ci: Remove deprecated vLLM config "disable_log_requests" (#96)

---
 ci/L0_backend_vllm/vllm_backend/test.sh          | 4 ++--
 ci/L0_multi_gpu_vllm/multi_lora/test.sh          | 4 ----
 docs/llama_multi_lora_tutorial.md                | 4 +---
 samples/model_repository/vllm_model/1/model.json | 1 -
 4 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/ci/L0_backend_vllm/vllm_backend/test.sh b/ci/L0_backend_vllm/vllm_backend/test.sh
index 87e04b21..690599b5 100755
--- a/ci/L0_backend_vllm/vllm_backend/test.sh
+++ b/ci/L0_backend_vllm/vllm_backend/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -64,7 +64,7 @@ wget -P models/add_sub https://raw.githubusercontent.com/triton-inference-server

 # Invalid model attribute
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_invalid_1/
-sed -i 's/"disable_log_requests"/"invalid_attribute"/' models/vllm_invalid_1/1/model.json
+sed -i 's/"enforce_eager"/"invalid_attribute"/' models/vllm_invalid_1/1/model.json

 # Invalid model name
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_invalid_2/
diff --git a/ci/L0_multi_gpu_vllm/multi_lora/test.sh b/ci/L0_multi_gpu_vllm/multi_lora/test.sh
index bcc52770..8cf2c3fa 100755
--- a/ci/L0_multi_gpu_vllm/multi_lora/test.sh
+++ b/ci/L0_multi_gpu_vllm/multi_lora/test.sh
@@ -110,7 +110,6 @@ export SERVER_ENABLE_LORA=true
 model_json=$(cat <
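Patch 1/2 drops the deprecated vLLM engine option "disable_log_requests" from the sample model.json and from the CI scripts that referenced it. As a minimal, illustrative shell sketch (not part of the patch; the repository path and layout are assumptions), an existing deployment could scan its own model repository for the deprecated key before picking up this change:

# Illustrative check only, assuming model.json files live under ./samples/model_repository.
MODEL_REPO=${MODEL_REPO:-./samples/model_repository}
if grep -rn --include=model.json '"disable_log_requests"' "${MODEL_REPO}"; then
    echo "Deprecated \"disable_log_requests\" entries found; remove them before upgrading vLLM."
else
    echo "No deprecated \"disable_log_requests\" entries found."
fi
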
Date: Mon, 6 Oct 2025 16:45:58 -0700
Subject: [PATCH 2/2] TPRD-1710: Update default branches post-25.09 (#99)

Co-authored-by: Yingge He <157551214+yinggeh@users.noreply.github.com>
---
 ci/L0_additional_outputs_vllm/test.sh     | 1 -
 ci/L0_backend_vllm/test.sh                | 1 -
 ci/L0_check_health_vllm/test.sh           | 1 -
 ci/L0_multi_gpu_vllm/multi_lora/test.sh   | 3 +++
 ci/L0_multi_gpu_vllm/test.sh              | 1 -
 ci/L0_multi_gpu_vllm/vllm_backend/test.sh | 3 +++
 6 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/ci/L0_additional_outputs_vllm/test.sh b/ci/L0_additional_outputs_vllm/test.sh
index a13fdae2..967e771f 100755
--- a/ci/L0_additional_outputs_vllm/test.sh
+++ b/ci/L0_additional_outputs_vllm/test.sh
@@ -30,7 +30,6 @@ source ../common/util.sh

 pip3 install pytest==8.1.1
 pip3 install tritonclient[grpc]
-pip3 install "transformers<=4.53.3" # TODO:DLIS-8441 remove this dependency

 # Prepare Model
 rm -rf models vllm_baseline_output.pkl && mkdir -p models
diff --git a/ci/L0_backend_vllm/test.sh b/ci/L0_backend_vllm/test.sh
index 674b0fa9..b4d27357 100755
--- a/ci/L0_backend_vllm/test.sh
+++ b/ci/L0_backend_vllm/test.sh
@@ -29,7 +29,6 @@ RET=0
 SUBTESTS="accuracy_test request_cancellation enabled_stream vllm_backend metrics_test"

 python3 -m pip install tritonclient[grpc]
-python3 -m pip install "transformers<=4.53.3" # TODO:DLIS-8441 remove this dependency

 for TEST in ${SUBTESTS}; do
     (cd ${TEST} && bash -ex test.sh && cd ..)
diff --git a/ci/L0_check_health_vllm/test.sh b/ci/L0_check_health_vllm/test.sh
index 655b043f..80668bcb 100755
--- a/ci/L0_check_health_vllm/test.sh
+++ b/ci/L0_check_health_vllm/test.sh
@@ -30,7 +30,6 @@ source ../common/util.sh

 pip3 install pytest==8.1.1
 pip3 install tritonclient[grpc]
-pip3 install "transformers<=4.53.3" # TODO:DLIS-8441 remove this dependency

 RET=0

diff --git a/ci/L0_multi_gpu_vllm/multi_lora/test.sh b/ci/L0_multi_gpu_vllm/multi_lora/test.sh
index 8cf2c3fa..c045c4fc 100755
--- a/ci/L0_multi_gpu_vllm/multi_lora/test.sh
+++ b/ci/L0_multi_gpu_vllm/multi_lora/test.sh
@@ -41,6 +41,9 @@ EXPECTED_NUM_TESTS=2
 GENERATE_ENDPOINT="localhost:8000/v2/models/vllm_llama_multi_lora/generate"
 CHECK_FOR_ERROR=true

+export C_INCLUDE_PATH=/usr/local/cuda/include:$C_INCLUDE_PATH
+export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
+
 make_api_call() {
     local endpoint="$1"
     local data="$2"
diff --git a/ci/L0_multi_gpu_vllm/test.sh b/ci/L0_multi_gpu_vllm/test.sh
index 38caa77a..34843139 100755
--- a/ci/L0_multi_gpu_vllm/test.sh
+++ b/ci/L0_multi_gpu_vllm/test.sh
@@ -29,7 +29,6 @@ RET=0
 SUBTESTS="vllm_backend multi_lora"

 python3 -m pip install tritonclient[grpc]
-python3 -m pip install "transformers<=4.53.3" # TODO:DLIS-8441 remove this dependency

 for TEST in ${SUBTESTS}; do
     (cd ${TEST} && bash -ex test.sh && cd ..)
diff --git a/ci/L0_multi_gpu_vllm/vllm_backend/test.sh b/ci/L0_multi_gpu_vllm/vllm_backend/test.sh
index 0609bebf..e4de2ad2 100755
--- a/ci/L0_multi_gpu_vllm/vllm_backend/test.sh
+++ b/ci/L0_multi_gpu_vllm/vllm_backend/test.sh
@@ -36,6 +36,9 @@ CLIENT_PY="./vllm_multi_gpu_test.py"
 SAMPLE_MODELS_REPO="../../../samples/model_repository"
 EXPECTED_NUM_TESTS=1

+export C_INCLUDE_PATH=/usr/local/cuda/include:$C_INCLUDE_PATH
+export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
+
 ### Helpers
 function validate_file_contains() {
     local KEY="${1}"
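
Patch 2/2 removes the temporary "transformers<=4.53.3" pin from the CI installs and exports C_INCLUDE_PATH and TRITON_PTXAS_PATH in the multi-GPU tests, presumably so kernel compilation can locate the CUDA headers and ptxas under /usr/local/cuda. A minimal sketch of a guarded variant of those exports (the existence checks are an addition, not part of the patch):

# Guarded variant of the exports added in patch 2/2: only point the build at
# CUDA paths that actually exist on the runner (the guards are an assumption).
if [ -d /usr/local/cuda/include ]; then
    export C_INCLUDE_PATH=/usr/local/cuda/include:${C_INCLUDE_PATH}
fi
if [ -x /usr/local/cuda/bin/ptxas ]; then
    export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
fi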