2 changes: 1 addition & 1 deletion examples/models/core/llama/summarize_long.py
@@ -23,7 +23,7 @@

import tensorrt_llm
import tensorrt_llm.profiler as profiler
from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.logger import logger
from tensorrt_llm.quantization import QuantMode

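Note: this one-line import move recurs across the example scripts below. A minimal compatibility sketch for scripts that need to run against releases from both before and after the move (the try/except fallback is an assumption for illustration, not something this PR adds):

```python
# Sketch only (not part of this PR): guard the import so a script runs on
# releases from before and after KVCacheType moved out of the C++ bindings.
try:
    from tensorrt_llm.llmapi.kv_cache_type import KVCacheType  # new location
except ImportError:
    from tensorrt_llm.bindings import KVCacheType  # older releases
```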
2 changes: 1 addition & 1 deletion examples/models/core/qwen2audio/run.py
@@ -27,7 +27,7 @@
import tensorrt_llm
import tensorrt_llm.profiler as profiler
from tensorrt_llm import logger
from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime import (PYTHON_BINDINGS, ModelConfig, ModelRunner,
SamplingConfig, Session, TensorInfo)
2 changes: 1 addition & 1 deletion examples/models/core/qwenvl/run.py
@@ -25,7 +25,7 @@
import tensorrt_llm
import tensorrt_llm.profiler as profiler
from tensorrt_llm import logger
from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime import (ModelConfig, SamplingConfig, Session,
TensorInfo)
3 changes: 2 additions & 1 deletion examples/models/core/whisper/run.py
@@ -33,7 +33,8 @@
import tensorrt_llm.logger as logger
from tensorrt_llm._utils import (str_dtype_to_torch, str_dtype_to_trt,
trt_dtype_to_torch)
from tensorrt_llm.bindings import GptJsonConfig, KVCacheType
from tensorrt_llm.bindings import GptJsonConfig
from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelConfig, SamplingConfig
from tensorrt_llm.runtime.session import Session, TensorInfo

6 changes: 2 additions & 4 deletions tensorrt_llm/_torch/auto_deploy/llm_args.py
@@ -9,7 +9,6 @@
from tensorrt_llm.models.modeling_utils import QuantConfig

from ...llmapi.llm_args import BaseLlmArgs, BuildConfig, KvCacheConfig, _ParallelConfig
from ...llmapi.utils import get_type_repr
from .models import ModelFactory, ModelFactoryRegistry
from .utils._config import DynamicYamlMixInForSettings

@@ -294,12 +293,11 @@ class LlmArgs(AutoDeployConfig, BaseLlmArgs, BaseSettings):

model_config = _get_config_dict()

build_config: Optional[object] = Field(
default_factory=lambda: BuildConfig(),
build_config: Optional[BuildConfig] = Field(
default_factory=BuildConfig,
description="!!! DO NOT USE !!! Internal only; needed for BaseLlmArgs compatibility.",
exclude_from_json=True,
frozen=True,
json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"},
repr=False,
)
backend: Literal["_autodeploy"] = Field(
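Note: once the field is annotated as `Optional[BuildConfig]`, Pydantic can derive the JSON schema from the type itself, which is likely why the `get_type_repr` import and the manual `json_schema_extra` hint are dropped. A minimal sketch under assumed, simplified classes (not the real `LlmArgs`):

```python
# Minimal sketch, assuming Pydantic v2; BuildConfig here is a stand-in with an
# illustrative default, not the real tensorrt_llm.builder.BuildConfig.
from typing import Optional
from pydantic import BaseModel, Field

class BuildConfig(BaseModel):
    max_batch_size: int = 2048  # illustrative only

class ArgsSketch(BaseModel):
    build_config: Optional[BuildConfig] = Field(
        default_factory=BuildConfig,  # typed default; no lambda wrapper needed
        frozen=True,
        repr=False,
    )

print(ArgsSketch().build_config.max_batch_size)  # 2048
```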
4 changes: 2 additions & 2 deletions tensorrt_llm/bench/build/build.py
@@ -22,8 +22,8 @@
QuantAlgo.NVFP4, QuantAlgo.FP8, QuantAlgo.FP8_BLOCK_SCALES,
QuantAlgo.NO_QUANT, None
}
DEFAULT_MAX_BATCH_SIZE = BuildConfig.max_batch_size
DEFAULT_MAX_NUM_TOKENS = BuildConfig.max_num_tokens
DEFAULT_MAX_BATCH_SIZE = BuildConfig.model_fields["max_batch_size"].default
DEFAULT_MAX_NUM_TOKENS = BuildConfig.model_fields["max_num_tokens"].default


def get_benchmark_engine_settings(
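Note: this lookup change suggests `BuildConfig` is now a Pydantic model, where class-level attribute access no longer yields the plain default value; defaults are read from `model_fields` instead. A sketch with assumed, illustrative fields:

```python
# Sketch (assumes BuildConfig is now a Pydantic v2 model; values illustrative).
from pydantic import BaseModel

class BuildConfig(BaseModel):
    max_batch_size: int = 2048
    max_num_tokens: int = 8192

# Class-level access such as BuildConfig.max_batch_size no longer returns the
# default, so read it from the FieldInfo entries in model_fields:
DEFAULT_MAX_BATCH_SIZE = BuildConfig.model_fields["max_batch_size"].default
DEFAULT_MAX_NUM_TOKENS = BuildConfig.model_fields["max_num_tokens"].default
print(DEFAULT_MAX_BATCH_SIZE, DEFAULT_MAX_NUM_TOKENS)  # 2048 8192
```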
337 changes: 121 additions & 216 deletions tensorrt_llm/builder.py

Large diffs are not rendered by default.

174 changes: 75 additions & 99 deletions tensorrt_llm/commands/build.py
@@ -28,8 +28,9 @@
mpi_comm, mpi_rank, mpi_world_size)
from tensorrt_llm.auto_parallel import infer_cluster_config
from tensorrt_llm.auto_parallel.cluster_info import cluster_infos
from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.auto_parallel.config import AutoParallelConfig
from tensorrt_llm.builder import BuildConfig, Engine, build
from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.logger import logger, severity_map
from tensorrt_llm.lora_helper import LoraConfig
from tensorrt_llm.lora_manager import LoraManager
@@ -94,29 +95,30 @@ def parse_arguments():
parser.add_argument(
'--max_batch_size',
type=int,
default=BuildConfig.max_batch_size,
default=BuildConfig.model_fields["max_batch_size"].default,
help="Maximum number of requests that the engine can schedule.")
parser.add_argument('--max_input_len',
type=int,
default=BuildConfig.max_input_len,
help="Maximum input length of one request.")
parser.add_argument(
'--max_input_len',
type=int,
default=BuildConfig.model_fields["max_input_len"].default,
help="Maximum input length of one request.")
parser.add_argument(
'--max_seq_len',
'--max_decoder_seq_len',
dest='max_seq_len',
type=int,
default=BuildConfig.max_seq_len,
default=BuildConfig.model_fields["max_seq_len"].default,
help="Maximum total length of one request, including prompt and outputs. "
"If unspecified, the value is deduced from the model config.")
parser.add_argument(
'--max_beam_width',
type=int,
default=BuildConfig.max_beam_width,
default=BuildConfig.model_fields["max_beam_width"].default,
help="Maximum number of beams for beam search decoding.")
parser.add_argument(
'--max_num_tokens',
type=int,
default=BuildConfig.max_num_tokens,
default=BuildConfig.model_fields["max_num_tokens"].default,
help=
"Maximum number of batched input tokens after padding is removed in each batch. "
"Currently, the input padding is removed by default; "
@@ -125,7 +127,7 @@ def parse_arguments():
parser.add_argument(
'--opt_num_tokens',
type=int,
default=BuildConfig.opt_num_tokens,
default=BuildConfig.model_fields["opt_num_tokens"].default,
help=
"Optimal number of batched input tokens after padding is removed in each batch "
"It equals to ``max_batch_size * max_beam_width`` by default, set this "
@@ -134,15 +136,16 @@ def parse_arguments():
parser.add_argument(
'--max_encoder_input_len',
type=int,
default=BuildConfig.max_encoder_input_len,
default=BuildConfig.model_fields["max_encoder_input_len"].default,
help="Maximum encoder input length for enc-dec models. "
"Set ``max_input_len`` to 1 to start generation from decoder_start_token_id of length 1."
)
parser.add_argument(
'--max_prompt_embedding_table_size',
'--max_multimodal_len',
type=int,
default=BuildConfig.max_prompt_embedding_table_size,
default=BuildConfig.model_fields["max_prompt_embedding_table_size"].
default,
help=
"Maximum prompt embedding table size for prompt tuning, or maximum multimodal input size for multimodal models. "
"Setting a value > 0 enables prompt tuning or multimodal input.")
@@ -164,36 +167,38 @@ def parse_arguments():
parser.add_argument(
'--input_timing_cache',
type=str,
default=BuildConfig.input_timing_cache,
default=BuildConfig.model_fields["input_timing_cache"].default,
help=
"The file path to read the timing cache. This option is ignored if the file does not exist."
)
parser.add_argument('--output_timing_cache',
type=str,
default=BuildConfig.output_timing_cache,
help="The file path to write the timing cache.")
parser.add_argument(
'--output_timing_cache',
type=str,
default=BuildConfig.model_fields["output_timing_cache"].default,
help="The file path to write the timing cache.")
parser.add_argument(
'--profiling_verbosity',
type=str,
default=BuildConfig.profiling_verbosity,
default=BuildConfig.model_fields["profiling_verbosity"].default,
choices=['layer_names_only', 'detailed', 'none'],
help=
"The profiling verbosity for the generated TensorRT engine. Setting to detailed allows inspecting tactic choices and kernel parameters."
)
parser.add_argument(
'--strip_plan',
default=BuildConfig.use_strip_plan,
default=BuildConfig.model_fields["use_strip_plan"].default,
action='store_true',
help=
"Enable stripping weights from the final TensorRT engine under the assumption that the refit weights are identical to those provided at build time."
)
parser.add_argument('--weight_sparsity',
default=BuildConfig.weight_sparsity,
action='store_true',
help="Enable weight sparsity.")
parser.add_argument(
'--weight_sparsity',
default=BuildConfig.model_fields["weight_sparsity"].default,
action='store_true',
help="Enable weight sparsity.")
parser.add_argument(
'--weight_streaming',
default=BuildConfig.weight_streaming,
default=BuildConfig.model_fields["weight_streaming"].default,
action='store_true',
help=
"Enable offloading weights to CPU and streaming loading at runtime.",
@@ -215,10 +220,11 @@ def parse_arguments():
default='info',
choices=severity_map.keys(),
help="The logging level.")
parser.add_argument('--enable_debug_output',
default=BuildConfig.enable_debug_output,
action='store_true',
help="Enable debug output.")
parser.add_argument(
'--enable_debug_output',
default=BuildConfig.model_fields["enable_debug_output"].default,
action='store_true',
help="Enable debug output.")
parser.add_argument(
'--visualize_network',
type=str,
Expand All @@ -228,7 +234,7 @@ def parse_arguments():
)
parser.add_argument(
'--dry_run',
default=BuildConfig.dry_run,
default=BuildConfig.model_fields["dry_run"].default,
action='store_true',
help=
"Run through the build process except the actual Engine build for debugging."
@@ -567,79 +573,49 @@ def main():
f"Overriding # of builder profiles <= {force_num_profiles_from_env}."
)

build_config = BuildConfig.from_dict(
{
'max_input_len':
args.max_input_len,
'max_seq_len':
args.max_seq_len,
'max_batch_size':
args.max_batch_size,
'max_beam_width':
args.max_beam_width,
'max_num_tokens':
args.max_num_tokens,
'opt_num_tokens':
args.opt_num_tokens,
'max_prompt_embedding_table_size':
args.max_prompt_embedding_table_size,
'gather_context_logits':
args.gather_context_logits,
'gather_generation_logits':
args.gather_generation_logits,
'strongly_typed':
True,
'force_num_profiles':
force_num_profiles_from_env,
'weight_sparsity':
args.weight_sparsity,
'profiling_verbosity':
args.profiling_verbosity,
'enable_debug_output':
args.enable_debug_output,
'max_draft_len':
args.max_draft_len,
'speculative_decoding_mode':
speculative_decoding_mode,
'input_timing_cache':
args.input_timing_cache,
'output_timing_cache':
args.output_timing_cache,
'auto_parallel_config': {
'world_size':
args.auto_parallel,
'gpus_per_node':
args.gpus_per_node,
'sharded_io_allowlist': [
'past_key_value_\\d+',
'present_key_value_\\d*',
],
'same_buffer_io': {
'past_key_value_(\\d+)': 'present_key_value_\\1',
},
**cluster_config,
build_config = BuildConfig(
max_input_len=args.max_input_len,
max_seq_len=args.max_seq_len,
max_batch_size=args.max_batch_size,
max_beam_width=args.max_beam_width,
max_num_tokens=args.max_num_tokens,
opt_num_tokens=args.opt_num_tokens,
max_prompt_embedding_table_size=args.
max_prompt_embedding_table_size,
kv_cache_type=getattr(args, "kv_cache_type", None),
gather_context_logits=args.gather_context_logits,
gather_generation_logits=args.gather_generation_logits,
strongly_typed=True,
force_num_profiles=force_num_profiles_from_env,
weight_sparsity=args.weight_sparsity,
profiling_verbosity=args.profiling_verbosity,
enable_debug_output=args.enable_debug_output,
max_draft_len=args.max_draft_len,
speculative_decoding_mode=speculative_decoding_mode,
input_timing_cache=args.input_timing_cache,
output_timing_cache=args.output_timing_cache,
auto_parallel_config=AutoParallelConfig(
world_size=args.auto_parallel,
gpus_per_node=args.gpus_per_node,
sharded_io_allowlist=[
'past_key_value_\\d+',
'present_key_value_\\d*',
],
same_buffer_io={
'past_key_value_(\\d+)': 'present_key_value_\\1',
},
'dry_run':
args.dry_run,
'visualize_network':
args.visualize_network,
'max_encoder_input_len':
args.max_encoder_input_len,
'weight_streaming':
args.weight_streaming,
'monitor_memory':
args.monitor_memory,
'use_mrope':
(True if model_config.qwen_type == "qwen2_vl" else False)
if hasattr(model_config, "qwen_type") else False
},
**cluster_config,
),
dry_run=args.dry_run,
visualize_network=args.visualize_network,
max_encoder_input_len=args.max_encoder_input_len,
weight_streaming=args.weight_streaming,
monitor_memory=args.monitor_memory,
use_mrope=getattr(model_config, "qwen_type", None) == "qwen2_vl",
plugin_config=plugin_config)

if hasattr(args, 'kv_cache_type'):
build_config.update_from_dict({'kv_cache_type': args.kv_cache_type})
else:
build_config = BuildConfig.from_json_file(args.build_config,
plugin_config=plugin_config)
build_config = BuildConfig.from_json_file(args.build_config)
build_config.plugin_config = plugin_config

parallel_build(model_config, ckpt_dir, build_config, args.output_dir,
workers, args.log_level, model_cls, **kwargs)
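Note: the nested dict passed to `BuildConfig.from_dict` is replaced with direct keyword construction, including a typed `AutoParallelConfig` for the nested section. A heavily simplified sketch of that pattern (the field sets here are assumptions; only the names mirror the diff):

```python
# Heavily simplified sketch of the new construction style; the real classes
# have many more fields and live in tensorrt_llm.builder / tensorrt_llm.auto_parallel.
from pydantic import BaseModel, Field

class AutoParallelConfig(BaseModel):
    world_size: int = 1
    gpus_per_node: int = 8
    same_buffer_io: dict = Field(default_factory=dict)

class BuildConfig(BaseModel):
    max_batch_size: int = 2048
    auto_parallel_config: AutoParallelConfig = Field(
        default_factory=AutoParallelConfig)

# Nested dicts become nested, validated config objects:
cfg = BuildConfig(
    max_batch_size=64,
    auto_parallel_config=AutoParallelConfig(
        world_size=2,
        same_buffer_io={'past_key_value_(\\d+)': 'present_key_value_\\1'},
    ),
)
print(cfg.auto_parallel_config.world_size)  # 2
```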
8 changes: 4 additions & 4 deletions tensorrt_llm/commands/eval.py
@@ -50,23 +50,23 @@
help="The logging level.")
@click.option("--max_beam_width",
type=int,
default=BuildConfig.max_beam_width,
default=BuildConfig.model_fields["max_beam_width"].default,
help="Maximum number of beams for beam search decoding.")
@click.option("--max_batch_size",
type=int,
default=BuildConfig.max_batch_size,
default=BuildConfig.model_fields["max_batch_size"].default,
help="Maximum number of requests that the engine can schedule.")
@click.option(
"--max_num_tokens",
type=int,
default=BuildConfig.max_num_tokens,
default=BuildConfig.model_fields["max_num_tokens"].default,
help=
"Maximum number of batched input tokens after padding is removed in each batch."
)
@click.option(
"--max_seq_len",
type=int,
default=BuildConfig.max_seq_len,
default=BuildConfig.model_fields["max_seq_len"].default,
help="Maximum total length of one request, including prompt and outputs. "
"If unspecified, the value is deduced from the model config.")
@click.option("--tp_size", type=int, default=1, help='Tensor parallelism size.')