2 changes: 1 addition & 1 deletion examples/models/core/llama/summarize_long.py
@@ -23,7 +23,7 @@

import tensorrt_llm
import tensorrt_llm.profiler as profiler
from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.logger import logger
from tensorrt_llm.quantization import QuantMode

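Note: this one-line import move recurs across the example scripts below. A minimal compatibility sketch for scripts that need to run against releases from both before and after the move (the try/except fallback is an assumption for illustration, not something this PR adds):

```python
# Sketch only (not part of this PR): guard the import so a script runs on
# releases from before and after KVCacheType moved out of the C++ bindings.
try:
    from tensorrt_llm.llmapi.kv_cache_type import KVCacheType  # new location
except ImportError:
    from tensorrt_llm.bindings import KVCacheType  # older releases
```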
2 changes: 1 addition & 1 deletion examples/models/core/qwen2audio/run.py
@@ -27,7 +27,7 @@
import tensorrt_llm
import tensorrt_llm.profiler as profiler
from tensorrt_llm import logger
from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime import (PYTHON_BINDINGS, ModelConfig, ModelRunner,
SamplingConfig, Session, TensorInfo)
2 changes: 1 addition & 1 deletion examples/models/core/qwenvl/run.py
@@ -25,7 +25,7 @@
import tensorrt_llm
import tensorrt_llm.profiler as profiler
from tensorrt_llm import logger
from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime import (ModelConfig, SamplingConfig, Session,
TensorInfo)
3 changes: 2 additions & 1 deletion examples/models/core/whisper/run.py
@@ -33,7 +33,8 @@
import tensorrt_llm.logger as logger
from tensorrt_llm._utils import (str_dtype_to_torch, str_dtype_to_trt,
trt_dtype_to_torch)
from tensorrt_llm.bindings import GptJsonConfig, KVCacheType
from tensorrt_llm.bindings import GptJsonConfig
from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelConfig, SamplingConfig
from tensorrt_llm.runtime.session import Session, TensorInfo

6 changes: 2 additions & 4 deletions tensorrt_llm/_torch/auto_deploy/llm_args.py
@@ -9,7 +9,6 @@
from tensorrt_llm.models.modeling_utils import QuantConfig

from ...llmapi.llm_args import BaseLlmArgs, BuildConfig, KvCacheConfig, _ParallelConfig
from ...llmapi.utils import get_type_repr
from .models import ModelFactory, ModelFactoryRegistry
from .utils._config import DynamicYamlMixInForSettings

@@ -294,12 +293,11 @@ class LlmArgs(AutoDeployConfig, BaseLlmArgs, BaseSettings):

model_config = _get_config_dict()

build_config: Optional[object] = Field(
default_factory=lambda: BuildConfig(),
build_config: Optional[BuildConfig] = Field(
default_factory=BuildConfig,
description="!!! DO NOT USE !!! Internal only; needed for BaseLlmArgs compatibility.",
exclude_from_json=True,
frozen=True,
json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"},
repr=False,
)
backend: Literal["_autodeploy"] = Field(
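Note: once the field is annotated as `Optional[BuildConfig]`, Pydantic can derive the JSON schema from the type itself, which is likely why the `get_type_repr` import and the manual `json_schema_extra` hint are dropped. A minimal sketch under assumed, simplified classes (not the real `LlmArgs`):

```python
# Minimal sketch, assuming Pydantic v2; BuildConfig here is a stand-in with an
# illustrative default, not the real tensorrt_llm.builder.BuildConfig.
from typing import Optional
from pydantic import BaseModel, Field

class BuildConfig(BaseModel):
    max_batch_size: int = 2048  # illustrative only

class ArgsSketch(BaseModel):
    build_config: Optional[BuildConfig] = Field(
        default_factory=BuildConfig,  # typed default; no lambda wrapper needed
        frozen=True,
        repr=False,
    )

print(ArgsSketch().build_config.max_batch_size)  # 2048
```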
4 changes: 2 additions & 2 deletions tensorrt_llm/bench/build/build.py
@@ -22,8 +22,8 @@
QuantAlgo.NVFP4, QuantAlgo.FP8, QuantAlgo.FP8_BLOCK_SCALES,
QuantAlgo.NO_QUANT, None
}
DEFAULT_MAX_BATCH_SIZE = BuildConfig.max_batch_size
DEFAULT_MAX_NUM_TOKENS = BuildConfig.max_num_tokens
DEFAULT_MAX_BATCH_SIZE = BuildConfig.model_fields["max_batch_size"].default
DEFAULT_MAX_NUM_TOKENS = BuildConfig.model_fields["max_num_tokens"].default


def get_benchmark_engine_settings(
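Note: this lookup change suggests `BuildConfig` is now a Pydantic model, where class-level attribute access no longer yields the plain default value; defaults are read from `model_fields` instead. A sketch with assumed, illustrative fields:

```python
# Sketch (assumes BuildConfig is now a Pydantic v2 model; values illustrative).
from pydantic import BaseModel

class BuildConfig(BaseModel):
    max_batch_size: int = 2048
    max_num_tokens: int = 8192

# Class-level access such as BuildConfig.max_batch_size no longer returns the
# default, so read it from the FieldInfo entries in model_fields:
DEFAULT_MAX_BATCH_SIZE = BuildConfig.model_fields["max_batch_size"].default
DEFAULT_MAX_NUM_TOKENS = BuildConfig.model_fields["max_num_tokens"].default
print(DEFAULT_MAX_BATCH_SIZE, DEFAULT_MAX_NUM_TOKENS)  # 2048 8192
```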
337 changes: 121 additions & 216 deletions tensorrt_llm/builder.py

Large diffs are not rendered by default.

174 changes: 75 additions & 99 deletions tensorrt_llm/commands/build.py
@@ -28,8 +28,9 @@
mpi_comm, mpi_rank, mpi_world_size)
from tensorrt_llm.auto_parallel import infer_cluster_config
from tensorrt_llm.auto_parallel.cluster_info import cluster_infos
from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.auto_parallel.config import AutoParallelConfig
from tensorrt_llm.builder import BuildConfig, Engine, build
from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.logger import logger, severity_map
from tensorrt_llm.lora_helper import LoraConfig
from tensorrt_llm.lora_manager import LoraManager
@@ -94,29 +95,30 @@ def parse_arguments():
parser.add_argument(
'--max_batch_size',
type=int,
default=BuildConfig.max_batch_size,
default=BuildConfig.model_fields["max_batch_size"].default,
help="Maximum number of requests that the engine can schedule.")
parser.add_argument('--max_input_len',
type=int,
default=BuildConfig.max_input_len,
help="Maximum input length of one request.")
parser.add_argument(
'--max_input_len',
type=int,
default=BuildConfig.model_fields["max_input_len"].default,
help="Maximum input length of one request.")
parser.add_argument(
'--max_seq_len',
'--max_decoder_seq_len',
dest='max_seq_len',
type=int,
default=BuildConfig.max_seq_len,
default=BuildConfig.model_fields["max_seq_len"].default,
help="Maximum total length of one request, including prompt and outputs. "
"If unspecified, the value is deduced from the model config.")
parser.add_argument(
'--max_beam_width',
type=int,
default=BuildConfig.max_beam_width,
default=BuildConfig.model_fields["max_beam_width"].default,
help="Maximum number of beams for beam search decoding.")
parser.add_argument(
'--max_num_tokens',
type=int,
default=BuildConfig.max_num_tokens,
default=BuildConfig.model_fields["max_num_tokens"].default,
help=
"Maximum number of batched input tokens after padding is removed in each batch. "
"Currently, the input padding is removed by default; "
@@ -125,7 +127,7 @@ def parse_arguments():
parser.add_argument(
'--opt_num_tokens',
type=int,
default=BuildConfig.opt_num_tokens,
default=BuildConfig.model_fields["opt_num_tokens"].default,
help=
"Optimal number of batched input tokens after padding is removed in each batch "
"It equals to ``max_batch_size * max_beam_width`` by default, set this "
@@ -134,15 +136,16 @@ def parse_arguments():
parser.add_argument(
'--max_encoder_input_len',
type=int,
default=BuildConfig.max_encoder_input_len,
default=BuildConfig.model_fields["max_encoder_input_len"].default,
help="Maximum encoder input length for enc-dec models. "
"Set ``max_input_len`` to 1 to start generation from decoder_start_token_id of length 1."
)
parser.add_argument(
'--max_prompt_embedding_table_size',
'--max_multimodal_len',
type=int,
default=BuildConfig.max_prompt_embedding_table_size,
default=BuildConfig.model_fields["max_prompt_embedding_table_size"].
default,
help=
"Maximum prompt embedding table size for prompt tuning, or maximum multimodal input size for multimodal models. "
"Setting a value > 0 enables prompt tuning or multimodal input.")
@@ -164,36 +167,38 @@ def parse_arguments():
parser.add_argument(
'--input_timing_cache',
type=str,
default=BuildConfig.input_timing_cache,
default=BuildConfig.model_fields["input_timing_cache"].default,
help=
"The file path to read the timing cache. This option is ignored if the file does not exist."
)
parser.add_argument('--output_timing_cache',
type=str,
default=BuildConfig.output_timing_cache,
help="The file path to write the timing cache.")
parser.add_argument(
'--output_timing_cache',
type=str,
default=BuildConfig.model_fields["output_timing_cache"].default,
help="The file path to write the timing cache.")
parser.add_argument(
'--profiling_verbosity',
type=str,
default=BuildConfig.profiling_verbosity,
default=BuildConfig.model_fields["profiling_verbosity"].default,
choices=['layer_names_only', 'detailed', 'none'],
help=
"The profiling verbosity for the generated TensorRT engine. Setting to detailed allows inspecting tactic choices and kernel parameters."
)
parser.add_argument(
'--strip_plan',
default=BuildConfig.use_strip_plan,
default=BuildConfig.model_fields["use_strip_plan"].default,
action='store_true',
help=
"Enable stripping weights from the final TensorRT engine under the assumption that the refit weights are identical to those provided at build time."
)
parser.add_argument('--weight_sparsity',
default=BuildConfig.weight_sparsity,
action='store_true',
help="Enable weight sparsity.")
parser.add_argument(
'--weight_sparsity',
default=BuildConfig.model_fields["weight_sparsity"].default,
action='store_true',
help="Enable weight sparsity.")
parser.add_argument(
'--weight_streaming',
default=BuildConfig.weight_streaming,
default=BuildConfig.model_fields["weight_streaming"].default,
action='store_true',
help=
"Enable offloading weights to CPU and streaming loading at runtime.",
@@ -215,10 +220,11 @@ def parse_arguments():
default='info',
choices=severity_map.keys(),
help="The logging level.")
parser.add_argument('--enable_debug_output',
default=BuildConfig.enable_debug_output,
action='store_true',
help="Enable debug output.")
parser.add_argument(
'--enable_debug_output',
default=BuildConfig.model_fields["enable_debug_output"].default,
action='store_true',
help="Enable debug output.")
parser.add_argument(
'--visualize_network',
type=str,
Expand All @@ -228,7 +234,7 @@ def parse_arguments():
)
parser.add_argument(
'--dry_run',
default=BuildConfig.dry_run,
default=BuildConfig.model_fields["dry_run"].default,
action='store_true',
help=
"Run through the build process except the actual Engine build for debugging."
@@ -567,79 +573,49 @@ def main():
f"Overriding # of builder profiles <= {force_num_profiles_from_env}."
)

build_config = BuildConfig.from_dict(
{
'max_input_len':
args.max_input_len,
'max_seq_len':
args.max_seq_len,
'max_batch_size':
args.max_batch_size,
'max_beam_width':
args.max_beam_width,
'max_num_tokens':
args.max_num_tokens,
'opt_num_tokens':
args.opt_num_tokens,
'max_prompt_embedding_table_size':
args.max_prompt_embedding_table_size,
'gather_context_logits':
args.gather_context_logits,
'gather_generation_logits':
args.gather_generation_logits,
'strongly_typed':
True,
'force_num_profiles':
force_num_profiles_from_env,
'weight_sparsity':
args.weight_sparsity,
'profiling_verbosity':
args.profiling_verbosity,
'enable_debug_output':
args.enable_debug_output,
'max_draft_len':
args.max_draft_len,
'speculative_decoding_mode':
speculative_decoding_mode,
'input_timing_cache':
args.input_timing_cache,
'output_timing_cache':
args.output_timing_cache,
'auto_parallel_config': {
'world_size':
args.auto_parallel,
'gpus_per_node':
args.gpus_per_node,
'sharded_io_allowlist': [
'past_key_value_\\d+',
'present_key_value_\\d*',
],
'same_buffer_io': {
'past_key_value_(\\d+)': 'present_key_value_\\1',
},
**cluster_config,
build_config = BuildConfig(
max_input_len=args.max_input_len,
max_seq_len=args.max_seq_len,
max_batch_size=args.max_batch_size,
max_beam_width=args.max_beam_width,
max_num_tokens=args.max_num_tokens,
opt_num_tokens=args.opt_num_tokens,
max_prompt_embedding_table_size=args.
max_prompt_embedding_table_size,
kv_cache_type=getattr(args, "kv_cache_type", None),
gather_context_logits=args.gather_context_logits,
gather_generation_logits=args.gather_generation_logits,
strongly_typed=True,
force_num_profiles=force_num_profiles_from_env,
weight_sparsity=args.weight_sparsity,
profiling_verbosity=args.profiling_verbosity,
enable_debug_output=args.enable_debug_output,
max_draft_len=args.max_draft_len,
speculative_decoding_mode=speculative_decoding_mode,
input_timing_cache=args.input_timing_cache,
output_timing_cache=args.output_timing_cache,
auto_parallel_config=AutoParallelConfig(
world_size=args.auto_parallel,
gpus_per_node=args.gpus_per_node,
sharded_io_allowlist=[
'past_key_value_\\d+',
'present_key_value_\\d*',
],
same_buffer_io={
'past_key_value_(\\d+)': 'present_key_value_\\1',
},
'dry_run':
args.dry_run,
'visualize_network':
args.visualize_network,
'max_encoder_input_len':
args.max_encoder_input_len,
'weight_streaming':
args.weight_streaming,
'monitor_memory':
args.monitor_memory,
'use_mrope':
(True if model_config.qwen_type == "qwen2_vl" else False)
if hasattr(model_config, "qwen_type") else False
},
**cluster_config,
),
dry_run=args.dry_run,
visualize_network=args.visualize_network,
max_encoder_input_len=args.max_encoder_input_len,
weight_streaming=args.weight_streaming,
monitor_memory=args.monitor_memory,
use_mrope=getattr(model_config, "qwen_type", None) == "qwen2_vl",
plugin_config=plugin_config)

if hasattr(args, 'kv_cache_type'):
build_config.update_from_dict({'kv_cache_type': args.kv_cache_type})
else:
build_config = BuildConfig.from_json_file(args.build_config,
plugin_config=plugin_config)
build_config = BuildConfig.from_json_file(args.build_config)
build_config.plugin_config = plugin_config

parallel_build(model_config, ckpt_dir, build_config, args.output_dir,
workers, args.log_level, model_cls, **kwargs)
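Note: the nested dict passed to `BuildConfig.from_dict` is replaced with direct keyword construction, including a typed `AutoParallelConfig` for the nested section. A heavily simplified sketch of that pattern (the field sets here are assumptions; only the names mirror the diff):

```python
# Heavily simplified sketch of the new construction style; the real classes
# have many more fields and live in tensorrt_llm.builder / tensorrt_llm.auto_parallel.
from pydantic import BaseModel, Field

class AutoParallelConfig(BaseModel):
    world_size: int = 1
    gpus_per_node: int = 8
    same_buffer_io: dict = Field(default_factory=dict)

class BuildConfig(BaseModel):
    max_batch_size: int = 2048
    auto_parallel_config: AutoParallelConfig = Field(
        default_factory=AutoParallelConfig)

# Nested dicts become nested, validated config objects:
cfg = BuildConfig(
    max_batch_size=64,
    auto_parallel_config=AutoParallelConfig(
        world_size=2,
        same_buffer_io={'past_key_value_(\\d+)': 'present_key_value_\\1'},
    ),
)
print(cfg.auto_parallel_config.world_size)  # 2
```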
8 changes: 4 additions & 4 deletions tensorrt_llm/commands/eval.py
@@ -50,23 +50,23 @@
help="The logging level.")
@click.option("--max_beam_width",
type=int,
default=BuildConfig.max_beam_width,
default=BuildConfig.model_fields["max_beam_width"].default,
help="Maximum number of beams for beam search decoding.")
@click.option("--max_batch_size",
type=int,
default=BuildConfig.max_batch_size,
default=BuildConfig.model_fields["max_batch_size"].default,
help="Maximum number of requests that the engine can schedule.")
@click.option(
"--max_num_tokens",
type=int,
default=BuildConfig.max_num_tokens,
default=BuildConfig.model_fields["max_num_tokens"].default,
help=
"Maximum number of batched input tokens after padding is removed in each batch."
)
@click.option(
"--max_seq_len",
type=int,
default=BuildConfig.max_seq_len,
default=BuildConfig.model_fields["max_seq_len"].default,
help="Maximum total length of one request, including prompt and outputs. "
"If unspecified, the value is deduced from the model config.")
@click.option("--tp_size", type=int, default=1, help='Tensor parallelism size.')