triton-inference-server · pskiran1 · Oct 17, 2025 · Oct 17, 2025 · Oct 24, 2025 · Oct 27, 2025
diff --git a/protobuf/model_config.proto b/protobuf/model_config.proto
@@ -1,4 +1,4 @@
-// Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -1659,6 +1659,21 @@ message ModelEnsembling
   //@@     The models and the input / output mappings used within the ensemble.
   //@@
   repeated Step step = 1;
+
+  //@@  .. cpp:var:: uint32 max_inflight_requests
+  //@@
+  //@@     The maximum number of concurrent inflight requests allowed at each
+  //@@     ensemble step per inference request. This limit prevents unbounded
+  //@@     memory growth when ensemble steps produce responses faster than
+  //@@     downstream steps can consume them, e.g. with decoupled models.
+  //@@     Default value is 0, which indicates that no limit is enforced.
+  //@@
+  //@@     Note: Applying this limit may block upstream steps while they wait
+  //@@     for downstream capacity. This blocking does not cancel or internally
+  //@@     time out intermediate requests, but clients may experience increased
+  //@@     end-to-end latency.
+  //@@
+  uint32 max_inflight_requests = 2;
 }
 
 //@@