Skip to content

Commit e27f0e0

Browse files
authored
Fix a concurrency bug when the stub is starting (triton-inference-server#51)
* Increase the default timeout
* Lock the mutex before creating the child process
1 parent b2023cb commit e27f0e0

File tree

1 file changed

+5
-5
lines changed

1 file changed

+5
-5
lines changed

src/python.cc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -799,6 +799,7 @@ ModelInstanceState::SetupChildProcess()
799799
}
800800

801801
parent_pid_ = getpid();
802+
pthread_mutex_lock(parent_mutex_);
802803

803804
pid_t pid = fork();
804805

@@ -838,7 +839,6 @@ ModelInstanceState::SetupChildProcess()
838839
}
839840

840841
} else {
841-
pthread_mutex_lock(parent_mutex_);
842842
int64_t stub_timeout_seconds =
843843
model_state->StateForBackend()->stub_timeout_seconds;
844844

@@ -851,7 +851,9 @@ ModelInstanceState::SetupChildProcess()
851851
if (pthread_cond_timedwait(parent_cond_, parent_mutex_, &ts) != 0) {
852852
return TRITONSERVER_ErrorNew(
853853
TRITONSERVER_ERROR_INTERNAL,
854-
(std::string("Failed to initialize model instance ") + Name())
854+
(std::string("Timed out occured while waiting for the stub process. "
855+
"Failed to initialize model instance ") +
856+
Name())
855857
.c_str());
856858
}
857859

@@ -1063,7 +1065,7 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)
10631065
triton::common::TritonJson::Value cmdline;
10641066
backend_state->shm_default_byte_size = 64 * 1024 * 1024; // 64 MBs
10651067
backend_state->shm_growth_byte_size = 64 * 1024 * 1024; // 64 MBs
1066-
backend_state->stub_timeout_seconds = 10;
1068+
backend_state->stub_timeout_seconds = 30;
10671069

10681070
if (backend_config.Find("cmdline", &cmdline)) {
10691071
triton::common::TritonJson::Value shm_growth_size;
@@ -1257,8 +1259,6 @@ TRITONBACKEND_ModelInstanceExecute(
12571259
ModelInstanceState* instance_state;
12581260
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(
12591261
instance, reinterpret_cast<void**>(&instance_state)));
1260-
1261-
12621262
RETURN_IF_ERROR(instance_state->ProcessRequests(requests, request_count));
12631263

12641264
for (uint32_t r = 0; r < request_count; ++r) {

0 commit comments

Comments
 (0)