From c192e889723fcb8289b18997d0e663ced4c52db0 Mon Sep 17 00:00:00 2001 From: jbkyang-nvi <80359429+jbkyang-nvi@users.noreply.github.com> Date: Tue, 4 Jan 2022 15:21:43 -0800 Subject: [PATCH 001/216] Enable batching on inferentia and add tests (#107) This PR enables batching for inferentia models, added tests for batching, and fixed inferentia + triton tensorflow bug where first dimension of inputs need to be multiples of num_threads_per_core * core_per_instance. Also added extra error reporting in python backend stub Testing PR: triton-inference-server/server#3714 --- inferentia/README.md | 70 ++++-- .../qa/setup_test_enviroment_and_test.sh | 1 - inferentia/scripts/gen_triton_model.py | 234 ++++++++++++++---- src/pb_stub.cc | 8 +- 4 files changed, 248 insertions(+), 65 deletions(-) diff --git a/inferentia/README.md b/inferentia/README.md index 3112f2e0..f6b12d85 100644 --- a/inferentia/README.md +++ b/inferentia/README.md @@ -41,6 +41,7 @@ and the [Neuron Runtime](https://awsdocs-neuron.readthedocs-hosted.com/en/latest - [PyTorch](#pytorch) - [TensorFlow](#tensorflow) - [Serving Inferentia model in Triton](#serving-inferentia-model-in-triton) + - [Using Triton's Dynamic Batching](#using-tritons-dynamic-batching) - [Testing Inferentia Setup for Accuracy](#testing-inferentia-setup-for-accuracy) ## Inferentia setup @@ -48,13 +49,13 @@ and the [Neuron Runtime](https://awsdocs-neuron.readthedocs-hosted.com/en/latest First step of running Triton with Inferentia is to create an AWS Inferentia instance with Deep Learning AMI (tested with Ubuntu 18.04). `ssh -i .pem ubuntu@` -Note: It is recommended to set your storage space to greater than default value +Note: It is recommended to set your storage space to greater than default value of 110 GiB. The current version of Triton has been tested with storage of 500 GiB. After logging into the inf1* instance, you will need to clone -[this current Github repo](https://github.com/triton-inference-server/python_backend). - Follow [steps on Github to set up ssh access](https://docs.github.com/en/authentication/connecting-to-github-with-ssh) +[this current Github repo](https://github.com/triton-inference-server/python_backend). + Follow [steps on Github to set up ssh access](https://docs.github.com/en/authentication/connecting-to-github-with-ssh) or simply clone with https. Clone this repo with Github to home repo `/home/ubuntu`. @@ -87,7 +88,7 @@ After starting the Triton container, go into the `python_backend` folder and run This script will: 1. Setup miniconda enviroment 2. Install necessary dependencies -3. Create a [Custom Python Execution Environment](https://github.com/triton-inference-server/python_backend#using-custom-python-execution-environments), +3. Create a [Custom Python Execution Environment](https://github.com/triton-inference-server/python_backend#using-custom-python-execution-environments), `python_backend_stub` to use for Inferentia 4. Install [neuron-cc](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-cc/index.html), the Neuron compiler and [neuron-rtd](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-runtime/overview.html) the Neuron Runtime @@ -106,15 +107,16 @@ Currently, we only support [PyTorch](https://awsdocs-neuron.readthedocs-hosted.c and [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/tensorflow-neuron/index.html) workflows for execution on inferentia. 
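For orientation, the PyTorch flow amounts to tracing the model with the Neuron compiler ahead of time and saving the resulting TorchScript. A minimal sketch is shown below; the toy `AddSub` module, shapes, and output path are placeholders, and the linked Neuron documentation that follows is the authoritative reference for the tracing API:

```python
import torch
import torch.neuron


class AddSub(torch.nn.Module):
    """Toy module standing in for the user's real network."""

    def forward(self, x, y):
        return x + y, x - y


# The example inputs fix the shapes that the Neuron compiler uses to build
# the underlying .NEFF graph.
example = (torch.zeros([1, 4]), torch.zeros([1, 4]))
model_neuron = torch.neuron.trace(AddSub().eval(), example_inputs=example)

# Save the compiled TorchScript as the *.pt that the generated Triton
# model will load on Inferentia.
model_neuron.save("add_sub.pt")
```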
-The user is required to create their own `*.pt` (for pytorch) or `*.savedmodels` (for tensorflow) models. This is -a critical step since Inferentia will need the underlying `.NEFF` graph to execute -the inference request. Please refer to: +The user is required to create their own `*.pt` (for pytorch) or `*.savedmodels` +(for tensorflow) models. This is a critical step since Inferentia will need +the underlying `.NEFF` graph to execute the inference request. Please refer to: + - [Neuron compiler CLI Reference Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-cc/command-line-reference.html) - [PyTorch-Neuron trace python API](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/api-compilation-python-api.html) -- [PyTorch Tutorials](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/tutorials/index.html) +- [PyTorch Tutorials](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/tutorials/index.html) - [TensorFlow Tutorials](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/tensorflow-neuron/tutorials/index.html) - for guidance on how to compile models. + ### PyTorch For PyTorch, we support models traced by [PyTorch-Neuron trace python API](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/api-compilation-python-api.html) @@ -157,8 +159,9 @@ the number of neuron cores to be a proper multiple of the instance count. ### TensorFlow + For TensorFlow, the model must be compiled for AWS Neuron. See -[AWS Neuron TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/tensorflow-neuron/tutorials/index.html +[AWS Neuron TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/tensorflow-neuron/tutorials/index.html) tutorials to learn how to get a compiled model that uses Neuron cores. Currently, the code is tested only on `tensorflow==1.15`. @@ -183,12 +186,18 @@ module installed in order to use this script for tensorflow models. Similar to PyTorch, `--neuron_core_range` and `--triton_model_instance_count` can be used to specify the neuron core range and number of triton model instances. However, the neuron core indices don't point to a specific -neuron core in the chip. For TensorFlow, we use deprecated feature of +neuron core in the chip. For TensorFlow, we use deprecated feature of `NEURONCORE_GROUP_SIZES` to load model. The model in this case will be loaded on next available Neuron cores and not specific ones. See [Parallel Execution using NEURONCORE_GROUP_SIZES](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/appnotes/perf/parallel-ncgs.html?highlight=NEURONCORE_GROUP_SIZES) for more information. +Another note, since Neuron-Tensorflow(unlike Neuron-Python) does not have +built-in functions for running a model for multiple cores, `model.py` will +distribute the workload by splitting the input tensor across available cores. +It is recommended the first dimension for the inputs be `None` if the user enables +processing across multiple cores. + Please use the `-h` or `--help` options in `gen_triton_model.py` to learn about more configurable options. @@ -219,29 +228,52 @@ Now, the server can be launched with the model as below: $tritonserver --model-repository ``` -Note: +Note: + 1. 
The `config.pbtxt` and `model.py` should be treated as starting point. The users can customize these files as per their need. -2. Triton Inferentia is currently tested with a **single** model. +2. Triton Inferentia is currently tested with a **single** model. + +### Using Triton's Dynamic Batching + +To enable dynamic batching, `--enable_dynamic_batching` +flag needs to be specified. `gen_triton_model.py` supports following three +options for configuring [Triton's dynamic batching](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md): + +1. `--preferred_batch_size`: Please refer to [model configuration documentation](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#preferred-batch-sizes) for details on preferred batch size. To optimize + performance, this is recommended to be multiples of engaged neuron cores. + For example, if each instance is using 2 neuron cores, `preferred_batch_size` + could be 2, 4 or 6. +2. `--max_queue_delay_microseconds`: Please refer to + [model configuration documentation](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#delayed-batching) for details. +3. `--disable_batch_requests_to_neuron`: Enable the non-default way for Triton to + handle batched requests. Triton backend will send each request to neuron + separately, irrespective of if the Triton server requests are batched. + This flag is recommended when users want to optimize performance with models + that do not perform well with batching without the flag. + +Additionally, `--max_batch_size` will affect the maximum batching limit. Please +refer to the [model configuration documentation](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#maximum-batch-size) +for details. ## Testing Inferentia Setup for Accuracy + The [qa folder](https://github.com/triton-inference-server/python_backend/tree/main/inferentia/qa) contains the necessary files to set up testing with a simple add_sub model. The test requires an instance with more than 8 inferentia cores to run, eg:`inf1.6xlarge`. -start the test, run +start the test, run ``` $source /python_backend/inferentia/qa/setup_test_enviroment_and_test.sh ``` where `` is usually `/home/ubuntu`/. This script will pull the [server repo](https://github.com/triton-inference-server/server) -that contains the tests for inferentia. It will then build the most recent -Triton Server and Triton SDK. +that contains the tests for inferentia. It will then build the most recent +Triton Server and Triton SDK. Note: If you would need to change some of the tests in the server repo, -you would need to run +you would need to run ``` $export TRITON_SERVER_REPO_TAG= ``` -before running the script. - +before running the script. 
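To make the dynamic batching options above concrete: when `--enable_dynamic_batching` is passed to `gen_triton_model.py` together with `--preferred_batch_size` and `--max_queue_delay_microseconds`, the generated `config.pbtxt` gains a stanza along the lines of the sketch below (the numeric values are illustrative placeholders, not tuning recommendations):

```
max_batch_size: 8
dynamic_batching {
  preferred_batch_size: 4
  max_queue_delay_microseconds: 100
}
```

As noted above, choosing `preferred_batch_size` as a multiple of the neuron cores engaged by each instance tends to keep the cores evenly utilized.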
diff --git a/inferentia/qa/setup_test_enviroment_and_test.sh b/inferentia/qa/setup_test_enviroment_and_test.sh index 89b5b41a..c396f016 100644 --- a/inferentia/qa/setup_test_enviroment_and_test.sh +++ b/inferentia/qa/setup_test_enviroment_and_test.sh @@ -45,7 +45,6 @@ export TEST_REPO=/opt/tritonserver/qa/L0_inferentia_perf_analyzer export TEST_SCRIPT="test.sh" CONTAINER_NAME="qa_container" - cd ${TRITON_PATH} echo "Using server repo tag: $TRITON_SERVER_REPO_TAG" # Clone necessary branches diff --git a/inferentia/scripts/gen_triton_model.py b/inferentia/scripts/gen_triton_model.py index 40a21501..dc44ac79 100644 --- a/inferentia/scripts/gen_triton_model.py +++ b/inferentia/scripts/gen_triton_model.py @@ -84,7 +84,6 @@ def parse_tf_tensors(saved_model_dir, tag_set, signature_def_key): for dim in output_signature.tensor_shape.dim: shape.append(dim.size) output_dict[output_signature.name] = [datatype, shape] - return input_dict, output_dict @@ -107,10 +106,26 @@ def get_parameter_spec(key1, value): def create_modelconfig(model_name, max_batch_size, inputs, outputs, compiled_model_path, nc_start_idx, nc_end_idx, - threads_per_core, instance_count): + threads_per_core, instance_count, + enable_dynamic_batching, preferred_batch_size, + max_queue_delay_microseconds): config = "name: \"{}\"\n".format(model_name) config += "backend: \"python\"\n" config += "max_batch_size: {}\n".format(max_batch_size) + if enable_dynamic_batching: + config += ''' +dynamic_batching { +''' + if preferred_batch_size is not None: + config += ''' + preferred_batch_size: {} +'''.format(preferred_batch_size) + if max_queue_delay_microseconds is not None: + config += ''' + max_queue_delay_microseconds: {} +'''.format(max_queue_delay_microseconds) + config += ''' +}\n''' for input_name in inputs.keys(): data_type, shape = inputs[input_name] config += ''' @@ -336,7 +351,7 @@ def _validate_output_dict(self, expected_count): return init_impl -def get_tensorflow_execute_impl(): +def get_tensorflow_execute_impl(disable_batch_requests_to_neuron): exec_impl = ''' def _one_thread(self, pred, model_feed_dict): result = pred(model_feed_dict) @@ -363,9 +378,10 @@ def execute(self, requests): A list of pb_utils.InferenceResponse. The length of this list must be the same as `requests` """ - +''' + if disable_batch_requests_to_neuron: + exec_impl += ''' responses = [] - num_threads = len(self.pred_list) model_feed_dict_list = [{} for _ in range(num_threads)] for request in requests: @@ -375,29 +391,22 @@ def execute(self, requests): tensor = pb_utils.get_input_tensor_by_name(request, name).as_numpy() split_tensor = [None] * num_threads - # TODO: This will force split the first dimension of the input - # into however num_threads, and will report error if the dimension - # is not divisible by the amount of neuron cores. 
Fix this behavior for split_index in range(num_threads): - model_feed_dict_list[split_index][name] = np.split( + model_feed_dict_list[split_index][name] = np.array_split( tensor, num_threads, axis=0)[split_index] - executor = futures.ThreadPoolExecutor(max_workers=num_threads) running = { executor.submit(self._one_thread, self.pred_list[idx], model_feed_dict_list[idx]): idx for idx in range(num_threads) } - results = [None] * num_threads for future in futures.as_completed(running): idx = running[future] results[idx] = future.result() - output_tensors = [] for i in range(len(self.output_list)): name, dt, shape = self.output_list[i] - out_list = [None] * num_threads for idx in range(num_threads): out_list[idx] = results[idx][name] @@ -405,23 +414,81 @@ def execute(self, requests): for idx in range(num_threads - 1): full_tensor = np.concatenate( (full_tensor, out_list[idx + 1]), axis=0) - output_tensor = pb_utils.Tensor( name, full_tensor.astype(pb_utils.triton_string_to_numpy(dt))) - output_tensors.append(output_tensor) - inference_response = pb_utils.InferenceResponse( output_tensors=output_tensors) responses.append(inference_response) + return responses +''' + else: + exec_impl += ''' + responses = [] + num_threads = len(self.pred_list) + model_feed_dict_list = [{} for _ in range(num_threads)] + num_requests = len(requests) + request_batch_sizes = [] + inputs = [] + for i in range(len(self.input_list)): + name, dt, shape = self.input_list[i] + first_tensor = pb_utils.get_input_tensor_by_name(requests[0], name).as_numpy() + request_batch_sizes.append(np.size(first_tensor, axis=0)) + batched_tensor = first_tensor + for j in range(1, num_requests): + tensor = pb_utils.get_input_tensor_by_name(requests[j], + name).as_numpy() + request_batch_sizes.append(request_batch_sizes[-1] + np.size(tensor, axis=0)) + batched_tensor = np.concatenate((batched_tensor, tensor), axis=0) + split_tensor = [None] * num_threads + for split_index in range(num_threads): + model_feed_dict_list[split_index][name] = np.array_split( + batched_tensor, num_threads, axis=0)[split_index] + + executor = futures.ThreadPoolExecutor(max_workers=num_threads) + running = { + executor.submit(self._one_thread, self.pred_list[idx], + model_feed_dict_list[idx]): idx + for idx in range(num_threads) + } + + results = [None] * num_threads + for future in futures.as_completed(running): + idx = running[future] + results[idx] = future.result() + + chuncky_tensors = [] + for i in range(len(self.output_list)): + name, dt, shape = self.output_list[i] + out_list = [None] * num_threads + for idx in range(num_threads): + out_list[idx] = results[idx][name] + full_tensor = out_list[0] + for idx in range(num_threads - 1): + full_tensor = np.concatenate( + (full_tensor, out_list[idx + 1]), axis=0) + chuncky_tensors.append(np.split(full_tensor, request_batch_sizes, axis=0)) + + for i in range(num_requests): + output_tensors = [] + for j in range(len(self.output_list)): + name, dt, shape = self.output_list[j] + tensor = chuncky_tensors[j][i] + output_tensor = pb_utils.Tensor( + name, + tensor.astype(pb_utils.triton_string_to_numpy(dt))) + output_tensors.append(output_tensor) + + inference_response = pb_utils.InferenceResponse(output_tensors=output_tensors) + responses.append(inference_response) return responses ''' return exec_impl -def get_pytorch_execute_impl(): +def get_pytorch_execute_impl(disable_batch_requests_to_neuron): exec_impl = ''' def execute(self, requests): """`execute` MUST be implemented in every Python model. 
`execute` @@ -444,19 +511,18 @@ def execute(self, requests): A list of pb_utils.InferenceResponse. The length of this list must be the same as `requests` """ - +''' + if disable_batch_requests_to_neuron: + exec_impl += ''' responses = [] - for request in requests: inputs = [] for i in range(len(self.input_dict)): name, dt, shape = self.input_dict[i] - tensor = pb_utils.get_input_tensor_by_name(request, - name).as_numpy() - inputs.append(torch.as_tensor(tensor)) - + tensor = torch.as_tensor(pb_utils.get_input_tensor_by_name(request, + name).as_numpy()) + inputs.append(tensor) results = self.model_neuron(*inputs) - output_tensors = [] for i in self.output_dict.keys(): name, dt, shape = self.output_dict[i] @@ -464,9 +530,45 @@ def execute(self, requests): output_tensor = pb_utils.Tensor( name, result.numpy().astype( pb_utils.triton_string_to_numpy(dt))) - output_tensors.append(output_tensor) - + inference_response = pb_utils.InferenceResponse( + output_tensors=output_tensors) + responses.append(inference_response) + return responses +''' + else: + exec_impl += ''' + responses = [] + inputs = [] + num_requests = len(requests) + request_batch_sizes = [] + for i in self.input_dict.keys(): + name, dt, shape = self.input_dict[i] + first_tensor = torch.as_tensor(pb_utils.get_input_tensor_by_name(requests[0], + name).as_numpy()) + request_batch_sizes.append(first_tensor.size(dim=0)) + batched_tensor = first_tensor + for j in range(1, num_requests): + tensor = torch.as_tensor(pb_utils.get_input_tensor_by_name(requests[j], + name).as_numpy()) + request_batch_sizes.append(request_batch_sizes[-1] + tensor.size(dim=0)) + batched_tensor = torch.cat((batched_tensor, tensor), dim=0) + inputs.append(batched_tensor) + + batched_results = self.model_neuron(*inputs) + chunky_batched_results = [] + for i in self.output_dict.keys(): + batch = batched_results[i] if isinstance(batched_results, tuple) else batched_results + chunky_batched_results.append(torch.tensor_split(batch, request_batch_sizes, dim=0)) + for i in range(num_requests): + output_tensors = [] + for j in self.output_dict.keys(): + name, dt, shape = self.output_dict[j] + result = chunky_batched_results[j][i] + output_tensor = pb_utils.Tensor( + name, result.numpy().astype( + pb_utils.triton_string_to_numpy(dt))) + output_tensors.append(output_tensor) inference_response = pb_utils.InferenceResponse( output_tensors=output_tensors) responses.append(inference_response) @@ -489,7 +591,8 @@ def finalize(self): return finalize_impl -def get_triton_python_model_impl(using_tensorflow_model): +def get_triton_python_model_impl(using_tensorflow_model, + disable_batch_requests_to_neuron): triton_pmi = ''' class TritonPythonModel: """Your Python model must use the same class name. 
Every Python model @@ -499,17 +602,18 @@ class TritonPythonModel: if using_tensorflow_model: triton_pmi += get_tensorflow_initialize_impl() - triton_pmi += get_tensorflow_execute_impl() + triton_pmi += get_tensorflow_execute_impl( + disable_batch_requests_to_neuron) else: triton_pmi += get_pytorch_initialize_impl() - triton_pmi += get_pytorch_execute_impl() + triton_pmi += get_pytorch_execute_impl(disable_batch_requests_to_neuron) triton_pmi += get_finalize_impl() return triton_pmi -def create_model_file(using_tensorflow_model): +def create_model_file(using_tensorflow_model, disable_batch_requests_to_neuron): triton_model = get_model_license() triton_model += ''' import json @@ -529,27 +633,61 @@ def create_model_file(using_tensorflow_model): import torch import torch.neuron ''' - triton_model += get_triton_python_model_impl(using_tensorflow_model) + triton_model += get_triton_python_model_impl( + using_tensorflow_model, disable_batch_requests_to_neuron) return triton_model if __name__ == '__main__': parser = argparse.ArgumentParser() + parser.add_argument('--model_type', + type=str, + required=True, + choices=['pytorch', 'tensorflow'], + help='''The type of the compiled model. Currently, + only supports \"pytorch\" and \"tensorflow\".''') parser.add_argument('--model_version', type=int, default=1, help='The version of the model') + parser.add_argument( + '--enable_dynamic_batching', + action="http://github.com/store_true", + help='''Enable dynamic batching. Please see model configuration + documentation for details: + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#dynamic-batcher''' + ) parser.add_argument( '--max_batch_size', type=int, default=0, - help='The maximum batch size for the model being generated') - parser.add_argument('--model_type', - type=str, - required=True, - choices=['pytorch', 'tensorflow'], - help='''The type of the compiled model. Currently, - only supports \"pytorch\" and \"tensorflow\".''') + help='''The maximum batch size for the model being generated. + Please see model configuration documentation for details: + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#maximum-batch-size''' + ) + parser.add_argument('--preferred_batch_size', + type=int, + help='''The preferred batch size. Should be multiples + of cores available to ensure proper utilization of + neuron cores. + This flag is ignored if --enable_dynamic_batching is + not specified. Please see model configuration + documentation for details: + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#preferred-batch-sizes''' + ) + parser.add_argument('--max_queue_delay_microseconds', + type=int, + help='''Max queue delay time(ms) for dynamic batching. + This flag is ignored if --enable_dynamic_batching is not specified. + Please see model configuration documentation for details: + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#delayed-batching''' + ) + parser.add_argument( + '--disable_batch_requests_to_neuron', + action="http://github.com/store_true", + help='''Send each request separately to neuron if enabled. 
+ If not specified, then requests are combined and sent to + neuron as a single batch''') parser.add_argument('--tag_set', type=str, default="serve", @@ -628,6 +766,14 @@ def create_model_file(using_tensorflow_model): elif FLAGS.model_type == 'pytorch': is_tensorflow_model = False + print('''Triton Dynamic Batching is enabled: {}, + preferred_batch_size: {} and max_batch_size: {} + with max_queue_delay_microseconds: {}. + Batch requests to neruon are disabled: {}'''.format( + FLAGS.enable_dynamic_batching, FLAGS.preferred_batch_size, + FLAGS.max_batch_size, FLAGS.max_queue_delay_microseconds, + FLAGS.disable_batch_requests_to_neuron)) + if not is_tensorflow_model or (FLAGS.triton_input != None and FLAGS.triton_output != None): inputs = parse_io_tensors(FLAGS.triton_input) @@ -647,13 +793,15 @@ def create_model_file(using_tensorflow_model): pass # ignore existing dir model_name = os.path.basename(FLAGS.triton_model_dir) - mc = create_modelconfig(model_name, FLAGS.max_batch_size, inputs, outputs, - FLAGS.compiled_model, nc_start_idx, nc_end_idx, - FLAGS.threads_per_core, - FLAGS.triton_model_instance_count) + mc = create_modelconfig( + model_name, FLAGS.max_batch_size, inputs, outputs, FLAGS.compiled_model, + nc_start_idx, nc_end_idx, FLAGS.threads_per_core, + FLAGS.triton_model_instance_count, FLAGS.enable_dynamic_batching, + FLAGS.preferred_batch_size, FLAGS.max_queue_delay_microseconds) with open(FLAGS.triton_model_dir + "/config.pbtxt", "w") as config_file: config_file.write(mc) - mf = create_model_file(is_tensorflow_model) + mf = create_model_file(is_tensorflow_model, + FLAGS.disable_batch_requests_to_neuron) with open(FLAGS.triton_model_dir + "/1/model.py", "w") as model_file: model_file.write(mf) diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 4bccde37..c497b5fc 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -494,9 +494,13 @@ Stub::Execute(RequestBatch* request_batch, ResponseBatch* response_batch) // If the number of request objects do not match the number of resposne // objects throw an error. if (response_size != batch_size) { - throw PythonBackendException( + std::string err = "Number of InferenceResponse objects do not match the number of " - "InferenceRequest objects."); + "InferenceRequest objects. 
InferenceRequest(s) size is:" + + std::to_string(batch_size) + + ", and InferenceResponse(s) size is:" + std::to_string(response_size) + + "\n"; + throw PythonBackendException(err); } shm_pool_->Map( From b145472d06dee790e1ac90506b9f7672fc82b13a Mon Sep 17 00:00:00 2001 From: Tanmay Verma Date: Thu, 6 Jan 2022 09:33:08 -0800 Subject: [PATCH 002/216] Fix error handling to include failed counts in infer stat (#110) * Fix error handling to include failed counts in infer stat * Replace goto with a reporter class * Add newlines --- CMakeLists.txt | 2 + src/pb_metric_reporter.cc | 103 +++++++++++++ src/pb_metric_reporter.h | 58 +++++++ src/python.cc | 308 +++++++++++++++++++------------------- 4 files changed, 313 insertions(+), 158 deletions(-) create mode 100644 src/pb_metric_reporter.cc create mode 100644 src/pb_metric_reporter.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d0caf1b..ae41aff0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -149,6 +149,8 @@ set( src/pb_env.h src/pb_main_utils.cc src/pb_main_utils.h + src/pb_metric_reporter.cc + src/pb_metric_reporter.h ) list(APPEND diff --git a/src/pb_metric_reporter.cc b/src/pb_metric_reporter.cc new file mode 100644 index 00000000..20786872 --- /dev/null +++ b/src/pb_metric_reporter.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include "pb_metric_reporter.h" + +#include "triton/backend/backend_common.h" + +namespace triton { namespace backend { namespace python { + +PbMetricReporter::PbMetricReporter( + TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::shared_ptr> responses) + : instance_(instance), requests_(requests), request_count_(request_count), + responses_(responses), total_batch_size_(0), exec_start_ns_(0), + compute_start_ns_(0), compute_end_ns_(0), exec_end_ns_(0) +{ +} + +PbMetricReporter::~PbMetricReporter() +{ + for (uint32_t r = 0; r < request_count_; ++r) { + TRITONBACKEND_Request* request = requests_[r]; + + // Report statistics for the request. Note that there could + // still be responses that have not yet been sent but those + // cannot be captured in the statistics as they reflect only the + // request object. We use the execution start/end time for + // compute also so that the entire execution time is associated + // with the inference computation. + LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportStatistics( + instance_, request, + ((*responses_)[r] != nullptr) /* success */, exec_start_ns_, + compute_start_ns_, compute_end_ns_, exec_end_ns_), + "failed reporting request statistics"); + } + + // Report the entire batch statistics. This backend does not support + // batching so the total batch size is always 1. + if (total_batch_size_ != 0) { + LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportBatchStatistics( + instance_, total_batch_size_, exec_start_ns_, + compute_start_ns_, compute_end_ns_, exec_end_ns_), + "failed reporting batch request statistics"); + } +} + +void +PbMetricReporter::SetBatchStatistics(size_t total_batch_size) +{ + total_batch_size_ = total_batch_size; +} + +void +PbMetricReporter::SetExecStartNs(const uint64_t exec_start_ns) +{ + exec_start_ns_ = exec_start_ns; +} + +void +PbMetricReporter::SetComputeStartNs(const uint64_t compute_start_ns) +{ + compute_start_ns_ = compute_start_ns; +} + +void +PbMetricReporter::SetComputeEndNs(const uint64_t compute_end_ns) +{ + compute_end_ns_ = compute_end_ns; +} + +void +PbMetricReporter::SetExecEndNs(const uint64_t exec_end_ns) +{ + exec_end_ns_ = exec_end_ns; +} + +}}} // namespace triton::backend::python diff --git a/src/pb_metric_reporter.h b/src/pb_metric_reporter.h new file mode 100644 index 00000000..978a949d --- /dev/null +++ b/src/pb_metric_reporter.h @@ -0,0 +1,58 @@ +// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include +#include +#include "triton/core/tritonbackend.h" + +namespace triton { namespace backend { namespace python { +class PbMetricReporter { + TRITONBACKEND_ModelInstance* instance_; + TRITONBACKEND_Request** requests_; + uint32_t request_count_; + std::shared_ptr> responses_; + size_t total_batch_size_; + uint64_t exec_start_ns_; + uint64_t compute_start_ns_; + uint64_t compute_end_ns_; + uint64_t exec_end_ns_; + + public: + PbMetricReporter( + TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, + const uint32_t request_count, + std::shared_ptr> responses); + ~PbMetricReporter(); + void SetBatchStatistics(size_t total_batch_size); + void SetExecStartNs(const uint64_t exec_start_ns); + void SetComputeStartNs(const uint64_t compute_start_ns); + void SetComputeEndNs(const uint64_t compute_end_ns); + void SetExecEndNs(const uint64_t exec_end_ns); +}; +}}}; // namespace triton::backend::python diff --git a/src/python.cc b/src/python.cc index 1cd23928..572f6c4c 100644 --- a/src/python.cc +++ b/src/python.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -56,6 +56,7 @@ #include "message_queue.h" #include "pb_env.h" #include "pb_main_utils.h" +#include "pb_metric_reporter.h" #include "pb_tensor.h" #include "pb_utils.h" #include "shm_manager.h" @@ -78,27 +79,47 @@ } \ } while (false) - -#define RESPOND_ALL_AND_RETURN_IF_ERROR(RESPONSES, RESPONSES_COUNT, X) \ - do { \ - TRITONSERVER_Error* raarie_err__ = (X); \ - if (raarie_err__ != nullptr) { \ - SendErrorForResponses(RESPONSES, RESPONSES_COUNT, raarie_err__); \ - return; \ - } \ +#define RESPOND_ALL_AND_RETURN_IF_ERROR(RESPONSES, RESPONSES_COUNT, X) \ + do { \ + TRITONSERVER_Error* raasnie_err__ = (X); \ + if (raasnie_err__ != nullptr) { \ + for (size_t ridx = 0; ridx < RESPONSES_COUNT; ++ridx) { \ + if ((*RESPONSES)[ridx] != nullptr) { \ + LOG_IF_ERROR( \ + TRITONBACKEND_ResponseSend( \ + (*RESPONSES)[ridx], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ + raasnie_err__), \ + "failed to send error response"); \ + (*RESPONSES)[ridx] = nullptr; \ + } \ + } \ + TRITONSERVER_ErrorDelete(raasnie_err__); \ + return; \ + } \ } while (false) -#define RESPOND_ALL_AND_RETURN_IF_EXCEPTION(RESPONSES, RESPONSES_COUNT, X) \ - do { \ - try { \ - (X); \ - } \ - catch (const PythonBackendException& exception) { \ - TRITONSERVER_Error* raarie_err__ = TRITONSERVER_ErrorNew( \ - TRITONSERVER_ERROR_INTERNAL, exception.what()); \ - SendErrorForResponses(RESPONSES, RESPONSES_COUNT, raarie_err__); \ - return; \ - } \ + +#define RESPOND_ALL_AND_RETURN_IF_EXCEPTION(RESPONSES, RESPONSES_COUNT, X) \ + do { \ + try { \ + (X); \ + } \ + catch (const PythonBackendException& exception) { \ + TRITONSERVER_Error* raarie_err__ = TRITONSERVER_ErrorNew( \ + TRITONSERVER_ERROR_INTERNAL, exception.what()); \ + for (size_t ridx = 0; ridx < RESPONSES_COUNT; ++ridx) { \ + if ((*RESPONSES)[ridx] != nullptr) { \ + LOG_IF_ERROR( \ + TRITONBACKEND_ResponseSend( \ + (*RESPONSES)[ridx], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ + raarie_err__), \ + "failed to send error response"); \ + (*RESPONSES)[ridx] = nullptr; \ + } \ + } \ + TRITONSERVER_ErrorDelete(raarie_err__); \ + return; \ + } \ } while (false) #define RESPOND_AND_RETURN_IF_ERROR(REQUEST, X) \ @@ -120,54 +141,40 @@ } \ } while (false) -#define RESPOND_ALL_REQUESTS_AND_RETURN_IF_EXCEPTION( \ - REQUESTS, REQUEST_COUNT, X) \ - do { \ - try { \ - (X); \ - } \ - catch (const PythonBackendException& exception) { \ - TRITONSERVER_Error* rarie_err__ = TRITONSERVER_ErrorNew( \ - TRITONSERVER_ERROR_INTERNAL, exception.what()); \ - RequestsRespondWithError(REQUESTS, REQUEST_COUNT, rarie_err__); \ - return; \ - } \ - } while (false) - -#define GUARDED_RESPOND_IF_ERROR(RESPONSES, IDX, X) \ - do { \ - if ((RESPONSES)[IDX] != nullptr) { \ - TRITONSERVER_Error* err__ = (X); \ - if (err__ != nullptr) { \ - LOG_IF_ERROR( \ - TRITONBACKEND_ResponseSend( \ - (RESPONSES)[IDX], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ - err__), \ - "failed to send error response"); \ - (RESPONSES)[IDX] = nullptr; \ - TRITONSERVER_ErrorDelete(err__); \ - } \ - } \ +#define GUARDED_RESPOND_IF_ERROR(RESPONSES, IDX, X) \ + do { \ + if ((*RESPONSES)[IDX] != nullptr) { \ + TRITONSERVER_Error* err__ = (X); \ + if (err__ != nullptr) { \ + LOG_IF_ERROR( \ + TRITONBACKEND_ResponseSend( \ + (*RESPONSES)[IDX], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ + err__), \ + "failed to send error response"); \ + (*RESPONSES)[IDX] = nullptr; \ + TRITONSERVER_ErrorDelete(err__); \ + } \ + } \ } while 
(false) -#define GUARDED_RESPOND_IF_EXCEPTION(RESPONSES, IDX, X) \ - do { \ - if ((RESPONSES)[IDX] != nullptr) { \ - try { \ - (X); \ - } \ - catch (const PythonBackendException& pb_exception) { \ - TRITONSERVER_Error* err__ = TRITONSERVER_ErrorNew( \ - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); \ - LOG_IF_ERROR( \ - TRITONBACKEND_ResponseSend( \ - (RESPONSES)[IDX], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ - err__), \ - "failed to send error response"); \ - (RESPONSES)[IDX] = nullptr; \ - TRITONSERVER_ErrorDelete(err__); \ - } \ - } \ +#define GUARDED_RESPOND_IF_EXCEPTION(RESPONSES, IDX, X) \ + do { \ + if ((*RESPONSES)[IDX] != nullptr) { \ + try { \ + (X); \ + } \ + catch (const PythonBackendException& pb_exception) { \ + TRITONSERVER_Error* err__ = TRITONSERVER_ErrorNew( \ + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); \ + LOG_IF_ERROR( \ + TRITONBACKEND_ResponseSend( \ + (*RESPONSES)[IDX], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ + err__), \ + "failed to send error response"); \ + (*RESPONSES)[IDX] = nullptr; \ + TRITONSERVER_ErrorDelete(err__); \ + } \ + } \ } while (false) #define RETURN_IF_EXCEPTION(X) \ @@ -262,7 +269,7 @@ class ModelInstanceState : public BackendModelInstance { TRITONSERVER_Error* GetInputTensor( const uint32_t input_idx, Tensor* input_tensor_shm, std::shared_ptr& input_tensor, TRITONBACKEND_Request* request, - std::vector& responses); + std::shared_ptr>& responses); void ProcessRequests( TRITONBACKEND_Request** requests, const uint32_t request_count, @@ -283,12 +290,13 @@ class ModelInstanceState : public BackendModelInstance { // Get a message from the stub process void SendMessageAndReceiveResponse( off_t message, off_t& response, bool& restart, - std::vector& responses, + std::shared_ptr>& responses, TRITONBACKEND_Request** requests, const uint32_t request_count); // Responds to all the requests with an error message. 
void RespondErrorToAllRequests( - const char* message, std::vector& responses, + const char* message, + std::shared_ptr>& responses, TRITONBACKEND_Request** requests, const uint32_t request_count); // Kill stub process @@ -346,7 +354,7 @@ ModelInstanceState::WaitForBLSRequestsToFinish() void ModelInstanceState::SendMessageAndReceiveResponse( off_t message, off_t& response, bool& restart, - std::vector& responses, + std::shared_ptr>& responses, TRITONBACKEND_Request** requests, const uint32_t request_count) { auto error = SendMessageToStub(message); @@ -445,11 +453,12 @@ ModelInstanceState::ReceiveMessageFromStub(off_t& message) void ModelInstanceState::RespondErrorToAllRequests( - const char* message, std::vector& responses, + const char* message, + std::shared_ptr>& responses, TRITONBACKEND_Request** requests, const uint32_t request_count) { for (uint32_t r = 0; r < request_count; ++r) { - if (responses[r] == nullptr) + if ((*responses)[r] == nullptr) continue; std::string err_message = @@ -462,10 +471,10 @@ ModelInstanceState::RespondErrorToAllRequests( TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, err_message.c_str()); LOG_IF_ERROR( TRITONBACKEND_ResponseSend( - responses[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), "failed sending response"); - responses[r] = nullptr; + (*responses)[r] = nullptr; TRITONSERVER_ErrorDelete(err); } } @@ -604,6 +613,15 @@ ModelInstanceState::ProcessRequests( int max_batch_size = model_state->MaxBatchSize(); std::string name = model_state->Name(); + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("model ") + model_state->Name() + ", instance " + Name() + + ", executing " + std::to_string(request_count) + " requests") + .c_str()); + + uint64_t exec_start_ns = 0; + SET_TIMESTAMP(exec_start_ns); + // For each request collect the total batch size for this inference // execution. The batch-size, number of inputs, and size of each // input has already been checked so don't need to do that here. @@ -622,7 +640,29 @@ ModelInstanceState::ProcessRequests( .c_str())); return; } + } + + // We take the responsibility of the responses. + std::shared_ptr> responses( + new std::vector()); + responses->reserve(request_count); + PbMetricReporter reporter( + TritonModelInstance(), requests, request_count, responses); + reporter.SetExecStartNs(exec_start_ns); + + for (size_t i = 0; i < request_count; i++) { + TRITONBACKEND_Response* response; + auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); + if (err == nullptr) { + responses->emplace_back(response); + } else { + responses->emplace_back(nullptr); + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); + TRITONSERVER_ErrorDelete(err); + } + } + for (size_t i = 0; i < request_count; i++) { if (max_batch_size > 0) { // Retrieve the batch size from one of the inputs, if the model // supports batching, the first dimension size is batch size @@ -636,8 +676,7 @@ ModelInstanceState::ProcessRequests( total_batch_size += shape[0]; } if (err != nullptr) { - RequestsRespondWithError(requests, request_count, err); - return; + RESPOND_ALL_AND_RETURN_IF_ERROR(responses, request_count, err); } } else { ++total_batch_size; @@ -655,74 +694,50 @@ ModelInstanceState::ProcessRequests( // scheduler has done something badly wrong so fail and release all // requests. 
if ((total_batch_size != 1) && (total_batch_size > (size_t)max_batch_size)) { - RequestsRespondWithError( - requests, request_count, + RESPOND_ALL_AND_RETURN_IF_ERROR( + responses, request_count, TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, std::string( "batch size " + std::to_string(total_batch_size) + " for '" + name + "', max allowed is " + std::to_string(max_batch_size)) .c_str())); - return; } - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("model ") + model_state->Name() + ", instance " + Name() + - ", executing " + std::to_string(request_count) + " requests") - .c_str()); - uint64_t exec_start_ns = 0; - SET_TIMESTAMP(exec_start_ns); - std::unique_ptr ipc_message = - std::make_unique(shm_pool_, false /*inline_resposne*/); + std::make_unique(shm_pool_, false /*inline_response*/); ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_ExecuteRequest; RequestBatch* request_batch; off_t request_batch_offset; - RESPOND_ALL_REQUESTS_AND_RETURN_IF_EXCEPTION( - requests, request_count, + RESPOND_ALL_AND_RETURN_IF_EXCEPTION( + responses, request_count, shm_pool_->Map( (char**)&request_batch, sizeof(RequestBatch), request_batch_offset)); + ipc_message->Args() = request_batch_offset; request_batch->batch_size = request_count; Request* requests_shm; off_t requests_shm_offset; - RESPOND_ALL_REQUESTS_AND_RETURN_IF_EXCEPTION( - requests, request_count, + RESPOND_ALL_AND_RETURN_IF_EXCEPTION( + responses, request_count, shm_pool_->Map( (char**)&requests_shm, sizeof(Request) * request_count, requests_shm_offset)); request_batch->requests = requests_shm_offset; - // We take the responsibilty of the responses. - std::vector responses; - responses.reserve(request_count); - - for (size_t i = 0; i < request_count; i++) { - TRITONBACKEND_Response* response; - auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); - if (err == nullptr) { - responses.emplace_back(response); - } else { - responses.emplace_back(nullptr); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response."); - TRITONSERVER_ErrorDelete(err); - } - } - for (uint32_t r = 0; r < request_count; ++r) { TRITONBACKEND_Request* request = requests[r]; Request* python_infer_request = &requests_shm[r]; uint32_t requested_input_count = 0; RESPOND_ALL_AND_RETURN_IF_ERROR( - &responses, request_count, + responses, request_count, TRITONBACKEND_RequestInputCount(request, &requested_input_count)); uint32_t requested_output_count = 0; RESPOND_ALL_AND_RETURN_IF_ERROR( - &responses, request_count, + responses, request_count, TRITONBACKEND_RequestOutputCount(request, &requested_output_count)); python_infer_request->requested_output_count = requested_output_count; @@ -730,7 +745,7 @@ ModelInstanceState::ProcessRequests( off_t input_tensors_offset; RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - &responses, request_count, + responses, request_count, shm_pool_->Map( (char**)&input_tensors, sizeof(Tensor) * requested_input_count, input_tensors_offset)); @@ -742,7 +757,7 @@ ModelInstanceState::ProcessRequests( std::shared_ptr pb_input_tensor; RESPOND_ALL_AND_RETURN_IF_ERROR( - &responses, request_count, + responses, request_count, GetInputTensor( iidx, input_tensor, pb_input_tensor, request, responses)); pb_input_tensors.emplace_back(std::move(pb_input_tensor)); @@ -753,7 +768,7 @@ ModelInstanceState::ProcessRequests( for (size_t iidx = 0; iidx < requested_output_count; ++iidx) { const char* requested_output_name; RESPOND_ALL_AND_RETURN_IF_ERROR( - &responses, request_count, + responses, request_count, TRITONBACKEND_RequestOutputName( 
request, iidx, &requested_output_name)); requested_output_names.emplace_back(requested_output_name); @@ -762,23 +777,24 @@ ModelInstanceState::ProcessRequests( // request id const char* id; RESPOND_ALL_AND_RETURN_IF_ERROR( - &responses, request_count, TRITONBACKEND_RequestId(request, &id)); + responses, request_count, TRITONBACKEND_RequestId(request, &id)); uint64_t correlation_id; RESPOND_ALL_AND_RETURN_IF_ERROR( - &responses, request_count, + responses, request_count, TRITONBACKEND_RequestCorrelationId(request, &correlation_id)); InferRequest infer_request( id, correlation_id, pb_input_tensors, requested_output_names, model_state->Name(), model_state->Version()); RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - &responses, request_count, + responses, request_count, infer_request.SaveToSharedMemory(shm_pool_, python_infer_request)); } uint64_t compute_start_ns = 0; SET_TIMESTAMP(compute_start_ns); + reporter.SetComputeStartNs(compute_start_ns); // This means that the stub process has exited and Python // backend failed to restart the stub process. @@ -786,7 +802,6 @@ ModelInstanceState::ProcessRequests( const char* error_message = "The stub process has exited unexpectedly."; RespondErrorToAllRequests( error_message, responses, requests, request_count); - return; } @@ -799,7 +814,7 @@ ModelInstanceState::ProcessRequests( } RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - &responses, request_count, + responses, request_count, ipc_message = IPCMessage::LoadFromSharedMemory(shm_pool_, response_message)); @@ -819,23 +834,23 @@ ModelInstanceState::ProcessRequests( restart = true; RespondErrorToAllRequests( TRITONSERVER_ErrorMessage(error), responses, requests, request_count); - return; } RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - &responses, request_count, + responses, request_count, ipc_message = IPCMessage::LoadFromSharedMemory(shm_pool_, response_message)); } uint64_t compute_end_ns = 0; SET_TIMESTAMP(compute_end_ns); + reporter.SetComputeEndNs(compute_end_ns); // Parsing the request response ResponseBatch* response_batch; RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - &responses, request_count, + responses, request_count, shm_pool_->MapOffset((char**)&response_batch, ipc_message->Args())); // If inference fails, release all the requests and send an error response. 
@@ -845,7 +860,7 @@ ModelInstanceState::ProcessRequests( if (response_batch->is_error_set) { char* error_message; RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - &responses, request_count, + responses, request_count, LoadStringFromSharedMemory( shm_pool_, response_batch->error, error_message)); RespondErrorToAllRequests( @@ -856,15 +871,15 @@ ModelInstanceState::ProcessRequests( RespondErrorToAllRequests( error_message, responses, requests, request_count); } - return; } Response* responses_shm; RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - &responses, request_count, + responses, request_count, shm_pool_->MapOffset((char**)&responses_shm, response_batch->responses)); + bool has_gpu_output = false; // The vector that stores the tensor pairs for the tensors that the stub @@ -873,7 +888,7 @@ ModelInstanceState::ProcessRequests( tensor_buffer_pairs; for (uint32_t r = 0; r < request_count; ++r) { - TRITONBACKEND_Response* response = responses[r]; + TRITONBACKEND_Response* response = (*responses)[r]; TRITONBACKEND_Request* request = requests[r]; uint32_t requested_output_count = 0; @@ -892,7 +907,7 @@ ModelInstanceState::ProcessRequests( LOG_IF_ERROR( TRITONBACKEND_ResponseSend( - responses[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), "failed sending response"); TRITONSERVER_ErrorDelete(err); } else { @@ -902,12 +917,12 @@ ModelInstanceState::ProcessRequests( LOG_IF_ERROR( TRITONBACKEND_ResponseSend( - responses[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), "failed sending response"); TRITONSERVER_ErrorDelete(err); } - responses[r] = nullptr; + (*responses)[r] = nullptr; // If has_error is true, we do not look at the response even if the // response is set. @@ -919,10 +934,10 @@ ModelInstanceState::ProcessRequests( TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); LOG_IF_ERROR( TRITONBACKEND_ResponseSend( - responses[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), "failed sending response"); TRITONSERVER_ErrorDelete(err); - responses[r] = nullptr; + (*responses)[r] = nullptr; continue; } @@ -1069,36 +1084,13 @@ ModelInstanceState::ProcessRequests( GUARDED_RESPOND_IF_ERROR( responses, r, TRITONBACKEND_ResponseSend( - responses[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr)); + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr)); } uint64_t exec_end_ns = 0; SET_TIMESTAMP(exec_end_ns); - - for (uint32_t r = 0; r < request_count; ++r) { - TRITONBACKEND_Request* request = requests[r]; - - // Report statistics for the request. Note that there could - // still be responses that have not yet been sent but those - // cannot be captured in the statistics as they reflect only the - // request object. We use the execution start/end time for - // compute also so that the entire execution time is associated - // with the inference computation. - LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportStatistics( - TritonModelInstance(), request, - (responses[r] != nullptr) /* success */, exec_start_ns, - compute_start_ns, compute_end_ns, exec_end_ns), - "failed reporting request statistics"); - } - - // Report the entire batch statistics. This backend does not support - // batching so the total batch size is always 1. 
- LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportBatchStatistics( - TritonModelInstance(), total_batch_size, exec_start_ns, - compute_start_ns, compute_end_ns, exec_end_ns), - "failed reporting batch request statistics"); + reporter.SetExecEndNs(exec_end_ns); + reporter.SetBatchStatistics(total_batch_size); return; } @@ -1472,7 +1464,7 @@ TRITONSERVER_Error* ModelInstanceState::GetInputTensor( const uint32_t input_idx, Tensor* input_tensor_shm, std::shared_ptr& input_tensor, TRITONBACKEND_Request* request, - std::vector& responses) + std::shared_ptr>& responses) { const char* input_name; // Load iidx'th input name @@ -1495,7 +1487,7 @@ ModelInstanceState::GetInputTensor( &input_dims_count, &input_byte_size, &input_buffer_count)); BackendInputCollector collector( - &request, 1, &responses, Model()->TritonMemoryManager(), + &request, 1, responses.get(), Model()->TritonMemoryManager(), false /* pinned_enable */, CudaStream(), nullptr, nullptr, 0, HostPolicyName().c_str()); From 8d203e9709efc95ad1ff1b41be41b1cab96f6b02 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Thu, 6 Jan 2022 15:38:16 -0500 Subject: [PATCH 003/216] Fix zero length io in BLS (#109) --- src/pb_main_utils.cc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/pb_main_utils.cc b/src/pb_main_utils.cc index 6526f663..a11318e6 100644 --- a/src/pb_main_utils.cc +++ b/src/pb_main_utils.cc @@ -265,10 +265,18 @@ RequestExecutor::Infer( // userp is only set for the CPU tensors if (memory_type != TRITONSERVER_MEMORY_GPU) { - output_tensors.push_back(std::make_shared( - sname, dims_vector, datatype, memory_type, memory_type_id, - const_cast(base), byte_size, nullptr /* DLManagedTensor */, - *(reinterpret_cast(userp)))); + if (byte_size != 0) { + output_tensors.push_back(std::make_shared( + sname, dims_vector, datatype, memory_type, memory_type_id, + const_cast(base), byte_size, + nullptr /* DLManagedTensor */, + *(reinterpret_cast(userp)))); + } else { + output_tensors.push_back(std::make_shared( + sname, dims_vector, datatype, memory_type, memory_type_id, + const_cast(base), byte_size, + nullptr /* DLManagedTensor */, 0 /* shared memory offest */)); + } } else { output_tensors.push_back(std::make_shared( sname, dims_vector, datatype, memory_type, memory_type_id, From e9379464298de8ca9273910fecd07a57202d0789 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Tue, 11 Jan 2022 11:23:32 -0500 Subject: [PATCH 004/216] Fix background thread sleep (#112) --- src/pb_stub.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pb_stub.cc b/src/pb_stub.cc index c497b5fc..958574b2 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -790,7 +790,7 @@ main(int argc, char** argv) // shared memory and will be set to false by the parent process. // The parent process expects that the stub process sets this // variable to true within 1 second. 
- sleep(0.3); + std::this_thread::sleep_for(std::chrono::milliseconds(300)); stub->UpdateHealth(); From a48fe08523821008b97e2c641b90be97c0d37438 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Tue, 11 Jan 2022 23:11:08 -0500 Subject: [PATCH 005/216] Improve the error checking for execute return (#111) --- src/pb_stub.cc | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 958574b2..0c16d0b1 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -478,15 +478,25 @@ Stub::Execute(RequestBatch* request_batch, ResponseBatch* response_batch) // Execute Response py::object execute_return = model_instance_.attr("execute")(request_list); - py::list responses; + py::object responses_obj; bool is_coroutine = asyncio.attr("iscoroutine")(execute_return).cast(); if (is_coroutine) { - responses = asyncio.attr("run")(execute_return); + responses_obj = asyncio.attr("run")(execute_return); } else { - responses = execute_return; + responses_obj = execute_return; } + // Check the return type of execute function. + if (!py::isinstance(responses_obj)) { + std::string str = py::str(execute_return.get_type()); + throw PythonBackendException( + std::string("Expected a list in the execute return, found type '") + + str + "'."); + } + + py::list responses = responses_obj; + Response* responses_shm; off_t responses_shm_offset; size_t response_size = py::len(responses); @@ -511,6 +521,15 @@ Stub::Execute(RequestBatch* request_batch, ResponseBatch* response_batch) size_t i = 0; for (auto& response : responses) { + // Check the return type of execute function. + if (!py::isinstance(response)) { + std::string str = py::str(response.get_type()); + throw PythonBackendException( + std::string("Expected an 'InferenceResponse' object in the execute " + "function return list, found type '") + + str + "'."); + } + InferResponse* infer_response = response.cast(); Response* response_shm = &responses_shm[i]; ProcessResponse(response_shm, response_batch, infer_response); From 27c854c2ab83ae2ca214419c618534db7fa949b7 Mon Sep 17 00:00:00 2001 From: Hemant Jain Date: Fri, 14 Jan 2022 18:52:54 -0800 Subject: [PATCH 006/216] Add warning for when TRITON_ENABLE_GPU is disabled (#113) --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ae41aff0..c6141312 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -114,6 +114,8 @@ if(${TRITON_ENABLE_GPU}) find_package(CUDAToolkit REQUIRED) message(STATUS "Using CUDA ${CUDA_VERSION}") set(CUDA_NVCC_FLAGS -std=c++11) +elseif() + message(WARNING "TRITON_ENABLE_GPU is OFF, GPU Tensor support will be disabled") endif() # TRITON_ENABLE_GPU find_package(ZLIB REQUIRED) From f00770b89116430708a258a4aaf89e094a88cc7c Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Fri, 21 Jan 2022 13:40:24 -0500 Subject: [PATCH 007/216] Fix BLS support for sequence models (#115) * Fix BLS support for sequence models * Review edits --- src/infer_request.cc | 21 +++++++++++++++++--- src/infer_request.h | 6 +++++- src/pb_main_utils.cc | 7 ++++++- src/pb_stub.cc | 7 +++++-- src/pb_utils.h | 1 + src/python.cc | 6 +++++- src/resources/triton_python_backend_utils.py | 4 ++++ 7 files changed, 44 insertions(+), 8 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index b958325b..8a011f94 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -39,10 +39,11 @@ InferRequest::InferRequest( const std::string& request_id, uint64_t correlation_id, const std::vector>& inputs, const std::vector& requested_output_names, - const std::string& model_name, const int64_t model_version) + const std::string& model_name, const int64_t model_version, + const uint32_t flags) : request_id_(request_id), correlation_id_(correlation_id), inputs_(inputs), requested_output_names_(requested_output_names), model_name_(model_name), - model_version_(model_version) + model_version_(model_version), flags_(flags) { } @@ -82,6 +83,18 @@ InferRequest::ModelVersion() return model_version_; } +uint32_t +InferRequest::Flags() +{ + return flags_; +} + +void +InferRequest::SetFlags(uint32_t flags) +{ + flags_ = flags; +} + void InferRequest::SaveToSharedMemory( std::unique_ptr& shm_pool, Request* request_shm) @@ -97,6 +110,7 @@ InferRequest::SaveToSharedMemory( (char**)&requested_output_names, sizeof(off_t) * request_shm->requested_output_count, requested_output_names_offset); + request_shm->flags = Flags(); request_shm->requested_output_names = requested_output_names_offset; size_t i = 0; @@ -150,7 +164,8 @@ InferRequest::LoadFromSharedMemory( LoadStringFromSharedMemory(shm_pool, request->model_name, model_name); return std::make_unique( id, request->correlation_id, std::move(py_input_tensors), - requested_output_names, model_name, request->model_version); + requested_output_names, model_name, request->model_version, + request->flags); } #ifdef TRITON_PB_STUB diff --git a/src/infer_request.h b/src/infer_request.h index 636b62a1..233a77d4 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -38,19 +38,23 @@ class InferRequest { std::vector requested_output_names_; std::string model_name_; int64_t model_version_; + uint32_t flags_; public: InferRequest( const std::string& request_id, uint64_t correlation_id, const std::vector>& inputs, const std::vector& requested_output_names, - const std::string& model_name, const int64_t model_version); + const std::string& model_name, const int64_t model_version, + const uint32_t flags = 0); const std::vector>& Inputs(); const std::string& RequestId(); uint64_t CorrelationId(); const std::string& ModelName(); int64_t ModelVersion(); + uint32_t Flags(); + void SetFlags(uint32_t flags); const std::vector& RequestedOutputNames(); /// Save an Inference Request to shared memory. 
diff --git a/src/pb_main_utils.cc b/src/pb_main_utils.cc index a11318e6..e9f371b3 100644 --- a/src/pb_main_utils.cc +++ b/src/pb_main_utils.cc @@ -203,6 +203,12 @@ RequestExecutor::Infer( THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetId( irequest, infer_request->RequestId().c_str())); + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetCorrelationId( + irequest, infer_request->CorrelationId())); + + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetFlags( + irequest, infer_request->Flags())); + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetReleaseCallback( irequest, InferRequestComplete, nullptr /* request_release_userp */)); @@ -238,7 +244,6 @@ RequestExecutor::Infer( response = completed.get(); *triton_response = response; delete_inference_request = false; - THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceResponseError(response)); uint32_t output_count; diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 0c16d0b1..26c13d31 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -698,15 +698,18 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) const std::string&, uint64_t, const std::vector>&, const std::vector&, const std::string&, - const int64_t>(), + const int64_t, const uint32_t>(), py::arg("request_id") = "", py::arg("correlation_id") = 0, py::arg("inputs"), py::arg("requested_output_names"), - py::arg("model_name"), py::arg("model_version") = -1) + py::arg("model_name"), py::arg("model_version") = -1, + py::arg("flags") = 0) .def( "inputs", &InferRequest::Inputs, py::return_value_policy::reference_internal) .def("request_id", &InferRequest::RequestId) .def("correlation_id", &InferRequest::CorrelationId) + .def("flags", &InferRequest::Flags) + .def("set_flags", &InferRequest::SetFlags) .def("exec", &InferRequest::Exec) .def( "async_exec", diff --git a/src/pb_utils.h b/src/pb_utils.h index 157adf5b..8bb76193 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -145,6 +145,7 @@ struct Request { uint32_t requested_output_count; off_t model_name; int64_t model_version; + uint32_t flags; }; struct Response { diff --git a/src/python.cc b/src/python.cc index 572f6c4c..9370d8f4 100644 --- a/src/python.cc +++ b/src/python.cc @@ -784,9 +784,13 @@ ModelInstanceState::ProcessRequests( responses, request_count, TRITONBACKEND_RequestCorrelationId(request, &correlation_id)); + uint32_t flags; + RESPOND_ALL_AND_RETURN_IF_ERROR( + responses, request_count, TRITONBACKEND_RequestFlags(request, &flags)); + InferRequest infer_request( id, correlation_id, pb_input_tensors, requested_output_names, - model_state->Name(), model_state->Version()); + model_state->Name(), model_state->Version(), flags); RESPOND_ALL_AND_RETURN_IF_EXCEPTION( responses, request_count, infer_request.SaveToSharedMemory(shm_pool_, python_infer_request)); diff --git a/src/resources/triton_python_backend_utils.py b/src/resources/triton_python_backend_utils.py index 4789e9e8..c5fedc5d 100644 --- a/src/resources/triton_python_backend_utils.py +++ b/src/resources/triton_python_backend_utils.py @@ -273,3 +273,7 @@ def numpy_to_triton_type(data_type): def triton_string_to_numpy(triton_type_string): return TRITON_STRING_TO_NUMPY[triton_type_string] + + +TRITONSERVER_REQUEST_FLAG_SEQUENCE_START = 1 +TRITONSERVER_REQUEST_FLAG_SEQUENCE_END = 2 From 0ca52c8718a063aaa3bc5ac87b645a1f91407e51 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Fri, 21 Jan 2022 17:14:38 -0500 Subject: [PATCH 008/216] Add documentation for using sequence models in BLS (#116) * Add documentation for using sequence models in BLS * Review edit 
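Besides letting BLS callers set sequence flags, the change above also exposes the flags of incoming requests to Python models through the new `flags()`/`set_flags()` bindings and the `TRITONSERVER_REQUEST_FLAG_SEQUENCE_START`/`_END` constants. A sketch of how a sequence model might inspect them inside `execute` (the tensor names and the two state helpers are placeholders, not part of this patch):

```python
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        responses = []
        for request in requests:
            flags = request.flags()
            # Bitwise tests against the constants added to
            # triton_python_backend_utils.py in this change.
            if flags & pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START:
                self._reset_state(request.correlation_id())
            input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            output0 = pb_utils.Tensor("OUTPUT0", input0.as_numpy())
            if flags & pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_END:
                self._finalize_state(request.correlation_id())
            responses.append(
                pb_utils.InferenceResponse(output_tensors=[output0]))
        return responses

    def _reset_state(self, correlation_id):
        ...

    def _finalize_state(self, correlation_id):
        ...
```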
--- README.md | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 403faa0d..80654474 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ any C++ code. - [Error Handling](#error-handling) - [Managing Shared Memory](#managing-shared-memory) - [Business Logic Scripting](#business-logic-scripting) + - [Using BLS with Stateful Models](#using-bls-with-stateful-models) - [Limitations](#limitations) - [Interoperability and GPU Support](#interoperability-and-gpu-support) - [`pb_utils.Tensor.to_dlpack() -> PyCapsule`](#pb_utilstensorto_dlpack---pycapsule) @@ -587,7 +588,7 @@ class TritonPythonModel: # inference_request = pb_utils.InferenceRequest(model_name='model_name', # requested_output_names=['REQUESTED_OUTPUT_1', 'REQUESTED_OUTPUT_2'], # inputs=[], - # request_id="1", correlation_id=4, model_version=1) + # request_id="1", correlation_id=4, model_version=1, flags=0) # Execute the inference_request and wait for the response inference_response = inference_request.exec() @@ -660,6 +661,32 @@ class TritonPythonModel: A complete example for sync and async BLS in Python backend is included in the [Examples](#examples) section. +## Using BLS with Stateful Models + +[Stateful models](https://github.com/triton-inference-server/server/blob/main/docs/architecture.md#stateful-models) +require setting additional flags in the inference request to indicate the +start and of a sequence. The `flags` argument in the `pb_utils.InferenceRequest` +object can be used to indicate whether the request is the first or last request +in the sequence. An example indicating that the request is starting the +sequence: + +```python +inference_request = pb_utils.InferenceRequest(model_name='model_name', + requested_output_names=['REQUESTED_OUTPUT_1', 'REQUESTED_OUTPUT_2'], + inputs=[], + request_id="1", correlation_id=4, flags=pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START) +``` + +For indicating the ending of the sequence you can use the +`pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_END` flag. If the request is both +starting and ending a sequence at the same time (i.e. the sequence has only a +single request), you can use the bitwise OR operator to enable both of the +flags: + +``` +flags = pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START | pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_END +``` + ## Limitations - The number of inference requests that can be executed as a part of your model From b3856205a21c17b19314de3a8854c709f485d9f9 Mon Sep 17 00:00:00 2001 From: Hemant Jain Date: Wed, 26 Jan 2022 09:58:53 -0800 Subject: [PATCH 009/216] Docs to let user know async BLS needed Python >= 3.7 (#117) * Docs to let user know async BLS needed Python >= 3.7 * review edits --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 80654474..95de9203 100644 --- a/README.md +++ b/README.md @@ -661,6 +661,9 @@ class TritonPythonModel: A complete example for sync and async BLS in Python backend is included in the [Examples](#examples) section. +Note: Async BLS is not supported on Python 3.6 or lower due to the `async` keyword +and `asyncio.run` being introduced in Python 3.7. 
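To make the version requirement above concrete, an async BLS model needs both 3.7-era pieces at once: `async def execute` (the coroutine the stub detects) and `asyncio.run` (which the stub uses to drive it). A sketch of such a model, with the downstream model name and tensor names as placeholders and the incoming inputs forwarded unchanged for brevity:

```python
import asyncio

import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    async def execute(self, requests):
        # Issue one BLS request per incoming request and await them together.
        pending = []
        for request in requests:
            infer_request = pb_utils.InferenceRequest(
                model_name="downstream_model",       # placeholder
                requested_output_names=["OUTPUT0"],  # placeholder
                inputs=request.inputs())
            pending.append(infer_request.async_exec())
        bls_responses = await asyncio.gather(*pending)

        responses = []
        for bls_response in bls_responses:
            output0 = pb_utils.get_output_tensor_by_name(bls_response, "OUTPUT0")
            responses.append(
                pb_utils.InferenceResponse(output_tensors=[output0]))
        return responses
```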
+ ## Using BLS with Stateful Models [Stateful models](https://github.com/triton-inference-server/server/blob/main/docs/architecture.md#stateful-models) From c9e4d98bef4db4599df70b631ec42ccd3825ac18 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Wed, 2 Feb 2022 14:25:52 -0500 Subject: [PATCH 010/216] Add zlib to the dependencies (#118) --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 95de9203..8954813b 100644 --- a/README.md +++ b/README.md @@ -120,14 +120,15 @@ $ python3 python_backend/examples/add_sub/client.py * numpy * rapidjson-dev * libarchive-dev +* zlib1g-dev ``` pip3 install numpy ``` -On Ubuntu or Debian you can use the command below to install `rapidjson` and `libarchive`: +On Ubuntu or Debian you can use the command below to install `rapidjson`, `libarchive`, and `zlib`: ``` -sudo apt-get install rapidjson-dev libarchive-dev +sudo apt-get install rapidjson-dev libarchive-dev zlib1g-dev ``` 2. Build Python backend. Replace \ with the GitHub branch From cca536710f8e9b66cd450564ff07d7f7418dae34 Mon Sep 17 00:00:00 2001 From: jbkyang-nvi <80359429+jbkyang-nvi@users.noreply.github.com> Date: Wed, 2 Feb 2022 19:16:30 -0800 Subject: [PATCH 011/216] add optional change for DEFAULT_REPO_TAG for release testing purposes (#120) --- inferentia/qa/setup_test_enviroment_and_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inferentia/qa/setup_test_enviroment_and_test.sh b/inferentia/qa/setup_test_enviroment_and_test.sh index c396f016..0999dd97 100644 --- a/inferentia/qa/setup_test_enviroment_and_test.sh +++ b/inferentia/qa/setup_test_enviroment_and_test.sh @@ -26,7 +26,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. export TRITON_PATH="/home/ubuntu" -export DEFAULT_REPO_TAG="main" +export DEFAULT_REPO_TAG=${DEFAULT_REPO_TAG:="main"} export TRITON_COMMON_REPO_TAG=${DEFAULT_REPO_TAG} export TRITON_CORE_REPO_TAG=${DEFAULT_REPO_TAG} export TRITON_BACKEND_REPO_TAG=${DEFAULT_REPO_TAG} From 56727b5b78d5d04720eac718c5f3ef1b9d591a94 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Fri, 4 Feb 2022 09:01:21 -0500 Subject: [PATCH 012/216] Fix GPU tensors when peer access is not enabled (#119) --- src/pb_tensor.cc | 24 ++++++++++++++++++++++-- src/pb_tensor.h | 3 +++ src/python.cc | 2 ++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc index c2d28400..ea4baadf 100644 --- a/src/pb_tensor.cc +++ b/src/pb_tensor.cc @@ -1,4 +1,4 @@ -// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -634,11 +634,15 @@ PbTensor::LoadGPUData(std::unique_ptr& shm_pool) "'."); } char* d_buffer; + + // Sync the memory type id. Since it will be updated by the main process + // after providing the GPU buffers. + memory_type_id_ = raw_data_shm_->memory_type_id; + cudaSetDevice(this->MemoryTypeId()); shm_pool->MapOffset( (char**)&cuda_ipc_mem_handle_, raw_data_shm_->memory_ptr); - // Lock the mutex when using cudaIpcOpenMemHandle. This code is only // required in the stub process. In the Triton process, we never use // cudaIpcOpenMemHandle. 
The mutex is required because cudaIpcOpenMemHandle @@ -727,6 +731,20 @@ PbTensor::SetDataPtr(void* ptr) memory_ptr_ = ptr; } +void +PbTensor::SetMemoryType(TRITONSERVER_MemoryType memory_type) +{ + memory_type_ = memory_type; + raw_data_shm_->memory_type = memory_type; +} + +void +PbTensor::SetMemoryTypeId(int64_t memory_type_id) +{ + memory_type_id_ = memory_type_id; + raw_data_shm_->memory_type_id = memory_type_id; +} + #ifdef TRITON_ENABLE_GPU #ifndef TRITON_PB_STUB void @@ -746,6 +764,8 @@ PbTensor::SetBackendMemory( } memory_ptr_ = backend_memory->MemoryPtr(); + SetMemoryType(backend_memory->MemoryType()); + SetMemoryTypeId(backend_memory->MemoryTypeId()); backend_memory_ = std::move(backend_memory); raw_data_shm_->offset = this->GetGPUPointerOffset(); tensor_shm_->is_cuda_handle_set = true; diff --git a/src/pb_tensor.h b/src/pb_tensor.h index 83c60bd2..09ab36f8 100644 --- a/src/pb_tensor.h +++ b/src/pb_tensor.h @@ -244,6 +244,9 @@ class PbTensor { /// \return The location to the memory where the data is stored. void* GetDataPtr() const; + void SetMemoryType(TRITONSERVER_MemoryType memory_type); + void SetMemoryTypeId(int64_t memory_type_id); + /// Set the underlying pointer to use. This must be only used when the tensor /// is being reused. void SetDataPtr(void* ptr); diff --git a/src/python.cc b/src/python.cc index 9370d8f4..12e13011 100644 --- a/src/python.cc +++ b/src/python.cc @@ -997,6 +997,8 @@ ModelInstanceState::ProcessRequests( cudaIpcMemHandle_t cuda_ipc_mem_handle; cudaError_t err = cudaIpcGetMemHandle(&cuda_ipc_mem_handle, buffer); output_tensor->SetCudaIpcMemHandle(&cuda_ipc_mem_handle); + output_tensor->SetMemoryType(actual_memory_type); + output_tensor->SetMemoryTypeId(actual_memory_type_id); if (err != cudaSuccess) { GUARDED_RESPOND_IF_ERROR( From abad07ffb42f8eddebe6f9599131a8dcb843fbd3 Mon Sep 17 00:00:00 2001 From: madhu-nvda <77174106+madhu-nvda@users.noreply.github.com> Date: Wed, 9 Feb 2022 12:14:32 -0800 Subject: [PATCH 013/216] Turn off git shallow so that users can pin older commits (#121) Co-authored-by: msridhara --- CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c6141312..e9794bfa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,19 +57,16 @@ FetchContent_Declare( repo-common GIT_REPOSITORY https://github.com/triton-inference-server/common.git GIT_TAG ${TRITON_COMMON_REPO_TAG} - GIT_SHALLOW ON ) FetchContent_Declare( repo-core GIT_REPOSITORY https://github.com/triton-inference-server/core.git GIT_TAG ${TRITON_CORE_REPO_TAG} - GIT_SHALLOW ON ) FetchContent_Declare( repo-backend GIT_REPOSITORY https://github.com/triton-inference-server/backend.git GIT_TAG ${TRITON_BACKEND_REPO_TAG} - GIT_SHALLOW ON ) FetchContent_MakeAvailable(repo-common repo-core repo-backend) From 630293054cc2c4fb536cbc927953a9721e60aed7 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Tue, 8 Mar 2022 12:21:20 -0500 Subject: [PATCH 014/216] Add a note about Python GIL (#124) * Add a note about Python GIL * Review edits --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index 8954813b..4fa1a816 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,7 @@ any C++ code. 
- [Important Notes](#important-notes) - [Error Handling](#error-handling) - [Managing Shared Memory](#managing-shared-memory) + - [Multiple Model Instance Support](#multiple-model-instance-support) - [Business Logic Scripting](#business-logic-scripting) - [Using BLS with Stateful Models](#using-bls-with-stateful-models) - [Limitations](#limitations) @@ -549,6 +550,22 @@ properly set the `--shm-size` flag depending on the size of your inputs and outputs. The default value for docker run command is `64MB` which is very small. +## Multiple Model Instance Support + +Python interpreter uses a global lock known as +[GIL](https://docs.python.org/3/c-api/init.html#thread-state-and-the-global-interpreter-lock). +Because of GIL, it is not possible have multiple threads running in the same +Python interpreter simultaneously as each thread requires to acquire the GIL +when accessing Python objects which will serialize all the operations. In order +to work around this issue, Python backend spawns a separate process for each +[model instance](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#multiple-model-instances). +This is in contrast with how other Triton backends such as +[ONNXRuntime](https://github.com/triton-inference-server/onnxruntime_backend), +[TensorFlow](https://github.com/triton-inference-server/tensorflow_backend), and +[PyTorch](https://github.com/triton-inference-server/pytorch_backend) handle +multiple instances. Increasing the instance count for these backends will create +additional threads instead of spawning separate processes. + # Business Logic Scripting Triton's From 17ecaf6a262b90fe059afff8d67469823e1d3682 Mon Sep 17 00:00:00 2001 From: jbkyang-nvi <80359429+jbkyang-nvi@users.noreply.github.com> Date: Mon, 14 Mar 2022 11:04:07 -0700 Subject: [PATCH 015/216] Update Inferentia test run script (#125) Update Inferentia test run script --- .../qa/setup_test_enviroment_and_test.sh | 83 ++++++++++++++++++- inferentia/scripts/setup.sh | 4 +- 2 files changed, 83 insertions(+), 4 deletions(-) diff --git a/inferentia/qa/setup_test_enviroment_and_test.sh b/inferentia/qa/setup_test_enviroment_and_test.sh index 0999dd97..7d066157 100644 --- a/inferentia/qa/setup_test_enviroment_and_test.sh +++ b/inferentia/qa/setup_test_enviroment_and_test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -44,6 +44,64 @@ export TEST_JSON_REPO=/opt/tritonserver/qa/common/inferentia_perf_analyzer_input export TEST_REPO=/opt/tritonserver/qa/L0_inferentia_perf_analyzer export TEST_SCRIPT="test.sh" CONTAINER_NAME="qa_container" +CONTAINER_VERSION="" +UPSTREAM_CONTAINER_VERSION="" + + +USAGE=" +usage: setup_test_enviroment_and_test.sh [options]. These setting will override exported variables + +Setup enviroment for testing on Inferentia chips and run perf analyzer tests. +-h|--help Shows usage +-d|--default-repo-tag DEFAULT_REPO_TAG for building the test container. Default is main +-s|--server-repo-tag TRITON_SERVER_REPO_TAG for building test container. Default same DEFAULT_REPO_TAG +-c|--client-repo-tag TRITON_CLIENT_REPO_TAG for building test container. Default same DEFAULT_REPO_TAG +-v|--container-version Container version used in build.py. 
Default is container version used in build.py +-u|--upstream-container-version Upstream container version for test container. Default is container version used in build.py +-p|--triton-path The path where python backend is located and where server repo will be cloned to. Default is /home/ubuntu +" + +# Get all options: +OPTS=$(getopt -o hd:s:c:v:u:p: --long help,default-repo-tag:,server-repo-tag:,client-repo-tag:,container-version:,upstream-container-version:,triton-path -- "$@") + +for OPTS; do + case "$OPTS" in + -h|--help) + printf "%s\\n" "$USAGE" + return 0 + ;; + -d|--default-repo-tag) + export DEFAULT_REPO_TAG=$2 + echo "Default repo tag set to: ${DEFAULT_REPO_TAG}" + shift 2 + ;; + -s|--server-repo-tag) + export TRITON_SERVER_REPO_TAG=$2 + shift 2 + echo "Server repo tag set to: ${TRITON_SERVER_REPO_TAG}" + ;; + -c|--client-repo-tag) + export TRITON_CLIENT_REPO_TAG=$2 + echo "Client repo tag set to: ${TRITON_CLIENT_REPO_TAG}" + shift 2 + ;; + -v|--container-version) + export CONTAINER_VERSION=$2 + echo "Container version set to: ${CONTAINER_VERSION}" + shift 2 + ;; + -u|--upstream-container-version) + export UPSTREAM_CONTAINER_VERSION=$2 + echo "Upstream container version set to: ${UPSTREAM_CONTAINER_VERSION}" + shift 2 + ;; + -p|--triton-path) + export TRITON_PATH=$2 + echo "Triton path set to: ${TRITON_PATH}" + shift 2 + ;; + esac +done cd ${TRITON_PATH} echo "Using server repo tag: $TRITON_SERVER_REPO_TAG" @@ -60,11 +118,26 @@ cd ${TRITON_PATH}/python_backend chmod 777 ${TRITON_PATH}/python_backend/inferentia/scripts/setup-pre-container.sh sudo ${TRITON_PATH}/python_backend/inferentia/scripts/setup-pre-container.sh +# If container version is not known, look up container version and upstream container version from build.py +cd ${TRITON_PATH}/server +if [ "${CONTAINER_VERSION}" = "" ]; then + QUERY_STRING="import build; container_version,_= build.get_container_versions('$(cat TRITON_VERSION)', None, None); print(container_version)" + CONTAINER_VERSION=$(python3 -c "${QUERY_STRING}") + echo "found container version: ${CONTAINER_VERSION} from build.py" +fi +if [ "${UPSTREAM_CONTAINER_VERSION}" = "" ]; then + QUERY_STRING="import build; _,upstream_container_version = build.get_container_versions('$(cat TRITON_VERSION)', None, None); print(upstream_container_version)" + UPSTREAM_CONTAINER_VERSION=$(python3 -c "${QUERY_STRING}") + echo "found upstream container version: ${UPSTREAM_CONTAINER_VERSION} from build.py" +fi + # Build container with only python backend cd ${TRITON_PATH}/server pip3 install docker ./build.py --build-dir=/tmp/tritonbuild \ --cmake-dir=${TRITON_PATH}/server/build \ + --container-version=${CONTAINER_VERSION} \ + --upstream-container-version=${UPSTREAM_CONTAINER_VERSION} \ --enable-logging --enable-stats --enable-tracing \ --enable-metrics --enable-gpu-metrics --enable-gpu \ --filesystem=gcs --filesystem=azure_storage --filesystem=s3 \ @@ -82,7 +155,13 @@ docker tag tritonserver "${BASE_IMAGE}" # Build docker container for SDK docker build -t ${SDK_IMAGE} \ -f ${TRITON_PATH}/server/Dockerfile.sdk \ - --build-arg "TRITON_CLIENT_REPO_SUBDIR=clientrepo" . 
+ --build-arg "BASE_IMAGE=nvcr.io/nvidia/tritonserver:${UPSTREAM_CONTAINER_VERSION}-py3-min" \ + --build-arg "TRITON_CLIENT_REPO_SUBDIR=clientrepo" \ + --build-arg "TRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG}" \ + --build-arg "TRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG}" \ + --build-arg "TRITON_BACKEND_REPO_TAG=${TRITON_BACKEND_REPO_TAG}" \ + --build-arg "TRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG}" \ + --build-arg "NVIDIA_TRITON_SERVER_SDK_VERSION=${CONTAINER_VERSION}" . # Build QA container docker build -t ${QA_IMAGE} \ diff --git a/inferentia/scripts/setup.sh b/inferentia/scripts/setup.sh index 4cf4920a..2fa322ff 100644 --- a/inferentia/scripts/setup.sh +++ b/inferentia/scripts/setup.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,7 +26,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USAGE=" -usage: test_script.sh [options] +usage: setup.sh [options] Sets up python execution environment for AWS Neuron SDK for execution on Inferentia chips. -h|--help Shows usage From a0b22d9ab35945e8352e36a0f00a62779191e199 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Tue, 15 Mar 2022 17:28:37 -0400 Subject: [PATCH 016/216] Add buffer attributes to fix the cuda shared memory issue (#126) * Add buffer attributes to Python backend * Remove extra line --- src/pb_main_utils.cc | 2 + src/python.cc | 87 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 70 insertions(+), 19 deletions(-) diff --git a/src/pb_main_utils.cc b/src/pb_main_utils.cc index e9f371b3..dbbfc929 100644 --- a/src/pb_main_utils.cc +++ b/src/pb_main_utils.cc @@ -73,6 +73,8 @@ ResponseAlloc( int64_t* actual_memory_type_id) { SharedMemory* shm_pool = reinterpret_cast(userp); + *actual_memory_type = preferred_memory_type; + *actual_memory_type_id = preferred_memory_type_id; // If 'byte_size' is zero just return 'buffer' == nullptr, we don't // need to do any other book-keeping. 
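The allocator change above (propagating the preferred memory type) and the buffer-attribute handling in the `python.cc` hunks that follow are what make client-side CUDA shared memory usable with Python models. For context, a rough client-side sketch of the flow being fixed, assuming the `tritonclient` CUDA shared-memory utilities and a model named `python_model` with an FP32 `INPUT0` (these names, and the exact helper signatures, are assumptions rather than part of this patch):

```python
import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils.cuda_shared_memory as cudashm

client = httpclient.InferenceServerClient("localhost:8000")
data = np.ones([1, 16], dtype=np.float32)
byte_size = data.size * data.itemsize

# Create a CUDA shared-memory region on GPU 0, copy the input into it, and
# register it with the server. The Python backend then receives the data
# through the CUDA IPC handle carried in the input buffer attributes.
shm_handle = cudashm.create_shared_memory_region("input0_data", byte_size, 0)
cudashm.set_shared_memory_region(shm_handle, [data])
client.register_cuda_shared_memory(
    "input0_data", cudashm.get_raw_handle(shm_handle), 0, byte_size)

infer_input = httpclient.InferInput("INPUT0", list(data.shape), "FP32")
infer_input.set_shared_memory("input0_data", byte_size)
result = client.infer("python_model", [infer_input])

client.unregister_cuda_shared_memory("input0_data")
cudashm.destroy_shared_memory_region(shm_handle)
```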
diff --git a/src/python.cc b/src/python.cc index 12e13011..698e392b 100644 --- a/src/python.cc +++ b/src/python.cc @@ -990,29 +990,51 @@ ModelInstanceState::ProcessRequests( response_output, &buffer, output_tensor->ByteSize(), &actual_memory_type, &actual_memory_type_id)); + TRITONSERVER_BufferAttributes* output_buffer_attributes; + GUARDED_RESPOND_IF_ERROR( + responses, r, + TRITONBACKEND_OutputBufferAttributes( + response_output, &output_buffer_attributes)); + if (src_memory_type == TRITONSERVER_MEMORY_GPU && actual_memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU - cudaSetDevice(output_tensor->MemoryTypeId()); - cudaIpcMemHandle_t cuda_ipc_mem_handle; - cudaError_t err = cudaIpcGetMemHandle(&cuda_ipc_mem_handle, buffer); - output_tensor->SetCudaIpcMemHandle(&cuda_ipc_mem_handle); - output_tensor->SetMemoryType(actual_memory_type); - output_tensor->SetMemoryTypeId(actual_memory_type_id); - - if (err != cudaSuccess) { + + if ((*responses)[r] != nullptr) { + cudaIpcMemHandle_t* cuda_ipc_mem_handle_p; GUARDED_RESPOND_IF_ERROR( responses, r, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "failed to get cuda ipc handle: " + - std::string(cudaGetErrorString(err))) - .c_str())); - } else { + TRITONSERVER_BufferAttributesCudaIpcHandle( + output_buffer_attributes, + reinterpret_cast(&cuda_ipc_mem_handle_p))); + + cudaSetDevice(output_tensor->MemoryTypeId()); + output_tensor->SetMemoryType(actual_memory_type); + output_tensor->SetMemoryTypeId(actual_memory_type_id); output_tensor->SetDataPtr(buffer); - output_tensor->RawDataShm()->offset = - output_tensor->GetGPUPointerOffset(); + + if (cuda_ipc_mem_handle_p != nullptr) { + output_tensor->SetCudaIpcMemHandle(cuda_ipc_mem_handle_p); + output_tensor->RawDataShm()->offset = + output_tensor->GetGPUPointerOffset(); + } else { + cudaIpcMemHandle_t cuda_ipc_mem_handle; + cudaError_t err = cudaIpcGetMemHandle(&cuda_ipc_mem_handle, buffer); + output_tensor->SetCudaIpcMemHandle(&cuda_ipc_mem_handle); + if (err != cudaSuccess) { + GUARDED_RESPOND_IF_ERROR( + responses, r, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "failed to get cuda ipc handle: " + + std::string(cudaGetErrorString(err))) + .c_str())); + } else { + output_tensor->RawDataShm()->offset = + output_tensor->GetGPUPointerOffset(); + } + } } #endif } @@ -1556,8 +1578,35 @@ ModelInstanceState::GetInputTensor( input_dtype, src_memory_type, src_memory_type_id, const_cast(buffer), input_byte_size, nullptr /* DLManagedTensor */); - RETURN_IF_EXCEPTION(input_tensor->SaveToSharedMemory( - shm_pool_, input_tensor_shm, true /* copy_cpu */, true /* copy_gpu */)); + + // If the tensor is using the cuda shared memory, we need to extract the + // handle that was used to create the device pointer. This is because of a + // limitation in the legacy CUDA IPC API that doesn't allow getting the + // handle of an exported pointer. If the cuda handle exists, it indicates + // that the cuda shared memory was used and the input is in a single buffer. + // [FIXME] for the case where the input is in cuda shared memory and uses + // multiple input buffers this needs to be changed. + TRITONSERVER_BufferAttributes* buffer_attributes; + + // This value is not used. 
+ const void* buffer_p; + RETURN_IF_ERROR(TRITONBACKEND_InputBufferAttributes( + in, 0, &buffer_p, &buffer_attributes)); + + cudaIpcMemHandle_t* cuda_ipc_handle; + RETURN_IF_ERROR(TRITONSERVER_BufferAttributesCudaIpcHandle( + buffer_attributes, reinterpret_cast(&cuda_ipc_handle))); + if (cuda_ipc_handle != nullptr) { + RETURN_IF_EXCEPTION(input_tensor->SaveToSharedMemory( + shm_pool_, input_tensor_shm, true /* copy_cpu */, + false /* copy_gpu */)); + RETURN_IF_EXCEPTION(input_tensor->SetCudaIpcMemHandle(cuda_ipc_handle)); + input_tensor->RawDataShm()->offset = input_tensor->GetGPUPointerOffset(); + } else { + RETURN_IF_EXCEPTION(input_tensor->SaveToSharedMemory( + shm_pool_, input_tensor_shm, true /* copy_cpu */, + true /* copy_gpu */)); + } #else return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, From c9f79ce25491a66075a563c655a9cc3285069cfb Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Wed, 23 Mar 2022 15:25:03 -0400 Subject: [PATCH 017/216] Add new shared memory manager (#128) Move message queue to the new interface Add IPCMessage Refactor pb_env Add shared memory backend string Add Map data structure Add compiling version of initilization using new interface Fix ups Add PbMemory Move pb_tensor to the new interface Move InferRequest class to the new interface Add inference resposne object Add request execution to both of the processes Fix execute using the new interface Add semi-reference counting approach for lifecycle management Fix some lifecycle issues Fix shm growth test Improve the shm growth Fix copyrights and offset renaming Clean up Add NVTX markers and slightly improve the perf Reduce the number of shared memory deallocations Reduce the number of deallocations for InferenceRequest Require fewer shm objects for infer_response and pb_memory Require fewer objects for request_batch response batch Merge ownership of pb_memory and pb_tensor Review edits Move BLS to use the new interface Fix the BLS response release Add GPU tensor support for output tensors Add scoped defer Fix BLS input GPU support Fix error handling Fix bls async Bug fixes Fix restart test Fix CPU only build Add buffer attributes changes Review edits --- CMakeLists.txt | 22 +- src/infer_request.cc | 309 ++- src/infer_request.h | 80 +- src/infer_response.cc | 117 +- src/infer_response.h | 42 +- src/ipc_message.cc | 109 +- src/ipc_message.h | 96 +- src/message_queue.cc | 191 +- src/message_queue.h | 104 +- src/pb_env.cc | 123 +- src/pb_env.h | 6 +- src/pb_error.cc | 31 +- src/pb_error.h | 18 +- src/pb_exception.h | 46 + src/pb_map.cc | 110 + src/pb_map.h | 69 + src/pb_memory.cc | 388 ++++ src/pb_memory.h | 155 ++ src/pb_string.cc | 126 ++ src/pb_string.h | 80 + src/pb_stub.cc | 603 +++--- src/pb_stub.h | 103 +- src/pb_tensor.cc | 498 ++--- src/pb_tensor.h | 213 +- src/pb_utils.cc | 377 +--- src/pb_utils.h | 197 +- src/python.cc | 1764 +++++++++-------- src/{pb_main_utils.cc => request_executor.cc} | 46 +- src/{pb_main_utils.h => request_executor.h} | 12 +- src/scoped_defer.cc | 52 + src/scoped_defer.h | 42 + src/shm_manager.cc | 187 +- src/shm_manager.h | 192 +- 33 files changed, 3842 insertions(+), 2666 deletions(-) create mode 100644 src/pb_exception.h create mode 100644 src/pb_map.cc create mode 100644 src/pb_map.h create mode 100644 src/pb_memory.cc create mode 100644 src/pb_memory.h create mode 100644 src/pb_string.cc create mode 100644 src/pb_string.h rename src/{pb_main_utils.cc => request_executor.cc} (88%) rename src/{pb_main_utils.h => request_executor.h} (86%) create mode 100644 
src/scoped_defer.cc create mode 100644 src/scoped_defer.h diff --git a/CMakeLists.txt b/CMakeLists.txt index e9794bfa..157f27bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,7 @@ project(tritonpythonbackend LANGUAGES C CXX) # option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON) option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) +option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF) set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo") set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") @@ -115,6 +116,10 @@ elseif() message(WARNING "TRITON_ENABLE_GPU is OFF, GPU Tensor support will be disabled") endif() # TRITON_ENABLE_GPU +if(${TRITON_ENABLE_NVTX}) + add_definitions(-DTRITON_ENABLE_NVTX=1) +endif() # TRITON_ENABLE_NVTX + find_package(ZLIB REQUIRED) find_package(Threads REQUIRED) @@ -131,14 +136,23 @@ set( src/message_queue.h src/ipc_message.cc src/ipc_message.h + src/pb_string.cc + src/pb_string.h + src/pb_map.cc + src/pb_map.h + src/scoped_defer.cc + src/scoped_defer.h src/pb_error.cc src/pb_error.h + src/pb_memory.cc + src/pb_memory.h src/pb_tensor.cc src/pb_tensor.h src/pb_utils.cc src/pb_utils.h src/shm_manager.cc src/shm_manager.h + src/pb_exception.h ) set( @@ -146,10 +160,10 @@ set( src/python.cc src/pb_env.cc src/pb_env.h - src/pb_main_utils.cc - src/pb_main_utils.h src/pb_metric_reporter.cc src/pb_metric_reporter.h + src/request_executor.cc + src/request_executor.h ) list(APPEND @@ -164,10 +178,10 @@ add_library( set( PYTHNON_BACKEND_STUB_SRCS - src/pb_stub_utils.cc src/pb_stub_utils.h - src/pb_stub.cc + src/pb_stub_utils.cc src/pb_stub.h + src/pb_stub.cc ) list(APPEND diff --git a/src/infer_request.cc b/src/infer_request.cc index 8a011f94..7db7f5bf 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -1,4 +1,4 @@ -// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -25,11 +25,11 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
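The refactored classes below no longer exchange raw byte offsets; each object serializes itself into a buffer obtained from the shared memory pool, and only a `managed_external_buffer` handle is passed between the two processes, with matching `SaveToSharedMemory`/`LoadFromSharedMemory` pairs on each side. As a purely conceptual illustration of that handle-passing pattern (a toy Python sketch, not the backend's C++ implementation; the 4-byte length prefix and pickle encoding are inventions of the sketch):

```python
from multiprocessing import shared_memory
import pickle


def save_to_shm(pool, obj, offset):
    # Serialize the object into the pool and return a small "handle"
    # (here just the byte offset) that another process can use to find it.
    blob = pickle.dumps(obj)
    pool.buf[offset:offset + 4] = len(blob).to_bytes(4, "little")
    pool.buf[offset + 4:offset + 4 + len(blob)] = blob
    return offset


def load_from_shm(pool, handle):
    # Reconstruct the object from the handle alone.
    size = int.from_bytes(pool.buf[handle:handle + 4], "little")
    return pickle.loads(bytes(pool.buf[handle + 4:handle + 4 + size]))


if __name__ == "__main__":
    pool = shared_memory.SharedMemory(create=True, size=1 << 20)
    handle = save_to_shm(pool, {"correlation_id": 4, "flags": 1}, offset=0)
    print(load_from_shm(pool, handle))  # {'correlation_id': 4, 'flags': 1}
    pool.close()
    pool.unlink()
```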
#include "infer_request.h" -#include +#include #include "pb_utils.h" +#include "scoped_defer.h" #ifdef TRITON_PB_STUB -#include "infer_response.h" #include "pb_stub.h" #endif @@ -95,77 +95,187 @@ InferRequest::SetFlags(uint32_t flags) flags_ = flags; } +bi::managed_external_buffer::handle_t +InferRequest::ShmHandle() +{ + return shm_handle_; +} + void -InferRequest::SaveToSharedMemory( - std::unique_ptr& shm_pool, Request* request_shm) +InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) { - request_shm->correlation_id = this->CorrelationId(); - off_t id_offset; - SaveStringToSharedMemory(shm_pool, id_offset, this->RequestId().c_str()); - request_shm->id = id_offset; - request_shm->requested_output_count = this->RequestedOutputNames().size(); - off_t requested_output_names_offset; - off_t* requested_output_names; - shm_pool->Map( - (char**)&requested_output_names, - sizeof(off_t) * request_shm->requested_output_count, - requested_output_names_offset); - request_shm->flags = Flags(); - - request_shm->requested_output_names = requested_output_names_offset; + AllocatedSharedMemory infer_request_shm = shm_pool->Construct( + sizeof(InferRequestShm) + + (RequestedOutputNames().size() * + sizeof(bi::managed_external_buffer::handle_t)) + + (Inputs().size() * sizeof(bi::managed_external_buffer::handle_t)) + + PbString::ShmStructSize(ModelName()) + + PbString::ShmStructSize(RequestId())); + + infer_request_shm_ptr_ = + reinterpret_cast(infer_request_shm.data_.get()); + infer_request_shm_ptr_->correlation_id = CorrelationId(); + infer_request_shm_ptr_->input_count = Inputs().size(); + infer_request_shm_ptr_->model_version = model_version_; + infer_request_shm_ptr_->requested_output_count = + RequestedOutputNames().size(); + infer_request_shm_ptr_->flags = Flags(); + + output_names_handle_shm_ptr_ = + reinterpret_cast( + reinterpret_cast(infer_request_shm_ptr_) + + sizeof(InferRequestShm)); + + // [FIXME] This could also be a part of the single allocated memory for this + // object. 
size_t i = 0; + std::vector> requested_output_names_shm; for (auto& requested_output_name : requested_output_names_) { - SaveStringToSharedMemory( - shm_pool, requested_output_names[i], requested_output_name.c_str()); + std::unique_ptr requested_output_name_shm = + PbString::Create(shm_pool, requested_output_name); + output_names_handle_shm_ptr_[i] = requested_output_name_shm->ShmHandle(); + requested_output_names_shm.emplace_back( + std::move(requested_output_name_shm)); i++; } - request_shm->requested_input_count = this->Inputs().size(); - request_shm->model_version = this->model_version_; - SaveStringToSharedMemory( - shm_pool, request_shm->model_name, this->model_name_.c_str()); + input_tensors_handle_ptr_ = + reinterpret_cast( + reinterpret_cast(output_names_handle_shm_ptr_) + + sizeof(bi::managed_external_buffer::handle_t) * + RequestedOutputNames().size()); + i = 0; + for (auto& input : Inputs()) { + input_tensors_handle_ptr_[i] = input->ShmHandle(); + i++; + } + + size_t model_name_offset = + sizeof(InferRequestShm) + + (RequestedOutputNames().size() * + sizeof(bi::managed_external_buffer::handle_t)) + + (Inputs().size() * sizeof(bi::managed_external_buffer::handle_t)); + + std::unique_ptr model_name_shm = PbString::Create( + ModelName(), + reinterpret_cast(infer_request_shm_ptr_) + model_name_offset, + infer_request_shm.handle_ + model_name_offset); + + size_t request_id_offset = + model_name_offset + PbString::ShmStructSize(ModelName()); + std::unique_ptr request_id_shm = PbString::Create( + RequestId(), + reinterpret_cast(infer_request_shm_ptr_) + request_id_offset, + infer_request_shm.handle_ + request_id_offset); + + // Save the references to shared memory. + infer_request_shm_ = std::move(infer_request_shm); + request_id_shm_ = std::move(request_id_shm); + model_name_shm_ = std::move(model_name_shm); + shm_handle_ = infer_request_shm_.handle_; + requested_output_names_shm_ = std::move(requested_output_names_shm); } std::unique_ptr InferRequest::LoadFromSharedMemory( - std::unique_ptr& shm_pool, off_t request_offset, - std::shared_ptr& cuda_ipc_open_mutex, - std::shared_ptr& cuda_ipc_close_mutex) + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t request_handle, bool open_cuda_handle) { - Request* request; - shm_pool->MapOffset((char**)&request, request_offset); + AllocatedSharedMemory infer_request_shm = + shm_pool->Load(request_handle); + InferRequestShm* infer_request_shm_ptr = + reinterpret_cast(infer_request_shm.data_.get()); + + std::vector> requested_output_names_shm; + uint32_t requested_output_count = + infer_request_shm_ptr->requested_output_count; - char* id = nullptr; - LoadStringFromSharedMemory(shm_pool, request->id, id); + bi::managed_external_buffer::handle_t* output_names_handle_shm_ptr = + reinterpret_cast( + (reinterpret_cast(infer_request_shm_ptr) + + sizeof(InferRequestShm))); - uint32_t requested_input_count = request->requested_input_count; + for (size_t output_idx = 0; output_idx < requested_output_count; + ++output_idx) { + std::unique_ptr pb_string = PbString::LoadFromSharedMemory( + shm_pool, output_names_handle_shm_ptr[output_idx]); + requested_output_names_shm.emplace_back(std::move(pb_string)); + } - std::vector> py_input_tensors; - for (size_t input_idx = 0; input_idx < requested_input_count; ++input_idx) { - std::shared_ptr pb_input_tensor = PbTensor::LoadFromSharedMemory( - shm_pool, request->inputs + sizeof(Tensor) * input_idx, - cuda_ipc_open_mutex, cuda_ipc_close_mutex); - 
py_input_tensors.emplace_back(std::move(pb_input_tensor)); + bi::managed_external_buffer::handle_t* input_names_handle_shm_ptr = + reinterpret_cast( + (reinterpret_cast(infer_request_shm_ptr) + + sizeof(InferRequestShm) + + (infer_request_shm_ptr->requested_output_count * + sizeof(bi::managed_external_buffer::handle_t)))); + + std::vector> input_tensors; + for (size_t input_idx = 0; input_idx < infer_request_shm_ptr->input_count; + ++input_idx) { + std::shared_ptr input_tensor = PbTensor::LoadFromSharedMemory( + shm_pool, input_names_handle_shm_ptr[input_idx], open_cuda_handle); + input_tensors.emplace_back(std::move(input_tensor)); } - std::vector requested_output_names; - uint32_t requested_output_count = request->requested_output_count; - off_t* output_names; - shm_pool->MapOffset((char**)&output_names, request->requested_output_names); + size_t model_name_offset = + sizeof(InferRequestShm) + + (requested_output_count * sizeof(bi::managed_external_buffer::handle_t)) + + (infer_request_shm_ptr->input_count * + sizeof(bi::managed_external_buffer::handle_t)); - for (size_t output_idx = 0; output_idx < requested_output_count; + std::unique_ptr model_name_shm = PbString::LoadFromSharedMemory( + request_handle + model_name_offset, + reinterpret_cast(infer_request_shm_ptr) + model_name_offset); + + size_t request_id_offset = model_name_offset + model_name_shm->Size(); + std::unique_ptr request_id_shm = PbString::LoadFromSharedMemory( + request_handle + request_id_offset, + reinterpret_cast(infer_request_shm_ptr) + request_id_offset); + + return std::unique_ptr(new InferRequest( + infer_request_shm, request_id_shm, requested_output_names_shm, + model_name_shm, input_tensors)); +} + +InferRequest::InferRequest( + AllocatedSharedMemory& infer_request_shm, + std::unique_ptr& request_id_shm, + std::vector>& requested_output_names_shm, + std::unique_ptr& model_name_shm, + std::vector>& input_tensors) + : infer_request_shm_(std::move(infer_request_shm)), + request_id_shm_(std::move(request_id_shm)), + requested_output_names_shm_(std::move(requested_output_names_shm)), + model_name_shm_(std::move(model_name_shm)) +{ + infer_request_shm_ptr_ = + reinterpret_cast(infer_request_shm_.data_.get()); + output_names_handle_shm_ptr_ = + reinterpret_cast( + reinterpret_cast(infer_request_shm_ptr_) + + sizeof(InferRequestShm)); + input_tensors_handle_ptr_ = + reinterpret_cast( + reinterpret_cast(infer_request_shm_ptr_) + + sizeof(InferRequestShm) + + sizeof(bi::managed_external_buffer::handle_t) * + infer_request_shm_ptr_->requested_output_count); + inputs_ = std::move(input_tensors); + + std::vector requested_output_names; + for (size_t output_idx = 0; + output_idx < infer_request_shm_ptr_->requested_output_count; ++output_idx) { - char* output_name = nullptr; - LoadStringFromSharedMemory(shm_pool, output_names[output_idx], output_name); - requested_output_names.emplace_back(output_name); + auto& pb_string = requested_output_names_shm_[output_idx]; + requested_output_names.emplace_back(pb_string->String()); } - char* model_name; - LoadStringFromSharedMemory(shm_pool, request->model_name, model_name); - return std::make_unique( - id, request->correlation_id, std::move(py_input_tensors), - requested_output_names, model_name, request->model_version, - request->flags); + request_id_ = request_id_shm_->String(); + requested_output_names_ = std::move(requested_output_names); + model_name_ = model_name_shm_->String(); + flags_ = infer_request_shm_ptr_->flags; + model_version_ = infer_request_shm_ptr_->model_version; + 
correlation_id_ = infer_request_shm_ptr_->correlation_id; } #ifdef TRITON_PB_STUB @@ -175,44 +285,57 @@ InferRequest::Exec() ResponseBatch* response_batch = nullptr; bool responses_is_set = false; std::unique_ptr& stub = Stub::GetOrCreateInstance(); - std::unique_ptr& shm_pool = stub->GetSharedMemory(); + std::unique_ptr& shm_pool = stub->SharedMemory(); + bi::managed_external_buffer::handle_t* response_handle = nullptr; + + PythonBackendException pb_exception(std::string{}); + std::unique_ptr ipc_message; + + AllocatedSharedMemory request_batch; + ScopedDefer data_load_complete(std::bind([&ipc_message] { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + ipc_message->ResponseCondition()->notify_all(); + })); try { py::gil_scoped_release release; - std::unique_ptr ipc_message = - std::make_unique(shm_pool, true /* inline_response */); + ipc_message = IPCMessage::Create(shm_pool, true /* inline_response */); bool has_exception = false; PythonBackendException pb_exception(std::string{}); ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_InferExecRequest; - RequestBatch* request_batch; - shm_pool->Map( - (char**)&request_batch, sizeof(RequestBatch), ipc_message->Args()); - request_batch->batch_size = 1; + request_batch = shm_pool->Construct( + sizeof(RequestBatch) + sizeof(bi::managed_external_buffer::handle_t)); - Request* request; - shm_pool->Map((char**)&request, sizeof(Request), request_batch->requests); + RequestBatch* request_batch_shm_ptr = + reinterpret_cast(request_batch.data_.get()); + request_batch_shm_ptr->batch_size = 1; + ipc_message->Args() = request_batch.handle_; - request->requested_input_count = this->Inputs().size(); - Tensor* tensors; - bool has_gpu_tensor = false; - shm_pool->Map( - (char**)&tensors, sizeof(Tensor) * request->requested_input_count, - request->inputs); + bi::managed_external_buffer::handle_t* requests_shm = + reinterpret_cast( + request_batch.data_.get() + sizeof(RequestBatch)); + request_batch_shm_ptr->batch_size = 1; + bool has_gpu_tensor = false; size_t i = 0; for (auto& input_tensor : inputs_) { - input_tensor->SaveToSharedMemory( - shm_pool, &tensors[i], true /* copy_cpu */, false /* copy_gpu */); + input_tensor->SaveToSharedMemory(shm_pool, false /* copy_gpu */); if (!input_tensor->IsCPU()) { has_gpu_tensor = true; } ++i; } - SaveToSharedMemory(shm_pool, request); + SaveToSharedMemory(shm_pool); + + // Save the shared memory offset of the request. + *requests_shm = ShmHandle(); + + // Send the BLS request to the parent process and wait for the response. 
{ bi::scoped_lock lock{ *(ipc_message->ResponseMutex())}; @@ -220,21 +343,31 @@ InferRequest::Exec() ipc_message->ResponseCondition()->wait(lock); } + // Additional round trip required for asking the stub process + // to fill in the GPU tensor buffers if (has_gpu_tensor) { + AllocatedSharedMemory + gpu_buffers_handle = + shm_pool->Load( + request_batch_shm_ptr->gpu_buffers_handle); try { +#ifdef TRITON_ENABLE_GPU + size_t i = 0; for (auto& input_tensor : this->Inputs()) { if (!input_tensor->IsCPU()) { -#ifdef TRITON_ENABLE_GPU - input_tensor->SetCudaIpcMutexes( - stub->CudaIpcOpenMutex(), stub->CudaIpcCloseMutex()); - input_tensor->LoadGPUData(shm_pool); -#endif // TRITON_ENABLE_GPU + std::unique_ptr dst_buffer = + PbMemory::LoadFromSharedMemory( + shm_pool, (gpu_buffers_handle.data_.get())[i], + true /* open cuda handle */); + PbMemory::CopyBuffer(dst_buffer, input_tensor->Memory()); + ++i; } } +#endif // TRITON_ENABLE_GPU } catch (const PythonBackendException& exception) { // We need to catch the exception here. Otherwise, we will not notify - // the main process and it will wait for the resposne forever. + // the main process and it will wait for the response forever. pb_exception = exception; has_exception = true; } @@ -255,17 +388,23 @@ InferRequest::Exec() // Get the response for the current message. std::unique_ptr bls_response = IPCMessage::LoadFromSharedMemory( - shm_pool, ipc_message->RequestOffset()); - shm_pool->MapOffset((char**)&response_batch, bls_response->Args()); - responses_is_set = true; + shm_pool, ipc_message->ResponseHandle()); + + AllocatedSharedMemory response_batch_shm = + shm_pool->Load(bls_response->Args()); + response_batch = + reinterpret_cast(response_batch_shm.data_.get()); + response_handle = reinterpret_cast( + response_batch_shm.data_.get() + sizeof(ResponseBatch)); + responses_is_set = true; if (response_batch->has_error) { if (response_batch->is_error_set) { - char* err_string; - LoadStringFromSharedMemory(shm_pool, response_batch->error, err_string); + std::unique_ptr pb_string = + PbString::LoadFromSharedMemory(shm_pool, response_batch->error); return std::make_unique( std::vector>{}, - std::make_shared(err_string)); + std::make_shared(pb_string->String())); } else { return std::make_unique( std::vector>{}, @@ -283,8 +422,7 @@ InferRequest::Exec() if (responses_is_set) { std::unique_ptr infer_response = InferResponse::LoadFromSharedMemory( - shm_pool, response_batch->responses, stub->CudaIpcOpenMutex(), - stub->CudaIpcCloseMutex()); + shm_pool, *response_handle, true /* open cuda handle */); return infer_response; } else { @@ -296,4 +434,5 @@ InferRequest::Exec() } #endif + }}} // namespace triton::backend::python diff --git a/src/infer_request.h b/src/infer_request.h index 233a77d4..3d773a7f 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -1,4 +1,4 @@ -// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -31,15 +31,19 @@ #include "pb_tensor.h" namespace triton { namespace backend { namespace python { -class InferRequest { - std::string request_id_; - uint64_t correlation_id_; - std::vector> inputs_; - std::vector requested_output_names_; - std::string model_name_; - int64_t model_version_; - uint32_t flags_; +// +// Inference Request +// +struct InferRequestShm { + uint64_t correlation_id; + uint32_t input_count; + uint32_t requested_output_count; + int64_t model_version; + uint32_t flags; +}; + +class InferRequest { public: InferRequest( const std::string& request_id, uint64_t correlation_id, @@ -56,23 +60,59 @@ class InferRequest { uint32_t Flags(); void SetFlags(uint32_t flags); const std::vector& RequestedOutputNames(); + bi::managed_external_buffer::handle_t ShmHandle(); + +#ifdef TRITON_PB_STUB + std::unique_ptr Exec(); +#endif /// Save an Inference Request to shared memory. /// \param shm_pool Shared memory pool to save the inference request. - /// \param request_shm A pointer to a location in shared memory with enough - /// space to save the inference request. - void SaveToSharedMemory( - std::unique_ptr& shm_pool, Request* request_shm); + void SaveToSharedMemory(std::unique_ptr& shm_pool); /// Create an Inference Request object from shared memory. /// \param shm_pool Shared memory pool - /// \param request_offset Shared memory offset of the request. + /// \param request_handle Shared memory handle of the request. + /// \param open_cuda_handle Determines if the tensor in the infer request + /// object is a GPU tensor, to call the cudaIpcOpenMemHandle to obtain the + /// tensor or not. + /// \return Returns the infer request in the specified request_handle + /// location. static std::unique_ptr LoadFromSharedMemory( - std::unique_ptr& shm_pool, off_t request_offset, - std::shared_ptr& cuda_ipc_open_mutex, - std::shared_ptr& cuda_ipc_close_mutex); -#ifdef TRITON_PB_STUB - std::unique_ptr Exec(); -#endif + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t request_handle, + bool open_cuda_handle); + + /// Disallow copying the inference request object. + DISALLOW_COPY_AND_ASSIGN(InferRequest); + + ~InferRequest() {} + + private: + InferRequest( + AllocatedSharedMemory& infer_request_shm, + std::unique_ptr& request_id_shm, + std::vector>& requested_output_names_shm, + std::unique_ptr& model_name_shm, + std::vector>& input_tensors); + + std::string request_id_; + uint64_t correlation_id_; + std::vector> inputs_; + std::vector requested_output_names_; + std::string model_name_; + int64_t model_version_; + uint32_t flags_; + + // Shared Memory Data Structures + AllocatedSharedMemory infer_request_shm_; + InferRequestShm* infer_request_shm_ptr_; + + std::unique_ptr request_id_shm_; + std::vector> requested_output_names_shm_; + std::unique_ptr model_name_shm_; + bi::managed_external_buffer::handle_t* output_names_handle_shm_ptr_; + bi::managed_external_buffer::handle_t* input_tensors_handle_ptr_; + bi::managed_external_buffer::handle_t shm_handle_; }; }}}; // namespace triton::backend::python diff --git a/src/infer_response.cc b/src/infer_response.cc index 56b2c366..c8f5543d 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -1,4 +1,4 @@ -// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -52,87 +52,100 @@ InferResponse::HasError() return error_.get() != nullptr; } -bool -InferResponse::IsErrorMessageSet() -{ - return is_message_set_; -} - void InferResponse::SaveToSharedMemory( - std::unique_ptr& shm_pool, Response* response_shm, - bool copy_cpu, bool copy_gpu) + std::unique_ptr& shm_pool, bool copy_gpu) { size_t output_tensor_length = output_tensors_.size(); - response_shm->has_error = false; - response_shm->is_error_set = false; + if (HasError()) { + response_shm_ = shm_pool->Construct(sizeof(ResponseShm)); + } else { + response_shm_ = shm_pool->Construct( + sizeof(ResponseShm) + + output_tensor_length * sizeof(bi::managed_external_buffer::handle_t)); + } + + ResponseShm* response_shm_ptr = + reinterpret_cast(response_shm_.data_.get()); + response_shm_ptr->has_error = false; + response_shm_ptr->is_error_set = false; + shm_handle_ = response_shm_.handle_; // Only save the output tensors to shared memory when the inference response // doesn't have error. - if (this->HasError()) { - response_shm->has_error = true; - off_t error_offset; - SaveStringToSharedMemory( - shm_pool, error_offset, this->Error()->Message().c_str()); - response_shm->is_error_set = true; - response_shm->error = error_offset; - response_shm->outputs_size = 0; + if (HasError()) { + response_shm_ptr->has_error = true; + Error()->SaveToSharedMemory(shm_pool); + + response_shm_ptr->is_error_set = true; + response_shm_ptr->error = Error()->ShmHandle(); + response_shm_ptr->outputs_size = 0; } else { - Tensor* output_tensors_shm; - off_t output_tensors_offset; - shm_pool->Map( - (char**)&output_tensors_shm, sizeof(Tensor) * output_tensor_length, - output_tensors_offset); - response_shm->outputs = output_tensors_offset; - response_shm->outputs_size = output_tensor_length; + bi::managed_external_buffer::handle_t* tensor_handle_shm_ptr = + reinterpret_cast( + response_shm_.data_.get() + sizeof(ResponseShm)); + response_shm_ptr->outputs_size = output_tensor_length; size_t j = 0; for (auto& output_tensor : output_tensors_) { - Tensor* output_tensor_shm = &output_tensors_shm[j]; - output_tensor->SaveToSharedMemory( - shm_pool, output_tensor_shm, copy_cpu, copy_gpu); + output_tensor->SaveToSharedMemory(shm_pool, copy_gpu); + tensor_handle_shm_ptr[j] = output_tensor->ShmHandle(); j++; } } } +bi::managed_external_buffer::handle_t +InferResponse::ShmHandle() +{ + return shm_handle_; +} + std::unique_ptr InferResponse::LoadFromSharedMemory( - std::unique_ptr& shm_pool, off_t response_offset, - std::shared_ptr& cuda_ipc_open_mutex, - std::shared_ptr& cuda_ipc_close_mutex) + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t response_handle, + bool open_cuda_handle) { - Response* response; - shm_pool->MapOffset((char**)&response, response_offset); - uint32_t requested_output_count = response->outputs_size; + AllocatedSharedMemory response_shm = + shm_pool->Load(response_handle); + ResponseShm* response_shm_ptr = + reinterpret_cast(response_shm.data_.get()); + uint32_t requested_output_count = response_shm_ptr->outputs_size; std::shared_ptr pb_error; - std::vector> py_output_tensors; + std::vector> output_tensors; // If the error field is set, do not load output tensors from shared memory. 
- if (response->has_error) { - pb_error = std::make_shared(""); - - char* error_string; - if (response->is_error_set) { - LoadStringFromSharedMemory(shm_pool, response->error, error_string); - pb_error = std::make_shared(error_string); - } + if (response_shm_ptr->has_error && response_shm_ptr->is_error_set) { + pb_error = PbError::LoadFromSharedMemory(shm_pool, response_shm_ptr->error); + } else if (response_shm_ptr->has_error && !response_shm_ptr->is_error_set) { + pb_error = + std::make_shared("Failed to retrieve the response error."); } else { + bi::managed_external_buffer::handle_t* tensor_handle_shm = + reinterpret_cast( + response_shm.data_.get() + sizeof(ResponseShm)); for (size_t idx = 0; idx < requested_output_count; ++idx) { std::shared_ptr pb_tensor = PbTensor::LoadFromSharedMemory( - shm_pool, response->outputs + sizeof(Tensor) * idx, - cuda_ipc_open_mutex, cuda_ipc_close_mutex); - py_output_tensors.emplace_back(std::move(pb_tensor)); + shm_pool, tensor_handle_shm[idx], open_cuda_handle); + output_tensors.emplace_back(std::move(pb_tensor)); } } - std::unique_ptr infer_response = - std::make_unique(py_output_tensors, pb_error); - if (response->is_error_set) - infer_response->is_message_set_ = true; + return std::unique_ptr( + new InferResponse(response_shm, output_tensors, pb_error)); +} - return infer_response; +InferResponse::InferResponse( + AllocatedSharedMemory& response_shm, + std::vector>& output_tensors, + std::shared_ptr& pb_error) +{ + response_shm_ = std::move(response_shm); + output_tensors_ = std::move(output_tensors); + error_ = std::move(pb_error); + shm_handle_ = response_shm_.handle_; } std::shared_ptr& diff --git a/src/infer_response.h b/src/infer_response.h index eb1eccac..30a0c6e5 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -1,4 +1,4 @@ -// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -31,30 +31,42 @@ #include "pb_utils.h" namespace triton { namespace backend { namespace python { -class InferResponse { - std::vector> output_tensors_; - std::shared_ptr error_; - bool is_message_set_ = false; +struct ResponseShm { + uint32_t outputs_size; + bi::managed_external_buffer::handle_t error; + bool has_error; + // Indicates whether this error has a message or not. + bool is_error_set; +}; + +class InferResponse { public: InferResponse( const std::vector>& output_tensors, std::shared_ptr error = nullptr); - InferResponse(const std::vector>& output_tensors); - bool IsErrorMessageSet(); std::vector>& OutputTensors(); void SaveToSharedMemory( - std::unique_ptr& shm_pool, Response* response_shm, - bool copy_cpu, bool copy_gpu); + std::unique_ptr& shm_pool, bool copy_gpu = true); static std::unique_ptr LoadFromSharedMemory( - std::unique_ptr& shm_pool, off_t response_offset, - std::shared_ptr& cuda_ipc_open_mutex, - std::shared_ptr& cuda_ipc_close_mutex); + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t response_handle, + bool open_cuda_handle); bool HasError(); std::shared_ptr& Error(); + bi::managed_external_buffer::handle_t ShmHandle(); + + // Disallow copying the inference response object. + DISALLOW_COPY_AND_ASSIGN(InferResponse); - // Copying inference response objects is not allowed. 
- InferResponse(const InferResponse& other) = delete; - InferResponse& operator=(const InferResponse& other) = delete; + private: + InferResponse( + AllocatedSharedMemory& response_shm, + std::vector>& output_tensors, + std::shared_ptr& pb_error); + std::vector> output_tensors_; + std::shared_ptr error_; + bi::managed_external_buffer::handle_t shm_handle_; + AllocatedSharedMemory response_shm_; }; }}} // namespace triton::backend::python diff --git a/src/ipc_message.cc b/src/ipc_message.cc index a2f4c152..a81e3a2e 100644 --- a/src/ipc_message.cc +++ b/src/ipc_message.cc @@ -1,4 +1,4 @@ -// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -29,61 +29,124 @@ #include namespace triton { namespace backend { namespace python { +std::unique_ptr +IPCMessage::Create( + const std::unique_ptr& shm_pool, bool inline_response) +{ + AllocatedSharedMemory ipc_message_shm = + shm_pool->Construct(); + + ipc_message_shm.data_->inline_response = inline_response; + AllocatedSharedMemory response_mutex_shm; + AllocatedSharedMemory response_cond_shm; + if (inline_response) { + response_mutex_shm = std::move(shm_pool->Construct( + 1 /* count */, true /* aligned */)); + response_cond_shm = + std::move(shm_pool->Construct( + 1 /* count */, true /* aligned */)); + + ipc_message_shm.data_->response_mutex = response_mutex_shm.handle_; + ipc_message_shm.data_->response_cond = response_cond_shm.handle_; + new (response_mutex_shm.data_.get()) bi::interprocess_mutex{}; + new (response_cond_shm.data_.get()) bi::interprocess_condition{}; + } + + return std::unique_ptr( + new IPCMessage(ipc_message_shm, response_mutex_shm, response_cond_shm)); +} std::unique_ptr IPCMessage::LoadFromSharedMemory( - std::unique_ptr& shm_pool, off_t message_offset) + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t message_handle) { - std::unique_ptr ipc_message = std::make_unique(); - ipc_message->shm_offset_ = message_offset; - shm_pool->MapOffset((char**)&ipc_message->ipc_message_shm_, message_offset); - - if (ipc_message->ipc_message_shm_->inline_response) { - shm_pool->MapOffset( - (char**)&ipc_message->response_mutex_, - ipc_message->ipc_message_shm_->response_mutex); - shm_pool->MapOffset( - (char**)&ipc_message->response_cond_, - ipc_message->ipc_message_shm_->response_cond); + AllocatedSharedMemory ipc_message_shm = + shm_pool->Load(message_handle); + + AllocatedSharedMemory response_mutex_shm; + AllocatedSharedMemory response_cond_shm; + if (ipc_message_shm.data_->inline_response) { + response_mutex_shm = shm_pool->Load( + ipc_message_shm.data_->response_mutex); + response_cond_shm = shm_pool->Load( + ipc_message_shm.data_->response_cond); } - return ipc_message; + return std::unique_ptr( + new IPCMessage(ipc_message_shm, response_mutex_shm, response_cond_shm)); } PYTHONSTUB_CommandType& IPCMessage::Command() { - return ipc_message_shm_->command; + return ipc_message_shm_ptr_->command; } -off_t& +bi::managed_external_buffer::handle_t& IPCMessage::Args() { - return ipc_message_shm_->args; + return ipc_message_shm_ptr_->args; } bool& IPCMessage::InlineResponse() { - return ipc_message_shm_->inline_response; + return ipc_message_shm_ptr_->inline_response; } bi::interprocess_condition* IPCMessage::ResponseCondition() { - return response_cond_; + return response_cond_shm_ptr_; } 
bi::interprocess_mutex* IPCMessage::ResponseMutex() { - return response_mutex_; + return response_mutex_shm_ptr_; +} + +bi::managed_external_buffer::handle_t& +IPCMessage::ResponseHandle() +{ + return ipc_message_shm_ptr_->response_handle; } -off_t& -IPCMessage::RequestOffset() +bi::managed_external_buffer::handle_t +IPCMessage::ShmHandle() { - return this->ipc_message_shm_->request_offset; + return ipc_message_handle_; +} + +IPCMessage::IPCMessage( + AllocatedSharedMemory& ipc_message_shm, + AllocatedSharedMemory& response_mutex_shm, + AllocatedSharedMemory& response_cond_shm) + : ipc_message_shm_(std::move(ipc_message_shm)), + response_mutex_shm_(std::move(response_mutex_shm)), + response_cond_shm_(std::move(response_cond_shm)) +{ + ipc_message_shm_ptr_ = ipc_message_shm_.data_.get(); + response_mutex_shm_ptr_ = response_mutex_shm_.data_.get(); + response_cond_shm_ptr_ = response_cond_shm_.data_.get(); + ipc_message_handle_ = ipc_message_shm_.handle_; +} + +void +IPCMessage::Release() +{ + if (ipc_message_shm_.data_ != nullptr) { + ipc_message_shm_.data_.release(); + } + + if (response_mutex_shm_.data_ != nullptr) { + response_mutex_shm_.data_.release(); + } + + if (response_cond_shm_.data_ != nullptr) { + response_cond_shm_.data_.release(); + } } }}}; // namespace triton::backend::python diff --git a/src/ipc_message.h b/src/ipc_message.h index 531e8f55..c51deca2 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -1,4 +1,4 @@ -// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -51,68 +51,62 @@ typedef enum PYTHONSTUB_commandtype_enum { /// Shared memory representation of IPCMessage /// /// \param command determines the IPC command that is going to be passed. -/// \param args determines the shared memory offset for the input parameters. -/// \param is_response determines whether this is a response of another IPC -/// message. If this parameter is set, it must provide the offset of the -/// corresponding request in \param request_offset. -/// \param request_offset determines the request offset. +/// \param args determines the shared memory handle for the input parameters. +/// \param inline_response determines whether this is a response of another IPC +/// message. If this parameter is set, it must provide the handle of the +/// corresponding request in \param response_handle. +/// \param response_handle determines the request handle. +/// \param response_mutex stores the handle for the mutex for the response +/// object. +/// \param response_cond stores the handle for the condition variable +/// for the response object. 
struct IPCMessageShm { PYTHONSTUB_CommandType command; - off_t args; + bi::managed_external_buffer::handle_t args; bool inline_response = false; - off_t request_offset; - off_t response_mutex; - off_t response_cond; + bi::managed_external_buffer::handle_t response_handle; + bi::managed_external_buffer::handle_t response_mutex; + bi::managed_external_buffer::handle_t response_cond; }; class IPCMessage { - struct IPCMessageShm* ipc_message_shm_; - off_t shm_offset_; - bi::interprocess_mutex* response_mutex_; - bi::interprocess_condition* response_cond_; - public: - IPCMessage() {} - IPCMessage( - const std::unique_ptr& shm_pool, bool inline_response) - { - shm_pool->Map( - (char**)&ipc_message_shm_, sizeof(IPCMessageShm), shm_offset_); - - ipc_message_shm_->inline_response = inline_response; - if (inline_response) { - shm_pool->Map( - (char**)&response_mutex_, sizeof(bi::interprocess_mutex) + 15, - ipc_message_shm_->response_mutex); - shm_pool->Map( - (char**)&response_cond_, sizeof(bi::interprocess_condition) + 15, - ipc_message_shm_->response_cond); - - void* ptr_a = reinterpret_cast( - ((uintptr_t)response_cond_ + 15) & ~(uintptr_t)0x0F); - ipc_message_shm_->response_cond += ((char*)ptr_a - (char*)response_cond_); - void* ptr_b = reinterpret_cast( - ((uintptr_t)response_mutex_ + 15) & ~(uintptr_t)0x0F); - ipc_message_shm_->response_mutex += - ((char*)ptr_b - (char*)response_mutex_); - response_cond_ = reinterpret_cast(ptr_a); - response_mutex_ = reinterpret_cast(ptr_b); - - new (response_cond_) bi::interprocess_condition; - new (response_mutex_) bi::interprocess_mutex; - } - } - - off_t SharedMemoryOffset() { return shm_offset_; } - + static std::unique_ptr Create( + const std::unique_ptr& shm_pool, + bool inline_response); static std::unique_ptr LoadFromSharedMemory( - std::unique_ptr& shm_pool, off_t message_offset); + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t message_handle); + PYTHONSTUB_CommandType& Command(); bool& InlineResponse(); - off_t& RequestOffset(); + bi::managed_external_buffer::handle_t& ResponseHandle(); bi::interprocess_condition* ResponseCondition(); bi::interprocess_mutex* ResponseMutex(); - off_t& Args(); + bi::managed_external_buffer::handle_t& Args(); + bi::managed_external_buffer::handle_t ShmHandle(); + void Release(); + + private: + AllocatedSharedMemory ipc_message_shm_; + IPCMessageShm* ipc_message_shm_ptr_; + + AllocatedSharedMemory response_mutex_shm_; + bi::interprocess_mutex* response_mutex_shm_ptr_; + + AllocatedSharedMemory response_cond_shm_; + bi::interprocess_condition* response_cond_shm_ptr_; + + bi::managed_external_buffer::handle_t ipc_message_handle_; + + /// Create/load a IPCMessage shm object. + /// \param ipc_message_shm IPCMessage representation in shared memory. + /// \param response_mutex_shm response mutex. + /// \param response_condition_shm response condition. + IPCMessage( + AllocatedSharedMemory& ipc_message_shm, + AllocatedSharedMemory& response_mutex_shm, + AllocatedSharedMemory& response_cond_shm); }; }}}; // namespace triton::backend::python diff --git a/src/message_queue.cc b/src/message_queue.cc index fc69cbcc..da4a5f8b 100644 --- a/src/message_queue.cc +++ b/src/message_queue.cc @@ -1,4 +1,4 @@ -// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -29,52 +29,53 @@ #include #include #include -#include "ipc_message.h" -#include "pb_utils.h" namespace triton { namespace backend { namespace python { + +std::unique_ptr +MessageQueue::Create( + std::unique_ptr& shm_pool, uint32_t message_queue_size) +{ + AllocatedSharedMemory mq_shm = + shm_pool->Construct(); + mq_shm.data_->size = message_queue_size; + + AllocatedSharedMemory mq_buffer_shm = + shm_pool->Construct( + message_queue_size); + mq_shm.data_->buffer = mq_buffer_shm.handle_; + mq_shm.data_->head = 0; + mq_shm.data_->tail = 0; + + new (&(mq_shm.data_->mutex)) bi::interprocess_mutex{}; + new (&(mq_shm.data_->sem_empty)) + bi::interprocess_semaphore{message_queue_size}; + new (&(mq_shm.data_->sem_full)) bi::interprocess_semaphore{0}; + + return std::unique_ptr(new MessageQueue(mq_shm, mq_buffer_shm)); +} + MessageQueue::MessageQueue( - std::unique_ptr& shm_pool, std::size_t number_of_messages) + AllocatedSharedMemory& mq_shm, + AllocatedSharedMemory& mq_buffer_shm) + : mq_shm_(std::move(mq_shm)), mq_buffer_shm_(std::move(mq_buffer_shm)) { - MessageQueueShm* message_queue_shm; - shm_pool->Map( - (char**)&message_queue_shm, sizeof(MessageQueueShm), shm_struct_); - - message_queue_shm->size = number_of_messages; - size_ = &(message_queue_shm->size); - - message_queue_shm->index = 0; - index_ = &(message_queue_shm->index); - - shm_pool->Map( - (char**)&sem_full_, sizeof(bi::interprocess_semaphore), - message_queue_shm->sem_full); - shm_pool->Map( - (char**)&sem_empty_, sizeof(bi::interprocess_semaphore), - message_queue_shm->sem_empty); - - new (sem_full_) bi::interprocess_semaphore(0); - new (sem_empty_) bi::interprocess_semaphore(number_of_messages); - - shm_pool->Map( - (char**)&mutex_, sizeof(bi::interprocess_mutex), - message_queue_shm->mutex); - new (mutex_) bi::interprocess_mutex; - - shm_pool->Map( - (char**)&buffer_, sizeof(off_t) * number_of_messages, - message_queue_shm->buffer); + mq_buffer_shm_ptr_ = mq_buffer_shm_.data_.get(); + mq_shm_ptr_ = mq_shm_.data_.get(); + mq_handle_ = mq_shm_.handle_; } void -MessageQueue::Push(off_t message, int const& duration, bool& success) +MessageQueue::Push( + bi::managed_external_buffer::handle_t message, int const& duration, + bool& success) { boost::system_time timeout = boost::get_system_time() + boost::posix_time::milliseconds(duration); while (true) { try { - if (!sem_empty_->timed_wait(timeout)) { + if (!SemEmptyMutable()->timed_wait(timeout)) { success = false; return; } else { @@ -88,26 +89,26 @@ MessageQueue::Push(off_t message, int const& duration, bool& success) { timeout = boost::get_system_time() + boost::posix_time::milliseconds(duration); - bi::scoped_lock lock{*mutex_, timeout}; + bi::scoped_lock lock{*MutexMutable(), timeout}; if (!lock) { - sem_empty_->post(); + SemEmptyMutable()->post(); success = false; return; } success = true; - buffer_[*index_] = message; - (*index_)++; + Buffer()[Head()] = message; + HeadIncrement(); } - sem_full_->post(); + SemFullMutable()->post(); } void -MessageQueue::Push(off_t message) +MessageQueue::Push(bi::managed_external_buffer::handle_t message) { while (true) { try { - sem_empty_->wait(); + SemEmptyMutable()->wait(); break; } catch (bi::interprocess_exception& ex) { @@ -115,27 +116,21 @@ MessageQueue::Push(off_t message) } { - bi::scoped_lock lock{*mutex_}; - buffer_[*index_] = message; - (*index_)++; + bi::scoped_lock lock{*MutexMutable()}; 
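+    // Store the message handle at the current head slot, then advance the head.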
+ Buffer()[Head()] = message; + HeadIncrement(); } - sem_full_->post(); -} - -off_t -MessageQueue::ShmOffset() -{ - return shm_struct_; + SemFullMutable()->post(); } -off_t +bi::managed_external_buffer::handle_t MessageQueue::Pop() { - off_t message; + bi::managed_external_buffer::handle_t message; while (true) { try { - sem_full_->wait(); + SemFullMutable()->wait(); break; } catch (bi::interprocess_exception& ex) { @@ -143,25 +138,38 @@ MessageQueue::Pop() } { - bi::scoped_lock lock{*mutex_}; - message = buffer_[*index_ - 1]; - (*index_)--; + bi::scoped_lock lock{*MutexMutable()}; + + message = Buffer()[Tail()]; + TailIncrement(); } - sem_empty_->post(); + SemEmptyMutable()->post(); return message; } -off_t +void +MessageQueue::HeadIncrement() +{ + mq_shm_ptr_->head = (mq_shm_ptr_->head + 1) % Size(); +} + +void +MessageQueue::TailIncrement() +{ + mq_shm_ptr_->tail = (mq_shm_ptr_->tail + 1) % Size(); +} + +bi::managed_external_buffer::handle_t MessageQueue::Pop(int const& duration, bool& success) { - off_t message = 0; + bi::managed_external_buffer::handle_t message = 0; boost::system_time timeout = boost::get_system_time() + boost::posix_time::milliseconds(duration); while (true) { try { - if (!sem_full_->timed_wait(timeout)) { + if (!SemFullMutable()->timed_wait(timeout)) { success = false; return message; } else { @@ -175,18 +183,18 @@ MessageQueue::Pop(int const& duration, bool& success) { timeout = boost::get_system_time() + boost::posix_time::milliseconds(duration); - bi::scoped_lock lock{*mutex_, timeout}; + bi::scoped_lock lock{*MutexMutable(), timeout}; if (!lock) { - sem_full_->post(); + SemFullMutable()->post(); success = false; return message; } success = true; - message = buffer_[*index_ - 1]; - (*index_)--; + message = Buffer()[Tail()]; + TailIncrement(); } - sem_empty_->post(); + SemEmptyMutable()->post(); return message; } @@ -194,31 +202,42 @@ MessageQueue::Pop(int const& duration, bool& success) void MessageQueue::ResetSemaphores() { - new (sem_full_) bi::interprocess_semaphore(0); - new (sem_empty_) bi::interprocess_semaphore(*size_); - new (mutex_) bi::interprocess_mutex; + new (SemFullMutable()) bi::interprocess_semaphore(0); + new (SemEmptyMutable()) bi::interprocess_semaphore(Size()); + new (MutexMutable()) bi::interprocess_mutex; + mq_shm_ptr_->tail = 0; + mq_shm_ptr_->head = 0; } std::unique_ptr MessageQueue::LoadFromSharedMemory( - std::unique_ptr& shm_pool, off_t message_queue_offset) + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t message_queue_handle) +{ + AllocatedSharedMemory mq_shm = + shm_pool->Load(message_queue_handle); + AllocatedSharedMemory mq_shm_buffer = + shm_pool->Load( + mq_shm.data_->buffer); + + return std::unique_ptr(new MessageQueue(mq_shm, mq_shm_buffer)); +} + +bi::managed_external_buffer::handle_t +MessageQueue::ShmHandle() { - std::unique_ptr message_queue = - std::make_unique(); - MessageQueueShm* message_queue_shm; - shm_pool->MapOffset((char**)&message_queue_shm, message_queue_offset); - message_queue->size_ = &(message_queue_shm->size); - message_queue->index_ = &(message_queue_shm->index); - - shm_pool->MapOffset((char**)&message_queue->mutex_, message_queue_shm->mutex); - shm_pool->MapOffset( - (char**)&message_queue->sem_full_, message_queue_shm->sem_full); - shm_pool->MapOffset( - (char**)&message_queue->sem_empty_, message_queue_shm->sem_empty); - shm_pool->MapOffset( - (char**)&message_queue->buffer_, message_queue_shm->buffer); - message_queue->shm_struct_ = message_queue_offset; - return message_queue; + 
return mq_handle_; } +void +MessageQueue::Release() +{ + if (mq_shm_.data_ != nullptr) { + mq_shm_.data_.release(); + } + + if (mq_buffer_shm_.data_ != nullptr) { + mq_buffer_shm_.data_.release(); + } +} }}} // namespace triton::backend::python diff --git a/src/message_queue.h b/src/message_queue.h index 46e9ef12..19811a55 100644 --- a/src/message_queue.h +++ b/src/message_queue.h @@ -1,4 +1,4 @@ -// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -29,7 +29,6 @@ #include #include #include -#include "ipc_message.h" #include "shm_manager.h" namespace triton { namespace backend { namespace python { @@ -38,55 +37,92 @@ namespace bi = boost::interprocess; /// Struct holding the represenation of a message queue inside the shared /// memory. /// \param size Total size of the message queue. -/// \param mutex Offset of the mutex variable protecting index. +/// \param mutex Handle of the mutex variable protecting index. /// \param index Used element index. /// \param sem_empty Semaphore object counting the number of empty buffer slots. /// \param sem_full Semaphore object counting the number of used buffer slots. struct MessageQueueShm { std::size_t size; - off_t buffer; - off_t mutex; - int index; - off_t sem_empty; - off_t sem_full; + bi::managed_external_buffer::handle_t buffer; + bi::interprocess_mutex mutex; + int head; + int tail; + bi::interprocess_semaphore sem_empty{0}; + bi::interprocess_semaphore sem_full{0}; }; class MessageQueue { - std::size_t* size_; - off_t* buffer_; - bi::interprocess_mutex* mutex_; - int* index_; - bi::interprocess_semaphore* sem_empty_; - bi::interprocess_semaphore* sem_full_; - off_t shm_struct_; - public: - /// Create a Message queue. - /// \param shm_pool Shared memory pool - /// \param number_of_messages Maximum number of messages that the - /// message queue can hold. - MessageQueue( - std::unique_ptr& shm_pool, std::size_t number_of_messages); - MessageQueue() {} + /// Create a new MessageQueue in the shared memory. + static std::unique_ptr Create( + std::unique_ptr& shm_pool, + uint32_t message_queue_size); + + /// Load an already existing message queue from the shared memory. + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t message_queue_handle); /// Push a message inside the message queue. - /// \param message The shared memory offset of the message. - void Push(off_t message); - void Push(off_t message, int const& duration, bool& success); + /// \param message The shared memory handle of the message. + void Push(bi::managed_external_buffer::handle_t message); + void Push( + bi::managed_external_buffer::handle_t message, int const& duration, + bool& success); /// Pop a message from the message queue. This call will block until there - /// is a message inside the message queue to return. \return the offset of - /// the new message. - off_t Pop(); - off_t Pop(int const& duration, bool& success); - - off_t ShmOffset(); - static std::unique_ptr LoadFromSharedMemory( - std::unique_ptr& shm_pool, off_t message_queue_offset); + /// is a message inside the message queue to return. + /// \return the handle of the new message. 
+ bi::managed_external_buffer::handle_t Pop(); + bi::managed_external_buffer::handle_t Pop(int const& duration, bool& success); /// Resets the semaphores for the message queue. This function is useful for /// when the stub process may have exited unexpectedly and the semaphores need /// to be restarted so that the message queue is in a proper state. void ResetSemaphores(); + + /// Get the shared memory handle of MessageQueue + bi::managed_external_buffer::handle_t ShmHandle(); + + /// Release the ownership of this object in shared memory. + void Release(); + + private: + std::size_t& Size() { return mq_shm_ptr_->size; } + const bi::interprocess_mutex& Mutex() { return mq_shm_ptr_->mutex; } + bi::interprocess_mutex* MutexMutable() { return &(mq_shm_ptr_->mutex); } + int& Head() { return mq_shm_ptr_->head; } + int& Tail() { return mq_shm_ptr_->tail; } + bi::managed_external_buffer::handle_t* Buffer() { return mq_buffer_shm_ptr_; } + const bi::interprocess_semaphore& SemEmpty() + { + return mq_shm_ptr_->sem_empty; + } + bi::interprocess_semaphore* SemEmptyMutable() + { + return &(mq_shm_ptr_->sem_empty); + } + const bi::interprocess_semaphore& SemFull() { return mq_shm_ptr_->sem_full; } + bi::interprocess_semaphore* SemFullMutable() + { + return &(mq_shm_ptr_->sem_full); + } + + void HeadIncrement(); + void TailIncrement(); + + AllocatedSharedMemory mq_shm_; + AllocatedSharedMemory mq_buffer_shm_; + + MessageQueueShm* mq_shm_ptr_; + bi::managed_external_buffer::handle_t* mq_buffer_shm_ptr_; + bi::managed_external_buffer::handle_t mq_handle_; + + /// Create/load a Message queue. + /// \param mq_shm Message queue representation in shared memory. + MessageQueue( + AllocatedSharedMemory& mq_shm, + AllocatedSharedMemory& + mq_buffer_shm); }; }}} // namespace triton::backend::python diff --git a/src/pb_env.cc b/src/pb_env.cc index 4d09a0ce..b0bc5578 100644 --- a/src/pb_env.cc +++ b/src/pb_env.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -37,6 +37,127 @@ namespace triton { namespace backend { namespace python { +void +CopySingleArchiveEntry(archive* input_archive, archive* output_archive) +{ + const void* buff; + size_t size; +#if ARCHIVE_VERSION_NUMBER >= 3000000 + int64_t offset; +#else + off_t offset; +#endif + + for (;;) { + int return_status; + return_status = + archive_read_data_block(input_archive, &buff, &size, &offset); + if (return_status == ARCHIVE_EOF) + break; + if (return_status != ARCHIVE_OK) + throw PythonBackendException( + "archive_read_data_block() failed with error code = " + + std::to_string(return_status)); + + return_status = + archive_write_data_block(output_archive, buff, size, offset); + if (return_status != ARCHIVE_OK) { + throw PythonBackendException( + "archive_write_data_block() failed with error code = " + + std::to_string(return_status) + ", error message is " + + archive_error_string(output_archive)); + } + } +} + + +void +ExtractTarFile(std::string& archive_path, std::string& dst_path) +{ + char current_directory[PATH_MAX]; + if (getcwd(current_directory, PATH_MAX) == nullptr) { + throw PythonBackendException( + (std::string("Failed to get the current working directory. 
Error: ") + + std::strerror(errno))); + } + if (chdir(dst_path.c_str()) == -1) { + throw PythonBackendException( + (std::string("Failed to change the directory to ") + dst_path + + " Error: " + std::strerror(errno)) + .c_str()); + } + + struct archive_entry* entry; + int flags = ARCHIVE_EXTRACT_TIME; + + struct archive* input_archive = archive_read_new(); + struct archive* output_archive = archive_write_disk_new(); + archive_write_disk_set_options(output_archive, flags); + + archive_read_support_filter_gzip(input_archive); + archive_read_support_format_tar(input_archive); + + if (archive_path.size() == 0) { + throw PythonBackendException("The archive path is empty."); + } + + THROW_IF_ERROR( + "archive_read_open_filename() failed.", + archive_read_open_filename( + input_archive, archive_path.c_str(), 10240 /* block_size */)); + + while (true) { + int read_status = archive_read_next_header(input_archive, &entry); + if (read_status == ARCHIVE_EOF) + break; + if (read_status != ARCHIVE_OK) { + throw PythonBackendException( + std::string("archive_read_next_header() failed with error code = ") + + std::to_string(read_status) + std::string(" error message is ") + + archive_error_string(input_archive)); + } + + read_status = archive_write_header(output_archive, entry); + if (read_status != ARCHIVE_OK) { + throw PythonBackendException(std::string( + "archive_write_header() failed with error code = " + + std::to_string(read_status) + std::string(" error message is ") + + archive_error_string(output_archive))); + } + + CopySingleArchiveEntry(input_archive, output_archive); + + read_status = archive_write_finish_entry(output_archive); + if (read_status != ARCHIVE_OK) { + throw PythonBackendException(std::string( + "archive_write_finish_entry() failed with error code = " + + std::to_string(read_status) + std::string(" error message is ") + + archive_error_string(output_archive))); + } + } + + archive_read_close(input_archive); + archive_read_free(input_archive); + + archive_write_close(output_archive); + archive_write_free(output_archive); + + // Revert the directory change. + if (chdir(current_directory) == -1) { + throw PythonBackendException( + (std::string("Failed to change the directory to ") + current_directory) + .c_str()); + } +} + +bool +FileExists(std::string& path) +{ + struct stat buffer; + return stat(path.c_str(), &buffer) == 0; +} + + void RecursiveDirectoryDelete(const char* dir) { diff --git a/src/pb_env.h b/src/pb_env.h index 2b49e27c..9e2e5750 100644 --- a/src/pb_env.h +++ b/src/pb_env.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -32,6 +32,10 @@ namespace triton { namespace backend { namespace python { +void ExtractTarFile(std::string& archive_path, std::string& dst_path); + +bool FileExists(std::string& path); + // // A class that manages Python environments // diff --git a/src/pb_error.cc b/src/pb_error.cc index cabf02e4..e190af42 100644 --- a/src/pb_error.cc +++ b/src/pb_error.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -32,4 +32,33 @@ PbError::Message() { return message_; } + +bi::managed_external_buffer::handle_t +PbError::ShmHandle() +{ + return shm_handle_; +} + +void +PbError::SaveToSharedMemory(std::unique_ptr& shm_pool) +{ + message_shm_ = PbString::Create(shm_pool, message_); + shm_handle_ = message_shm_->ShmHandle(); +} + +std::shared_ptr +PbError::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t shm_handle) +{ + std::unique_ptr message_shm = + PbString::LoadFromSharedMemory(shm_pool, shm_handle); + return std::shared_ptr(new PbError(message_shm)); +} + +PbError::PbError(std::unique_ptr& message_shm) +{ + message_shm_ = std::move(message_shm); + message_ = message_shm_->String(); +} }}} // namespace triton::backend::python diff --git a/src/pb_error.h b/src/pb_error.h index 4123f22e..d4461082 100644 --- a/src/pb_error.h +++ b/src/pb_error.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -27,13 +27,25 @@ #pragma once #include +#include "pb_string.h" +#include "pb_utils.h" namespace triton { namespace backend { namespace python { class PbError { - std::string message_; - public: PbError(const std::string& message) : message_(message) {} const std::string& Message(); + void SaveToSharedMemory(std::unique_ptr& shm_pool); + bi::managed_external_buffer::handle_t ShmHandle(); + static std::shared_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); + DISALLOW_COPY_AND_ASSIGN(PbError); + + private: + PbError(std::unique_ptr& pb_error); + std::string message_; + std::shared_ptr message_shm_; + bi::managed_external_buffer::handle_t shm_handle_; }; }}}; // namespace triton::backend::python diff --git a/src/pb_exception.h b/src/pb_exception.h new file mode 100644 index 00000000..6f96d02a --- /dev/null +++ b/src/pb_exception.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include + +namespace triton { namespace backend { namespace python { + +// +// PythonBackendException +// +// Exception thrown if error occurs in PythonBackend. +// +struct PythonBackendException : std::exception { + PythonBackendException(const std::string& message) : message_(message) {} + + const char* what() const throw() { return message_.c_str(); } + + std::string message_; +}; + +}}} // namespace triton::backend::python diff --git a/src/pb_map.cc b/src/pb_map.cc new file mode 100644 index 00000000..a122db56 --- /dev/null +++ b/src/pb_map.cc @@ -0,0 +1,110 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include "pb_map.h" + +namespace triton { namespace backend { namespace python { + +std::unique_ptr +PbMap::Create( + std::unique_ptr& shm_pool, + std::unordered_map& map) +{ + std::vector> strings; + AllocatedSharedMemory dict_shm = shm_pool->Construct(); + dict_shm.data_->length = map.size(); + + AllocatedSharedMemory pair_shms = + shm_pool->Construct(map.size()); + dict_shm.data_->values = pair_shms.handle_; + + size_t i = 0; + for (auto& pair : map) { + auto key = PbString::Create(shm_pool, pair.first); + auto value = PbString::Create(shm_pool, pair.second); + + (pair_shms.data_.get())[i].key = key->ShmHandle(); + (pair_shms.data_.get())[i].value = value->ShmHandle(); + + strings.emplace_back(std::move(key)); + strings.emplace_back(std::move(value)); + i++; + } + + return std::unique_ptr(new PbMap(strings, dict_shm, pair_shms, map)); +} + +const std::unordered_map& +PbMap::UnorderedMap() +{ + return map_; +} + +bi::managed_external_buffer::handle_t +PbMap::ShmHandle() +{ + return dict_handle_; +} + +std::unique_ptr +PbMap::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory dict_shm = shm_pool->Load(handle); + AllocatedSharedMemory pair_shms = + shm_pool->Load(dict_shm.data_->values); + + std::vector> pb_strings; + std::unordered_map map; + for (size_t i = 0; i < dict_shm.data_->length; i++) { + std::unique_ptr key = PbString::LoadFromSharedMemory( + shm_pool, (pair_shms.data_.get())[i].key); + + std::unique_ptr value = PbString::LoadFromSharedMemory( + shm_pool, (pair_shms.data_.get())[i].value); + + map.insert({key->String(), value->String()}); + pb_strings.emplace_back(std::move(key)); + pb_strings.emplace_back(std::move(value)); + } + + return std::unique_ptr( + new PbMap(pb_strings, dict_shm, pair_shms, map)); +} + +PbMap::PbMap( + std::vector>& strings, + AllocatedSharedMemory& dict_shm, + AllocatedSharedMemory& pair_shms, + std::unordered_map& map) + : strings_(std::move(strings)), dict_shm_(std::move(dict_shm)), + pair_shms_(std::move(pair_shms)), map_(std::move(map)) +{ + dict_handle_ = dict_shm.handle_; +} + +}}} // namespace triton::backend::python diff --git a/src/pb_map.h b/src/pb_map.h new file mode 100644 index 00000000..0172e5f4 --- /dev/null +++ b/src/pb_map.h @@ -0,0 +1,69 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include +#include "pb_string.h" +#include "shm_manager.h" + +namespace triton { namespace backend { namespace python { + +struct PairShm { + bi::managed_external_buffer::handle_t key; + bi::managed_external_buffer::handle_t value; +}; + +struct DictShm { + uint32_t length; + // `values` point to the location where there are `length` of Pair objects. + bi::managed_external_buffer::handle_t values; +}; + + +class PbMap { + public: + static std::unique_ptr Create( + std::unique_ptr& shm_pool, + std::unordered_map& map); + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); + const std::unordered_map& UnorderedMap(); + bi::managed_external_buffer::handle_t ShmHandle(); + + private: + PbMap( + std::vector>& strings, + AllocatedSharedMemory& dict_shm, + AllocatedSharedMemory& pair_shms, + std::unordered_map& map); + + std::vector> strings_; + AllocatedSharedMemory dict_shm_; + AllocatedSharedMemory pair_shms_; + bi::managed_external_buffer::handle_t dict_handle_; + std::unordered_map map_; +}; +}}} // namespace triton::backend::python diff --git a/src/pb_memory.cc b/src/pb_memory.cc new file mode 100644 index 00000000..e9801320 --- /dev/null +++ b/src/pb_memory.cc @@ -0,0 +1,388 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include "pb_memory.h" + +namespace triton { namespace backend { namespace python { + +std::unique_ptr +PbMemory::Create( + std::unique_ptr& shm_pool, + TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, + uint64_t byte_size, char* data, bool copy_gpu) +{ + size_t requested_byte_size = sizeof(MemoryShm); + + if (memory_type == TRITONSERVER_MEMORY_GPU) { +#ifdef TRITON_ENABLE_GPU + requested_byte_size += sizeof(cudaIpcMemHandle_t); +#endif + } else { + requested_byte_size += byte_size; + } + + AllocatedSharedMemory memory_shm = + shm_pool->Construct(requested_byte_size); + PbMemory::FillShmData( + memory_type, memory_type_id, byte_size, data, memory_shm.data_.get(), + memory_shm.handle_, copy_gpu); + + if (memory_type == TRITONSERVER_MEMORY_CPU) { + data = memory_shm.data_.get() + sizeof(MemoryShm); + } + + std::unique_ptr pb_memory( + new PbMemory(memory_shm, data, false /* opened_cuda_ipc_handle */)); + +#ifdef TRITON_ENABLE_GPU + if (memory_type == TRITONSERVER_MEMORY_GPU) { + pb_memory->memory_shm_ptr_->gpu_pointer_offset = + pb_memory->GetGPUPointerOffset(); + } +#endif + return pb_memory; +} + +#ifndef TRITON_PB_STUB +std::unique_ptr +PbMemory::Create( + std::unique_ptr& shm_pool, + std::unique_ptr&& backend_memory, bool copy_gpu) +{ + std::unique_ptr pb_memory = PbMemory::Create( + shm_pool, backend_memory->MemoryType(), backend_memory->MemoryTypeId(), + backend_memory->ByteSize(), backend_memory->MemoryPtr(), copy_gpu); + pb_memory->backend_memory_ = std::move(backend_memory); + + return pb_memory; +} +#endif + +std::unique_ptr +PbMemory::Create( + TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, + uint64_t byte_size, char* data, char* data_shm, + bi::managed_external_buffer::handle_t handle, bool copy_gpu) +{ + PbMemory::FillShmData( + memory_type, memory_type_id, byte_size, data, data_shm, handle, copy_gpu); + + if (memory_type == TRITONSERVER_MEMORY_CPU) { + data = data_shm + sizeof(MemoryShm); + } + + std::unique_ptr pb_memory( + new PbMemory(data_shm, data, handle, false /* opened_cuda_ipc_handle */)); + +#ifdef TRITON_ENABLE_GPU + if (memory_type == TRITONSERVER_MEMORY_GPU) { + pb_memory->memory_shm_ptr_->gpu_pointer_offset = + pb_memory->GetGPUPointerOffset(); + } +#endif + + return pb_memory; +} + +void +PbMemory::CopyBuffer( + std::unique_ptr& dst, std::unique_ptr& src) +{ + if (src->ByteSize() != dst->ByteSize()) { + throw PythonBackendException( + "Failed to copy memory buffers. 
Source and destination byte size do " + "not match: " + + std::to_string(dst->ByteSize()) + + " != " + std::to_string(src->ByteSize())); + } + + if (src->MemoryType() == TRITONSERVER_MEMORY_CPU && + dst->MemoryType() == TRITONSERVER_MEMORY_CPU) { + std::memcpy(dst->DataPtr(), src->DataPtr(), dst->ByteSize()); + return; + } + +#ifdef TRITON_ENABLE_GPU + cudaMemcpyKind kind = cudaMemcpyHostToDevice; + + if (src->MemoryType() == TRITONSERVER_MEMORY_CPU && + dst->MemoryType() == TRITONSERVER_MEMORY_GPU) { + kind = cudaMemcpyHostToDevice; + } else if ( + src->MemoryType() == TRITONSERVER_MEMORY_GPU && + dst->MemoryType() == TRITONSERVER_MEMORY_CPU) { + kind = cudaMemcpyDeviceToHost; + } else if ( + src->MemoryType() == TRITONSERVER_MEMORY_GPU && + dst->MemoryType() == TRITONSERVER_MEMORY_GPU) { + kind = cudaMemcpyDeviceToDevice; + } + + cudaError_t err = + cudaMemcpy(dst->DataPtr(), src->DataPtr(), src->ByteSize(), kind); + + if (err != cudaSuccess) { + throw PythonBackendException( + std::string( + "failed to copy data: " + std::string(cudaGetErrorString(err))) + .c_str()); + } +#endif +} + +void +PbMemory::FillShmData( + TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, + uint64_t byte_size, char* data, char* data_shm, + bi::managed_external_buffer::handle_t handle, bool copy_gpu) +{ + char* memory_data_shm = data_shm + sizeof(MemoryShm); + MemoryShm* memory_shm_ptr = reinterpret_cast(data_shm); + memory_shm_ptr->is_cuda_handle_set = copy_gpu; + + if (memory_type == TRITONSERVER_MEMORY_GPU) { +#ifdef TRITON_ENABLE_GPU + if (data != nullptr) { + if (copy_gpu) { + // [FIXME] Restore the previous device + THROW_IF_CUDA_ERROR(cudaSetDevice(memory_type_id)); + THROW_IF_CUDA_ERROR(cudaIpcGetMemHandle( + reinterpret_cast(memory_data_shm), data)); + } + } +#endif + } else { + if (data != nullptr) { + std::copy(data, data + byte_size, memory_data_shm); + } + } + + memory_shm_ptr->byte_size = byte_size; + memory_shm_ptr->memory_type_id = memory_type_id; + memory_shm_ptr->memory_type = memory_type; +} + +std::unique_ptr +PbMemory::LoadFromSharedMemory( + bi::managed_external_buffer::handle_t handle, char* data_shm, + bool open_cuda_handle) +{ + MemoryShm* memory_shm_ptr = reinterpret_cast(data_shm); + char* memory_data_shm = data_shm + sizeof(MemoryShm); + + char* data_ptr = nullptr; + bool opened_cuda_ipc_handle = false; + if (memory_shm_ptr->memory_type == TRITONSERVER_MEMORY_GPU && + open_cuda_handle) { +#ifdef TRITON_ENABLE_GPU + cudaIpcMemHandle_t* cuda_handle = + reinterpret_cast(memory_data_shm); + + // The pointer opened by the cudaIpcOpenMemHandle will refer to the base + // address. We need to manually correct the offset. 
+ void* data_ptr_base; + CUDAHandler& cuda_handler = CUDAHandler::getInstance(); + cuda_handler.OpenCudaHandle( + memory_shm_ptr->memory_type_id, cuda_handle, &data_ptr_base); + + data_ptr = + (reinterpret_cast(data_ptr_base) + + memory_shm_ptr->gpu_pointer_offset); + opened_cuda_ipc_handle = true; +#endif + } else { + data_ptr = memory_data_shm; + } + return std::unique_ptr(new PbMemory( + data_shm, data_ptr, handle, + opened_cuda_ipc_handle /* opened_cuda_ipc_handle */)); +} + + +std::unique_ptr +PbMemory::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle, bool open_cuda_handle) +{ + AllocatedSharedMemory memory_shm = shm_pool->Load(handle); + MemoryShm* memory_shm_ptr = + reinterpret_cast(memory_shm.data_.get()); + char* memory_data_shm = memory_shm.data_.get() + sizeof(MemoryShm); + + char* data_ptr = nullptr; + bool opened_cuda_ipc_handle = false; + if (memory_shm_ptr->memory_type == TRITONSERVER_MEMORY_GPU) { + if (memory_shm_ptr->byte_size > 0 && open_cuda_handle) { +#ifdef TRITON_ENABLE_GPU + cudaIpcMemHandle_t* cuda_handle = + reinterpret_cast(memory_data_shm); + + // The pointer opened by the cudaIpcOpenMemHandle will refer to the base + // address. We need to manually correct the offset. + + void* data_ptr_base; + CUDAHandler& cuda_handler = CUDAHandler::getInstance(); + cuda_handler.OpenCudaHandle( + memory_shm_ptr->memory_type_id, cuda_handle, &data_ptr_base); + + data_ptr = + (reinterpret_cast(data_ptr_base) + + memory_shm_ptr->gpu_pointer_offset); + opened_cuda_ipc_handle = true; +#endif + } + } else { + data_ptr = memory_data_shm; + } + return std::unique_ptr(new PbMemory( + memory_shm, data_ptr, + opened_cuda_ipc_handle /* opened_cuda_ipc_handle */)); +} + +PbMemory::PbMemory( + AllocatedSharedMemory& memory_shm, char* data, + bool opened_cuda_ipc_handle) + : memory_shm_(std::move(memory_shm)), data_ptr_(data), + opened_cuda_ipc_handle_(opened_cuda_ipc_handle) +{ + memory_shm_ptr_ = reinterpret_cast(memory_shm_.data_.get()); + memory_shm_handle_ = memory_shm_.handle_; +} + +PbMemory::PbMemory( + char* memory_shm, char* data, bi::managed_external_buffer::handle_t handle, + bool opened_cuda_ipc_handle) +{ + memory_shm_ptr_ = reinterpret_cast(memory_shm); + data_ptr_ = data; + opened_cuda_ipc_handle_ = opened_cuda_ipc_handle; + memory_shm_handle_ = handle; +} + +bi::managed_external_buffer::handle_t +PbMemory::ShmHandle() +{ + return memory_shm_handle_; +} + +#ifdef TRITON_ENABLE_GPU +void* +PbMemory::GetGPUStartAddress() +{ + if (memory_shm_ptr_->memory_type == TRITONSERVER_MEMORY_GPU) { + CUDAHandler& cuda_api = CUDAHandler::getInstance(); + CUdeviceptr start_address; + + cuda_api.PointerGetAttribute( + &start_address, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, + reinterpret_cast(data_ptr_)); + + return reinterpret_cast(start_address); + } + + throw PythonBackendException( + "Calling GetGPUStartAddress function on CPU memory."); +} + +uint64_t +PbMemory::GetGPUPointerOffset() +{ + uint64_t offset; + if (memory_shm_ptr_->memory_type == TRITONSERVER_MEMORY_GPU) { + offset = data_ptr_ - reinterpret_cast(GetGPUStartAddress()); + } else { + throw PythonBackendException( + "Calling GetGPUPointerOffset function on CPU tensor."); + } + return offset; +} +#endif + +TRITONSERVER_MemoryType +PbMemory::MemoryType() const +{ + return memory_shm_ptr_->memory_type; +} + +int64_t +PbMemory::MemoryTypeId() const +{ + return memory_shm_ptr_->memory_type_id; +} + +uint64_t +PbMemory::ByteSize() const +{ + return memory_shm_ptr_->byte_size; +} + 
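+// Returns the start of the raw data region that immediately follows the
+// MemoryShm header in shared memory.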
+char* +PbMemory::ShmData() const +{ + return reinterpret_cast(memory_shm_ptr_) + sizeof(MemoryShm); +} + +char* +PbMemory::DataPtr() const +{ + return data_ptr_; +} + +uint64_t +PbMemory::ShmStructSize(TRITONSERVER_MemoryType memory_type, uint64_t byte_size) +{ + uint64_t total_memory_size = sizeof(MemoryShm); + if (memory_type == TRITONSERVER_MEMORY_GPU) { +#ifdef TRITON_ENABLE_GPU + total_memory_size += sizeof(cudaIpcMemHandle_t); +#endif + } else { + total_memory_size += byte_size; + } + + return total_memory_size; +} + +#ifdef TRITON_ENABLE_GPU +void +PbMemory::SetCudaIpcHandle(cudaIpcMemHandle_t* cuda_ipc_handle) +{ + *(reinterpret_cast(ShmData())) = *(cuda_ipc_handle); +} +#endif + +PbMemory::~PbMemory() +{ + if (opened_cuda_ipc_handle_) { +#ifdef TRITON_ENABLE_GPU + CUDAHandler& cuda_handler = CUDAHandler::getInstance(); + cuda_handler.CloseCudaHandle( + memory_shm_ptr_->memory_type_id, GetGPUStartAddress()); +#endif + } +} + +}}} // namespace triton::backend::python diff --git a/src/pb_memory.h b/src/pb_memory.h new file mode 100644 index 00000000..6b1bdf87 --- /dev/null +++ b/src/pb_memory.h @@ -0,0 +1,155 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include "pb_utils.h" +#include "shm_manager.h" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_memory.h" + +#ifdef TRITON_ENABLE_GPU +#include +#endif // TRITON_ENABLE_GPU + +namespace triton { namespace backend { namespace python { + +// +// Represents a memory object in shared memory. +// +struct MemoryShm { + // If the memory type is a GPU pointer, the offset of the GPU pointer from the + // base address. For CPU memory type this field contains garbage data. 
+ uint64_t gpu_pointer_offset; + + TRITONSERVER_MemoryType memory_type; + int64_t memory_type_id; + uint64_t byte_size; + bool is_cuda_handle_set; +}; + +class PbMemory { + public: + static std::unique_ptr Create( + std::unique_ptr& shm_pool, + TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, + uint64_t byte_size, char* data, bool copy_gpu = true); + + static std::unique_ptr Create( + TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, + uint64_t byte_size, char* data, char* data_shm, + bi::managed_external_buffer::handle_t handle, bool copy_gpu = true); + +#ifndef TRITON_PB_STUB + static std::unique_ptr Create( + std::unique_ptr& shm_pool, + std::unique_ptr&& backend_memory, bool copy_gpu = true); +#endif + +#ifdef TRITON_ENABLE_GPU + void SetCudaIpcHandle(cudaIpcMemHandle_t* cuda_ipc_handle); +#endif + + // Copy the destination buffer to the source buffer. + static void CopyBuffer( + std::unique_ptr& dst, std::unique_ptr& src); + + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t memory_handle, + bool open_cuda_handle); + static std::unique_ptr LoadFromSharedMemory( + bi::managed_external_buffer::handle_t handle, char* data_shm, + bool open_cuda_handle); + static uint64_t ShmStructSize( + TRITONSERVER_MemoryType memory_type, uint64_t byte_size); + + bi::managed_external_buffer::handle_t ShmHandle(); + + /// Get the total byte size of the tensor. + uint64_t ByteSize() const; + + /// Get the triton memory type. + /// \return the memory type of the tensor. + TRITONSERVER_MemoryType MemoryType() const; + + /// Get the pointer. + /// \return The location to the memory where the data is stored. + char* DataPtr() const; + + /// Get the memory type id. + /// \return The memory type id of the tensor. + int64_t MemoryTypeId() const; + + /// Get the shm data + /// \return The memory type id of the tensor. + char* ShmData() const; + + ~PbMemory(); + + private: + AllocatedSharedMemory memory_shm_; + MemoryShm* memory_shm_ptr_; + +#ifndef TRITON_PB_STUB + std::unique_ptr backend_memory_; +#endif + + // Refers to the pointer that can hold the data. For CPU pointers this will be + // the same as memory_data_shm_ptr_. + char* data_ptr_; + + bi::managed_external_buffer::handle_t memory_shm_handle_; + bool opened_cuda_ipc_handle_; + +#ifdef TRITON_ENABLE_GPU + /// Calculate the pointer offest from the base address. + /// \return The offset of a device pointer. + /// \throws PythonBackendException if the tensor is stored in CPU. + uint64_t GetGPUPointerOffset(); + + /// Get the GPU start address. + /// \return The start address of a device pointer. + /// \throws PythonBackendException if the tensor is stored in CPU. + void* GetGPUStartAddress(); + +#endif + + static void FillShmData( + TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, + uint64_t byte_size, char* data, char* data_shm, + bi::managed_external_buffer::handle_t handle, bool copy_gpu = true); + + PbMemory( + AllocatedSharedMemory& memory_shm, char* data, + bool opened_cuda_ipc_handle); + + PbMemory( + char* memory_shm, char* data, + bi::managed_external_buffer::handle_t handle, + bool opened_cuda_ipc_handle); +}; +}}} // namespace triton::backend::python diff --git a/src/pb_string.cc b/src/pb_string.cc new file mode 100644 index 00000000..4f8a1227 --- /dev/null +++ b/src/pb_string.cc @@ -0,0 +1,126 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "pb_string.h" + +namespace triton { namespace backend { namespace python { + +std::unique_ptr +PbString::Create( + std::unique_ptr& shm_pool, const std::string& string) +{ + AllocatedSharedMemory string_container_shm = + shm_pool->Construct(); + string_container_shm.data_->length = string.size(); + + AllocatedSharedMemory string_shm = + shm_pool->Construct(string.size()); + std::memcpy(string_shm.data_.get(), string.data(), string.size()); + string_container_shm.data_->data = string_shm.handle_; + + return std::unique_ptr( + new PbString(string_container_shm, string_shm)); +} + +std::unique_ptr +PbString::Create( + const std::string& string, char* data_shm, + bi::managed_external_buffer::handle_t handle) +{ + StringShm* string_container_shm = reinterpret_cast(data_shm); + string_container_shm->length = string.size(); + + char* string_shm = data_shm + sizeof(StringShm); + std::memcpy(string_shm, string.data(), string.size()); + + return std::unique_ptr( + new PbString(string_container_shm, string_shm, handle)); +} + +std::unique_ptr +PbString::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory string_container_shm = + shm_pool->Load(handle); + AllocatedSharedMemory string_shm = + shm_pool->Load(string_container_shm.data_->data); + + return std::unique_ptr( + new PbString(string_container_shm, string_shm)); +} + +std::unique_ptr +PbString::LoadFromSharedMemory( + bi::managed_external_buffer::handle_t handle, char* data_shm) +{ + StringShm* string_container_shm = reinterpret_cast(data_shm); + char* string_shm = data_shm + sizeof(StringShm); + + return std::unique_ptr( + new PbString(string_container_shm, string_shm, handle)); +} + +PbString::PbString( + AllocatedSharedMemory& string_container_shm, + AllocatedSharedMemory& string_shm) + : string_container_shm_(std::move(string_container_shm)), + string_shm_(std::move(string_shm)) +{ + string_shm_ptr_ = string_shm_.data_.get(); + string_container_shm_ptr_ = 
string_container_shm_.data_.get(); + string_handle_ = string_container_shm_.handle_; +} + +PbString::PbString( + StringShm* string_container_shm, char* string_shm, + bi::managed_external_buffer::handle_t handle) +{ + string_shm_ptr_ = string_shm; + string_container_shm_ptr_ = string_container_shm; + string_handle_ = handle; +} + +bi::managed_external_buffer::handle_t +PbString::ShmHandle() +{ + return string_handle_; +} + +std::size_t +PbString::ShmStructSize(const std::string& string) +{ + return string.size() + sizeof(StringShm); +} + +std::size_t +PbString::Size() +{ + return string_container_shm_ptr_->length + sizeof(StringShm); +} + +}}} // namespace triton::backend::python diff --git a/src/pb_string.h b/src/pb_string.h new file mode 100644 index 00000000..5d1ecff0 --- /dev/null +++ b/src/pb_string.h @@ -0,0 +1,80 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+#pragma once
+
+#include "shm_manager.h"
+
+namespace triton { namespace backend { namespace python {
+
+struct StringShm {
+  bi::managed_external_buffer::handle_t data;
+  size_t length;
+};
+
+class PbString {
+ public:
+  static std::unique_ptr<PbString> Create(
+      std::unique_ptr<SharedMemoryManager>& shm_pool,
+      const std::string& string);
+  static std::unique_ptr<PbString> Create(
+      const std::string& string, char* data_shm,
+      bi::managed_external_buffer::handle_t handle);
+  static std::unique_ptr<PbString> LoadFromSharedMemory(
+      std::unique_ptr<SharedMemoryManager>& shm_pool,
+      bi::managed_external_buffer::handle_t handle);
+  static std::unique_ptr<PbString> LoadFromSharedMemory(
+      bi::managed_external_buffer::handle_t handle, char* data_shm);
+  static std::size_t ShmStructSize(const std::string& string);
+
+  char* MutableString() { return string_shm_ptr_; }
+  std::string String()
+  {
+    return std::string(
+        string_shm_ptr_, string_shm_ptr_ + string_container_shm_ptr_->length);
+  }
+  bi::managed_external_buffer::handle_t ShmHandle();
+  std::size_t Size();
+
+ private:
+  AllocatedSharedMemory<StringShm> string_container_shm_;
+  StringShm* string_container_shm_ptr_;
+
+  AllocatedSharedMemory<char> string_shm_;
+  char* string_shm_ptr_;
+
+  bi::managed_external_buffer::handle_t string_handle_;
+
+  PbString(
+      AllocatedSharedMemory<StringShm>& string_container_shm,
+      AllocatedSharedMemory<char>& string_shm);
+
+  PbString(
+      StringShm* string_container_shm, char* string_shm,
+      bi::managed_external_buffer::handle_t handle);
+};
+
+}}} // namespace triton::backend::python
diff --git a/src/pb_stub.cc b/src/pb_stub.cc
index 26c13d31..9d6f3450 100644
--- a/src/pb_stub.cc
+++ b/src/pb_stub.cc
@@ -1,4 +1,4 @@
-// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -40,12 +40,14 @@
 #include 
 #include 
 #include 
-#include "infer_request.h"
 #include "infer_response.h"
-#include "message_queue.h"
-#include "pb_tensor.h"
-#include "pb_utils.h"
+#include "pb_error.h"
+#include "pb_map.h"
+#include "pb_string.h"
+#include "scoped_defer.h"
 #include "shm_manager.h"
+#include "triton/common/nvtx.h"
+
 #ifdef TRITON_ENABLE_GPU
 #include 
 
@@ -54,7 +56,6 @@
 namespace py = pybind11;
 using namespace pybind11::literals;
 namespace bi = boost::interprocess;
-
 namespace triton { namespace backend { namespace python {
 
 #define LOG_IF_EXCEPTION(X) \
@@ -130,7 +131,8 @@ Stub::Instantiate(
     int64_t shm_growth_size, int64_t shm_default_size,
     const std::string& shm_region_name, const std::string& model_path,
     const std::string& model_version, const std::string& triton_install_path,
-    off_t ipc_control_offset, const std::string& model_instance_name)
+    bi::managed_external_buffer::handle_t ipc_control_handle,
+    const std::string& model_instance_name)
 {
   model_path_ = model_path;
   model_version_ = model_version;
@@ -138,22 +140,20 @@ Stub::Instantiate(
   model_instance_name_ = model_instance_name;
   health_mutex_ = nullptr;
   initialized_ = false;
-  cuda_ipc_open_mutex_ = std::make_shared<std::mutex>();
-  cuda_ipc_close_mutex_ = std::make_shared<std::mutex>();
 
   try {
-    shm_pool_ = std::make_unique<SharedMemory>(
-        shm_region_name, shm_default_size, shm_growth_size);
+    shm_pool_ = std::make_unique<SharedMemoryManager>(
+        shm_region_name, shm_default_size, shm_growth_size, false /* create */);
 
-    shm_pool_->MapOffset((char**)&ipc_control_, ipc_control_offset);
+    AllocatedSharedMemory<IPCControlShm> ipc_control =
+        shm_pool_->Load<IPCControlShm>(ipc_control_handle);
+    ipc_control_ =
ipc_control.data_.get(); - bi::interprocess_mutex* health_mutex; - shm_pool_->MapOffset( - (char**)&health_mutex, ipc_control_->stub_health_mutex); - health_mutex_ = health_mutex; + health_mutex_ = &(ipc_control_->stub_health_mutex); stub_message_queue_ = MessageQueue::LoadFromSharedMemory( shm_pool_, ipc_control_->stub_message_queue); + parent_message_queue_ = MessageQueue::LoadFromSharedMemory( shm_pool_, ipc_control_->parent_message_queue); @@ -203,114 +203,18 @@ Stub::Health() return ipc_control_->stub_health; } -std::unique_ptr& -Stub::GetSharedMemory() +std::unique_ptr& +Stub::SharedMemory() { return shm_pool_; } -void -Stub::SetErrorForResponse(Response* response, const char* err_message) -{ - off_t err_string_offset = 0; - response->is_error_set = false; - response->has_error = true; - LOG_IF_EXCEPTION( - SaveStringToSharedMemory(shm_pool_, err_string_offset, err_message)); - - if (err_string_offset != 0) { - response->error = err_string_offset; - response->is_error_set = true; - } -} - -void -Stub::SetErrorForResponseBatch( - ResponseBatch* response_batch, const char* err_message) -{ - off_t err_string_offset = 0; - response_batch->is_error_set = false; - response_batch->has_error = true; - LOG_IF_EXCEPTION( - SaveStringToSharedMemory(shm_pool_, err_string_offset, err_message)); - - if (err_string_offset != 0) { - response_batch->error = err_string_offset; - response_batch->is_error_set = true; - } -} - -void -Stub::ProcessResponse( - Response* response_shm, ResponseBatch* response_batch, - InferResponse* response) -{ - // Initialize has_error to false - response_shm->has_error = false; - - bool has_error = response->HasError(); - - if (has_error) { - response_shm->has_error = true; - py::str py_string_err = response->Error()->Message(); - std::string response_error = py_string_err; - SetErrorForResponse(response_shm, response_error.c_str()); - - // Skip the response value when the response has error. 
- return; - } - - std::vector>& output_tensors = - response->OutputTensors(); - for (auto& output_tensor : output_tensors) { - if (!output_tensor->IsCPU()) { -#ifdef TRITON_ENABLE_GPU - AddToTensorsToRemove(output_tensor); -#else - throw PythonBackendException("GPU tensors is not supported."); -#endif - } - } - response->SaveToSharedMemory( - shm_pool_, response_shm, true /* copy_cpu */, false /* copy_gpu */); -} - -void -Stub::AddToTensorsToRemove(std::shared_ptr tensor) -{ - std::lock_guard guard{tensors_to_remove_mutex_}; - output_gpu_tensors_.push_back(tensor); -} - -std::shared_ptr -Stub::ProcessRequest(off_t request_offset, ResponseBatch* response_batch) -{ - std::shared_ptr infer_request = - InferRequest::LoadFromSharedMemory( - shm_pool_, request_offset, cuda_ipc_open_mutex_, - cuda_ipc_close_mutex_); - - for (auto& tensor : infer_request->Inputs()) { - if (!tensor->IsCPU()) - input_gpu_tensors_.push_back(tensor); - } - - return infer_request; -} - -void -Stub::SetResponseFromException( - ResponseBatch* response_batch, const PythonBackendException& pb_exception) -{ - SetErrorForResponseBatch(response_batch, pb_exception.what()); -} - std::unique_ptr Stub::PopMessage() { bool success = false; std::unique_ptr ipc_message; - off_t message; + bi::managed_external_buffer::handle_t message; while (!success) { message = stub_message_queue_->Pop(1000, success); } @@ -320,38 +224,38 @@ Stub::PopMessage() return ipc_message; } -std::shared_ptr& -Stub::CudaIpcCloseMutex() -{ - return cuda_ipc_close_mutex_; -} - -std::shared_ptr& -Stub::CudaIpcOpenMutex() -{ - return cuda_ipc_open_mutex_; -} - bool Stub::RunCommand() { + NVTX_RANGE(nvtx_, "RunCommand " + model_instance_name_); std::unique_ptr ipc_message = this->PopMessage(); - switch (ipc_message->Command()) { case PYTHONSTUB_CommandType::PYTHONSTUB_InitializeRequest: { bool has_exception = false; std::string error_string; std::unique_ptr initialize_response_msg = - std::make_unique(shm_pool_, false); + IPCMessage::Create(shm_pool_, false /* inline_response */); initialize_response_msg->Command() = PYTHONSTUB_InitializeResponse; - - InitializeResponse* initialize_response; - shm_pool_->Map( - (char**)&initialize_response, sizeof(InitializeResponse), - initialize_response_msg->Args()); - initialize_response->response_has_error = false; - initialize_response->response_is_error_set = false; + std::unique_ptr error_string_shm; + AllocatedSharedMemory initialize_response = + shm_pool_->Construct(); + + // The initialization is done in three steps. First the main process sends + // a message to the stub process asking to begin to initilize the Python + // model. After that is finished stub process sends a message to the + // parent process that the initialization is finished. Finally, the + // parent process sends a message to the stub process asking the stub + // process to release any objects it has held in shared memory. 
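Editorial aside (not part of the patch): the two `ScopedDefer` objects created just below run their callbacks when they go out of scope, in reverse order of construction, so the three-step handshake described in the comment above unrolls to roughly the sequence sketched here, using only names already defined in this function:

```
// 1. Parent -> stub: the PYTHONSTUB_InitializeRequest message popped by
//    RunCommand() tells the stub to initialize the Python model.
Initialize(ipc_message->Args());

// 2. Stub -> parent: report that initialization (or its failure) is done.
SendIPCMessage(initialize_response_msg);

// 3. Parent -> stub: wait for a dummy message telling the stub it may now
//    release the shared-memory objects it holds for this exchange.
stub_message_queue_->Pop();
```

Writing the steps as deferred callbacks keeps this ordering even when `Initialize()` throws, which is why the real code below prefers `ScopedDefer` over explicit calls.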
+ ScopedDefer receive_initialize_finalize( + std::bind([this] { stub_message_queue_->Pop(); })); + ScopedDefer _(std::bind([this, &initialize_response_msg] { + SendIPCMessage(initialize_response_msg); + })); + + initialize_response.data_->response_has_error = false; + initialize_response.data_->response_is_error_set = false; + initialize_response_msg->Args() = initialize_response.handle_; try { Initialize(ipc_message->Args()); @@ -367,39 +271,60 @@ Stub::RunCommand() if (has_exception) { LOG_INFO << "Failed to initialize Python stub: " << error_string; - initialize_response->response_has_error = true; - initialize_response->response_is_error_set = false; - off_t err_string_offset; - LOG_IF_EXCEPTION(SaveStringToSharedMemory( - shm_pool_, err_string_offset, error_string.c_str())); - if (err_string_offset != 0) { - initialize_response->response_is_error_set = true; - initialize_response->response_error = err_string_offset; + initialize_response.data_->response_has_error = true; + initialize_response.data_->response_is_error_set = false; + + LOG_IF_EXCEPTION( + error_string_shm = PbString::Create(shm_pool_, error_string)); + if (error_string_shm != nullptr) { + initialize_response.data_->response_is_error_set = true; + initialize_response.data_->response_error = + error_string_shm->ShmHandle(); } - this->SendIPCMessage(initialize_response_msg); - return true; + return true; // Terminate the stub process. } - this->SendIPCMessage(initialize_response_msg); } break; case PYTHONSTUB_CommandType::PYTHONSTUB_ExecuteRequest: { - RequestBatch* request_batch; bool has_exception = false; std::string error_string; std::unique_ptr execute_response = - std::make_unique(shm_pool_, false /* Inline response */); + IPCMessage::Create(shm_pool_, false /* Inline response */); execute_response->Command() = PYTHONSTUB_ExecuteResposne; - shm_pool_->MapOffset((char**)&request_batch, ipc_message->Args()); - ResponseBatch* response_batch; - shm_pool_->Map( - (char**)&response_batch, sizeof(ResponseBatch), - execute_response->Args()); - response_batch->has_error = false; + AllocatedSharedMemory request_batch = + shm_pool_->Load(ipc_message->Args()); + RequestBatch* request_batch_shm_ptr = + reinterpret_cast(request_batch.data_.get()); + AllocatedSharedMemory response_batch = shm_pool_->Construct( + request_batch_shm_ptr->batch_size * + sizeof(bi::managed_external_buffer::handle_t) + + sizeof(ResponseBatch)); + ResponseBatch* response_batch_shm_ptr = + reinterpret_cast(response_batch.data_.get()); + + std::unique_ptr error_string_shm; + py::list inference_responses; + + bi::managed_external_buffer::handle_t* responses_shm_handle = + reinterpret_cast( + response_batch.data_.get() + sizeof(ResponseBatch)); + + execute_response->Args() = response_batch.handle_; + + ScopedDefer execute_finalize( + std::bind([this] { stub_message_queue_->Pop(); })); + ScopedDefer _(std::bind( + [this, &execute_response] { SendIPCMessage(execute_response); })); + + response_batch_shm_ptr->has_error = false; + response_batch_shm_ptr->is_error_set = false; try { - Execute(request_batch, response_batch); + inference_responses = Execute( + request_batch_shm_ptr, response_batch_shm_ptr, + responses_shm_handle); } catch (const PythonBackendException& pb_exception) { has_exception = true; @@ -417,25 +342,20 @@ Stub::RunCommand() model_instance_name_ + "', message: ") + error_string; LOG_INFO << err_message.c_str(); - response_batch->has_error = true; - response_batch->is_error_set = false; - off_t err_string_offset = 0; - 
LOG_IF_EXCEPTION(SaveStringToSharedMemory( - shm_pool_, err_string_offset, error_string.c_str())); - if (err_string_offset != 0) { - response_batch->is_error_set = true; - response_batch->error = err_string_offset; - } + error_string_shm = PbString::Create(shm_pool_, error_string); + response_batch_shm_ptr->has_error = true; + response_batch_shm_ptr->is_error_set = true; + response_batch_shm_ptr->error = error_string_shm->ShmHandle(); } - this->SendIPCMessage(execute_response); + } break; case PYTHONSTUB_CommandType::PYTHONSTUB_FinalizeRequest: ipc_message->Command() = PYTHONSTUB_FinalizeResponse; - this->SendIPCMessage(ipc_message); - return true; + SendIPCMessage(ipc_message); + return true; // Terminate the stub process case PYTHONSTUB_CommandType::PYTHONSTUB_LoadGPUBuffers: try { - LoadGPUBuffers(); + LoadGPUBuffers(ipc_message); } catch (const PythonBackendException& pb_exception) { LOG_INFO << "An error occurred while trying to load GPU buffers in the " @@ -443,8 +363,6 @@ Stub::RunCommand() << pb_exception.what() << std::endl; } - ipc_message->Command() = PYTHONSTUB_LoadGPUBuffers; - this->SendIPCMessage(ipc_message); break; default: break; @@ -454,91 +372,7 @@ Stub::RunCommand() } void -Stub::Execute(RequestBatch* request_batch, ResponseBatch* response_batch) -{ - uint32_t batch_size = request_batch->batch_size; - - if (batch_size == 0) { - return; - } - - py::list py_request_list; - for (size_t i = 0; i < batch_size; i++) { - off_t request_offset = request_batch->requests + i * sizeof(Request); - py_request_list.append(ProcessRequest(request_offset, response_batch)); - } - - if (!py::hasattr(model_instance_, "execute")) { - std::string message = - "Python model " + model_path_ + " does not implement `execute` method."; - throw PythonBackendException(message); - } - py::object request_list = py_request_list; - py::module asyncio = py::module::import("asyncio"); - - // Execute Response - py::object execute_return = model_instance_.attr("execute")(request_list); - py::object responses_obj; - bool is_coroutine = asyncio.attr("iscoroutine")(execute_return).cast(); - - if (is_coroutine) { - responses_obj = asyncio.attr("run")(execute_return); - } else { - responses_obj = execute_return; - } - - // Check the return type of execute function. - if (!py::isinstance(responses_obj)) { - std::string str = py::str(execute_return.get_type()); - throw PythonBackendException( - std::string("Expected a list in the execute return, found type '") + - str + "'."); - } - - py::list responses = responses_obj; - - Response* responses_shm; - off_t responses_shm_offset; - size_t response_size = py::len(responses); - - // If the number of request objects do not match the number of resposne - // objects throw an error. - if (response_size != batch_size) { - std::string err = - "Number of InferenceResponse objects do not match the number of " - "InferenceRequest objects. InferenceRequest(s) size is:" + - std::to_string(batch_size) + - ", and InferenceResponse(s) size is:" + std::to_string(response_size) + - "\n"; - throw PythonBackendException(err); - } - - shm_pool_->Map( - (char**)&responses_shm, sizeof(Response) * response_size, - responses_shm_offset); - response_batch->responses = responses_shm_offset; - response_batch->batch_size = response_size; - - size_t i = 0; - for (auto& response : responses) { - // Check the return type of execute function. 
- if (!py::isinstance(response)) { - std::string str = py::str(response.get_type()); - throw PythonBackendException( - std::string("Expected an 'InferenceResponse' object in the execute " - "function return list, found type '") + - str + "'."); - } - - InferResponse* infer_response = response.cast(); - Response* response_shm = &responses_shm[i]; - ProcessResponse(response_shm, response_batch, infer_response); - i += 1; - } -} - -void -Stub::Initialize(off_t map_offset) +Stub::Initialize(bi::managed_external_buffer::handle_t map_handle) { py::module sys = py::module_::import("sys"); @@ -569,6 +403,12 @@ Stub::Initialize(off_t map_offset) py::module_::import("triton_python_backend_utils"); py::module c_python_backend_utils = py::module_::import("c_python_backend_utils"); + py::setattr( + python_backend_utils, "TritonError", + c_python_backend_utils.attr("TritonError")); + py::setattr( + python_backend_utils, "TritonModelException", + c_python_backend_utils.attr("TritonModelException")); py::setattr( python_backend_utils, "Tensor", c_python_backend_utils.attr("Tensor")); py::setattr( @@ -577,12 +417,6 @@ Stub::Initialize(off_t map_offset) py::setattr( python_backend_utils, "InferenceResponse", c_python_backend_utils.attr("InferenceResponse")); - py::setattr( - python_backend_utils, "TritonError", - c_python_backend_utils.attr("TritonError")); - py::setattr( - python_backend_utils, "TritonModelException", - c_python_backend_utils.attr("TritonModelException")); py::object TritonPythonModel = py::module_::import( @@ -593,7 +427,12 @@ Stub::Initialize(off_t map_offset) model_instance_ = TritonPythonModel(); std::unordered_map map; - LoadMapFromSharedMemory(shm_pool_, map_offset, map); + std::unique_ptr pb_map_shm = + PbMap::LoadFromSharedMemory(shm_pool_, map_handle); + + // Get the unordered_map representation of the map in shared memory. + map = pb_map_shm->UnorderedMap(); + py::dict model_config_params; for (const auto& pair : map) { @@ -609,29 +448,199 @@ Stub::Initialize(off_t map_offset) } void -Stub::UpdateHealth() +Stub::ProcessResponse(InferResponse* response) { - bi::scoped_lock lock(*health_mutex_); - ipc_control_->stub_health = true; + response->SaveToSharedMemory(shm_pool_, false /* copy_gpu */); + + for (auto& output_tensor : response->OutputTensors()) { + if (!output_tensor->IsCPU()) { + gpu_tensors_.push_back(output_tensor); + } + } } void -Stub::LoadGPUBuffers() +Stub::LoadGPUBuffers(std::unique_ptr& ipc_message) { - std::lock_guard guard{tensors_to_remove_mutex_}; -#ifdef TRITON_ENABLE_GPU - for (auto& tensor : output_gpu_tensors_) { - if (tensor->RawDataShm()->memory_type == TRITONSERVER_MEMORY_GPU) { - tensor->SetCudaIpcMutexes(CudaIpcOpenMutex(), CudaIpcCloseMutex()); - tensor->LoadGPUData(shm_pool_); - } else { - tensor->CopyToCPU(shm_pool_); + AllocatedSharedMemory gpu_buffers_handle = + shm_pool_->Load(ipc_message->Args()); + + uint64_t* gpu_buffer_count = + reinterpret_cast(gpu_buffers_handle.data_.get()); + bi::managed_external_buffer::handle_t* gpu_buffers_handle_shm = + reinterpret_cast( + gpu_buffers_handle.data_.get() + sizeof(uint64_t)); + + if (gpu_tensors_.size() != *gpu_buffer_count) { + LOG_INFO + << (std::string( + "GPU buffers size does not match the provided buffers: ") + + std::to_string(gpu_tensors_.size()) + + " != " + std::to_string(*gpu_buffer_count)); + return; + } + + // We need to hold the cpu_buffers until the main process makes a copy from + // them. 
+ std::vector> cpu_buffers; + std::vector> dst_buffers; + + bool has_cpu_buffer = false; + for (size_t i = 0; i < gpu_tensors_.size(); i++) { + std::unique_ptr dst_buffer = PbMemory::LoadFromSharedMemory( + shm_pool_, gpu_buffers_handle_shm[i], true /* open_cuda_handle */); + if (dst_buffer->MemoryType() == TRITONSERVER_MEMORY_CPU) { + has_cpu_buffer = true; } + dst_buffers.emplace_back(std::move(dst_buffer)); } -#endif // TRITON_ENABLE_GPU - output_gpu_tensors_.clear(); - input_gpu_tensors_.clear(); + // Pop a dummy message from the stub message queue indicating that the parent + // has finished copying the tensors. + ScopedDefer _(std::bind([this, has_cpu_buffer] { + if (has_cpu_buffer) { + stub_message_queue_->Pop(); + } + })); + + ScopedDefer load_gpu_buffer_response( + std::bind([this, has_cpu_buffer] { parent_message_queue_->Push(1000); })); + + for (size_t i = 0; i < gpu_tensors_.size(); i++) { + std::shared_ptr& src_buffer = gpu_tensors_[i]; + + // If the memory type is CPU, the buffer is empty and we need to create + // a buffer. + if (dst_buffers[i]->MemoryType() == TRITONSERVER_MEMORY_CPU) { + dst_buffers[i] = PbMemory::Create( + shm_pool_, dst_buffers[i]->MemoryType(), + dst_buffers[i]->MemoryTypeId(), src_buffer->ByteSize(), + nullptr /* buffer */); + + // Update the handle so that the main process can load it. + gpu_buffers_handle_shm[i] = dst_buffers[i]->ShmHandle(); + } + + PbMemory::CopyBuffer(dst_buffers[i], src_buffer->Memory()); + + if (dst_buffers[i]->MemoryType() == TRITONSERVER_MEMORY_CPU) { + cpu_buffers.push_back(std::move(dst_buffers[i])); + } + } + + gpu_tensors_.clear(); +} + +py::list +Stub::Execute( + RequestBatch* request_batch_shm_ptr, ResponseBatch* response_batch_shm_ptr, + bi::managed_external_buffer::handle_t* responses_shm_handle) +{ + uint32_t batch_size = request_batch_shm_ptr->batch_size; + py::list responses; + + if (batch_size == 0) { + return responses; + } + + py::list py_request_list; + bi::managed_external_buffer::handle_t* request_shm_handle = + reinterpret_cast( + reinterpret_cast(request_batch_shm_ptr) + + sizeof(RequestBatch)); + + for (size_t i = 0; i < batch_size; i++) { + std::shared_ptr infer_request = + InferRequest::LoadFromSharedMemory( + shm_pool_, request_shm_handle[i], true /* open_cuda_handle */); + py_request_list.append(std::move(infer_request)); + } + + if (!py::hasattr(model_instance_, "execute")) { + std::string message = + "Python model " + model_path_ + " does not implement `execute` method."; + throw PythonBackendException(message); + } + + py::object request_list = py_request_list; + py::module asyncio = py::module::import("asyncio"); + + // Execute Response + py::object execute_return; + py::object responses_obj; + bool is_coroutine; + + { + NVTX_RANGE(nvtx_, "PyExecute " + model_instance_name_); + execute_return = model_instance_.attr("execute")(request_list); + is_coroutine = asyncio.attr("iscoroutine")(execute_return).cast(); + } + + if (is_coroutine) { + responses_obj = asyncio.attr("run")(execute_return); + } else { + responses_obj = execute_return; + } + + // Check the return type of execute function. + if (!py::isinstance(responses_obj)) { + std::string str = py::str(execute_return.get_type()); + throw PythonBackendException( + std::string("Expected a list in the execute return, found type '") + + str + "'."); + } + + responses = responses_obj; + size_t response_size = py::len(responses); + + // If the number of request objects do not match the number of + // resposne objects throw an error. 
+ if (response_size != batch_size) { + std::string err = + "Number of InferenceResponse objects do not match the number " + "of " + "InferenceRequest objects. InferenceRequest(s) size is:" + + std::to_string(batch_size) + + ", and InferenceResponse(s) size is:" + std::to_string(response_size) + + "\n"; + throw PythonBackendException(err); + } + for (auto& response : responses) { + // Check the return type of execute function. + if (!py::isinstance(response)) { + std::string str = py::str(response.get_type()); + throw PythonBackendException( + std::string("Expected an 'InferenceResponse' object in the execute " + "function return list, found type '") + + str + "'."); + } + } + + response_batch_shm_ptr->batch_size = response_size; + + std::vector> gpu_tensors; + + size_t i = 0; + for (auto& response : responses) { + InferResponse* infer_response = response.cast(); + ProcessResponse(infer_response); + for (auto output_tensor : infer_response->OutputTensors()) { + if (!output_tensor->IsCPU()) { + gpu_tensors.push_back(output_tensor); + } + } + responses_shm_handle[i] = infer_response->ShmHandle(); + i += 1; + } + + return responses; +} + +void +Stub::UpdateHealth() +{ + bi::scoped_lock lock(*health_mutex_); + ipc_control_->stub_health = true; } void @@ -653,8 +662,7 @@ Stub::SendIPCMessage(std::unique_ptr& ipc_message) { bool success = false; while (!success) { - parent_message_queue_->Push( - ipc_message->SharedMemoryOffset(), 1000, success); + parent_message_queue_->Push(ipc_message->ShmHandle(), 1000, success); } } @@ -673,7 +681,7 @@ std::unique_ptr Stub::stub_instance_; std::unique_ptr& Stub::GetOrCreateInstance() { - if (stub_instance_.get() == nullptr) { + if (Stub::stub_instance_.get() == nullptr) { Stub::stub_instance_ = std::make_unique(); } @@ -682,14 +690,9 @@ Stub::GetOrCreateInstance() PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) { - py::class_>(module, "Tensor") - .def(py::init(&PbTensor::FromNumpy)) - .def("name", &PbTensor::Name) - .def("as_numpy", &PbTensor::AsNumpy) - .def("triton_dtype", &PbTensor::TritonDtype) - .def("to_dlpack", &PbTensor::ToDLPack) - .def("is_cpu", &PbTensor::IsCPU) - .def("from_dlpack", &PbTensor::FromDLPack); + py::class_>(module, "TritonError") + .def(py::init()) + .def("message", &PbError::Message); py::class_>( module, "InferenceRequest") @@ -728,6 +731,15 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) "requested_output_names", &InferRequest::RequestedOutputNames, py::return_value_policy::reference_internal); + py::class_>(module, "Tensor") + .def(py::init(&PbTensor::FromNumpy)) + .def("name", &PbTensor::Name) + .def("as_numpy", &PbTensor::AsNumpy) + .def("triton_dtype", &PbTensor::TritonDtype) + .def("to_dlpack", &PbTensor::ToDLPack) + .def("is_cpu", &PbTensor::IsCPU) + .def("from_dlpack", &PbTensor::FromDLPack); + py::class_(module, "InferenceResponse") .def( py::init< @@ -740,9 +752,6 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) .def("has_error", &InferResponse::HasError) .def("error", &InferResponse::Error); - py::class_>(module, "TritonError") - .def(py::init()) - .def("message", &PbError::Message); py::register_exception( module, "TritonModelException"); @@ -793,7 +802,7 @@ main(int argc, char** argv) stub->Instantiate( shm_growth_size, shm_default_size, shm_region_name, model_path, model_version, argv[6] /* triton install path */, - std::stoi(argv[7]) /* IPCControl offset */, model_instance_name); + std::stoi(argv[7]) /* IPCControl handle */, model_instance_name); } catch (const PythonBackendException& 
pb_exception) { LOG_INFO << "Failed to preinitialize Python stub: " << pb_exception.what(); @@ -829,11 +838,9 @@ main(int argc, char** argv) } }); - // This is the only place where NotifyParent() and WaitForNotification() are - // allowed to be called. The stub process will always keep listening for new - // notifications from the parent process. After the notification is received - // the stub process will run the appropriate comamnd and wait for new - // notifications. + // The stub process will always keep listening for new notifications from the + // parent process. After the notification is received the stub process will + // run the appropriate comamnd and wait for new notifications. bool finalize = false; while (true) { if (finalize) { diff --git a/src/pb_stub.h b/src/pb_stub.h index 9610468f..91b44efe 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -1,4 +1,4 @@ -// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -35,8 +35,8 @@ #include #include "infer_request.h" #include "infer_response.h" +#include "ipc_message.h" #include "message_queue.h" -#include "pb_tensor.h" #include "pb_utils.h" #pragma once @@ -48,6 +48,53 @@ using namespace pybind11::literals; namespace triton { namespace backend { namespace python { class Stub { + public: + Stub(){}; + static std::unique_ptr& GetOrCreateInstance(); + + /// Instantiate a new Python backend Stub. + void Instantiate( + int64_t shm_growth_size, int64_t shm_default_size, + const std::string& shm_region_name, const std::string& model_path, + const std::string& model_version, const std::string& triton_install_path, + bi::managed_external_buffer::handle_t ipc_control_handle, + const std::string& model_instance_name); + + /// Get the health of the stub process. + bool& Health(); + + /// Get the shared memory manager. + std::unique_ptr& SharedMemory(); + + /// Run a single command from the shared memory. + bool RunCommand(); + + /// Initialize the user's Python code. + void Initialize(bi::managed_external_buffer::handle_t map_handle); + + /// Send a message to the parent process. + void SendIPCMessage(std::unique_ptr& ipc_message); + + /// Receive a message from the parent process. + std::unique_ptr PopMessage(); + + /// Update the health variable in the stub process. + void UpdateHealth(); + + /// Finalize and terminate the stub process + void Finalize(); + + /// Execute a batch of requests. 
+ py::list Execute( + RequestBatch* request_batch_shm_ptr, + ResponseBatch* response_batch_shm_ptr, + bi::managed_external_buffer::handle_t* responses_shm_handle); + + void ProcessResponse(InferResponse* response); + void LoadGPUBuffers(std::unique_ptr& ipc_message); + ~Stub(); + + private: bi::interprocess_mutex* stub_mutex_; bi::interprocess_condition* stub_cond_; bi::interprocess_mutex* parent_mutex_; @@ -58,65 +105,19 @@ class Stub { std::string model_version_; std::string model_instance_name_; std::string triton_install_path_; - IPCControl* ipc_control_; - std::unique_ptr shm_pool_; + IPCControlShm* ipc_control_; + std::unique_ptr shm_pool_; py::object model_instance_; py::object deserialize_bytes_; py::object serialize_bytes_; std::unique_ptr stub_message_queue_; std::unique_ptr parent_message_queue_; - std::vector> output_gpu_tensors_; - std::vector> input_gpu_tensors_; std::mutex tensors_to_remove_mutex_; std::vector> messages_; std::mutex messages_mutex_; - std::shared_ptr cuda_ipc_open_mutex_; - std::shared_ptr cuda_ipc_close_mutex_; std::condition_variable messages_cv_; - py::object thread_pool_; - bool require_cleanup_; bool initialized_; static std::unique_ptr stub_instance_; - - public: - Stub(){}; - static std::unique_ptr& GetOrCreateInstance(); - - void Instantiate( - int64_t shm_growth_size, int64_t shm_default_size, - const std::string& shm_region_name, const std::string& model_path, - const std::string& model_version, const std::string& triton_install_path, - off_t ipc_control_offset, const std::string& model_instance_name); - py::object GetThreadPool(); - void NotifyParent(); - bool& Health(); - std::unique_ptr& GetSharedMemory(); - void SetErrorForResponse(Response* response, const char* err_message); - void SetErrorForResponseBatch( - ResponseBatch* response_batch, const char* err_message); - void ProcessResponse( - Response* response_shm, ResponseBatch* response_batch, - InferResponse* response); - std::shared_ptr ProcessRequest( - off_t request_offset, ResponseBatch* response_batch); - void SetResponseFromException( - ResponseBatch* response_batch, - const PythonBackendException& pb_exception); - bool RunCommand(); - std::unique_ptr Poll(); - void Execute(RequestBatch* request_batch, ResponseBatch* response_batch); - void Initialize(off_t map_offset); - void SendIPCMessage(std::unique_ptr& ipc_message); - std::unique_ptr PopMessage(); - void AddToTensorsToRemove(std::shared_ptr tensor); - std::shared_ptr& CudaIpcOpenMutex(); - std::shared_ptr& CudaIpcCloseMutex(); - - void Fetch(); - void UpdateHealth(); - void LoadGPUBuffers(); - void Finalize(); - - ~Stub(); + std::vector> gpu_tensors_; }; }}} // namespace triton::backend::python diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc index ea4baadf..8cbfcf4e 100644 --- a/src/pb_tensor.cc +++ b/src/pb_tensor.cc @@ -33,7 +33,6 @@ namespace py = pybind11; #endif #include "pb_tensor.h" -#include "pb_utils.h" namespace triton { namespace backend { namespace python { @@ -43,7 +42,6 @@ PbTensor::PbTensor(const std::string& name, py::object numpy_array) : name_(name) { dtype_ = numpy_to_triton_type(numpy_array.attr("dtype")); - tensor_type_ = PYTHONBACKEND_NUMPY; memory_type_ = TRITONSERVER_MEMORY_CPU; memory_type_id_ = 0; dl_managed_tensor_ = nullptr; @@ -105,7 +103,6 @@ PbTensor::PbTensor( memory_ptr_ = numpy_array_.request().ptr; byte_size_ = numpy_array_.nbytes(); } - tensor_type_ = PYTHONBACKEND_NUMPY; memory_type_ = TRITONSERVER_MEMORY_CPU; dtype_ = dtype; @@ -125,7 +122,8 @@ PbTensor::PbTensor( const std::string& name, 
const std::vector& dims, TRITONSERVER_DataType dtype, TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, void* memory_ptr, uint64_t byte_size, - DLManagedTensor* dl_managed_tensor, off_t shm_offset) + DLManagedTensor* dl_managed_tensor, + bi::managed_external_buffer::handle_t shm_handle) { name_ = name; memory_ptr_ = memory_ptr; @@ -133,7 +131,7 @@ PbTensor::PbTensor( memory_type_id_ = memory_type_id; dtype_ = dtype; dims_ = dims; - raw_shm_offset_ = shm_offset; + // [FIXME] fix shm_handle #ifdef TRITON_PB_STUB if (memory_type_ == TRITONSERVER_MEMORY_CPU || @@ -159,22 +157,13 @@ PbTensor::PbTensor( byte_size_ = byte_size; dl_managed_tensor_ = dl_managed_tensor; - - if (dl_managed_tensor != nullptr) { - tensor_type_ = PYTHONBACKEND_DLPACK; - } else { - tensor_type_ = PYTHONBACKEND_RAW; - } } bool PbTensor::IsCPU() const { - if (tensor_type_ == PYTHONBACKEND_NUMPY || - ((tensor_type_ == PYTHONBACKEND_RAW || - tensor_type_ == PYTHONBACKEND_DLPACK) && - (memory_type_ == TRITONSERVER_MEMORY_CPU || - memory_type_ == TRITONSERVER_MEMORY_CPU_PINNED))) { + if (memory_type_ == TRITONSERVER_MEMORY_CPU || + memory_type_ == TRITONSERVER_MEMORY_CPU_PINNED) { return true; } else { return false; @@ -193,12 +182,6 @@ PbTensor::MemoryTypeId() const return memory_type_id_; } -off_t -PbTensor::RawShmOffset() -{ - return raw_shm_offset_; -} - uint64_t PbTensor::ByteSize() const { @@ -211,10 +194,14 @@ PbTensor::Dims() const return dims_; } -PYTHONBACKEND_TensorType -PbTensor::TensorType() const +void +PbTensor::SetMemory(std::unique_ptr&& memory) { - return tensor_type_; + pb_memory_ = std::move(memory); + memory_type_ = pb_memory_->MemoryType(); + memory_type_id_ = pb_memory_->MemoryTypeId(); + byte_size_ = pb_memory_->ByteSize(); + memory_ptr_ = pb_memory_->DataPtr(); } #ifdef TRITON_PB_STUB @@ -297,86 +284,10 @@ PbTensor::DeleteDLPack() } } -std::shared_ptr -PbTensor::LoadFromSharedMemory( - std::unique_ptr& shm_pool, off_t tensor_offset, - std::shared_ptr& cuda_ipc_open_mutex, - std::shared_ptr& cuda_ipc_close_mutex) +std::unique_ptr& +PbTensor::Memory() { - Tensor* tensor_shm; - shm_pool->MapOffset((char**)&tensor_shm, tensor_offset); - - char* name; - LoadStringFromSharedMemory(shm_pool, tensor_shm->name, name); - std::string name_str = name; - - size_t dims_count = tensor_shm->dims_count; - RawData* raw_data; - shm_pool->MapOffset((char**)&raw_data, tensor_shm->raw_data); - - int64_t* dims; - shm_pool->MapOffset((char**)&dims, tensor_shm->dims); - - std::string reused_gpu_tensor_name; - std::shared_ptr pb_tensor; - - char* data = nullptr; - if (raw_data->memory_type == TRITONSERVER_MEMORY_CPU) { - shm_pool->MapOffset((char**)&data, raw_data->memory_ptr); - pb_tensor = std::make_shared( - name, std::vector(dims, dims + dims_count), tensor_shm->dtype, - raw_data->memory_type, raw_data->memory_type_id, data, - raw_data->byte_size, nullptr /* DLManaged Tensor */); - } else if (raw_data->memory_type == TRITONSERVER_MEMORY_GPU) { -#ifdef TRITON_ENABLE_GPU - cudaIpcMemHandle_t* cuda_ipc_mem_handle; - shm_pool->MapOffset((char**)&cuda_ipc_mem_handle, raw_data->memory_ptr); - if (tensor_shm->is_cuda_handle_set) { - cudaSetDevice(raw_data->memory_type_id); - - if (cuda_ipc_open_mutex != nullptr) - cuda_ipc_open_mutex->lock(); - - cudaError_t err = cudaIpcOpenMemHandle( - (void**)&data, *cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess); - - if (cuda_ipc_open_mutex != nullptr) - cuda_ipc_open_mutex->unlock(); - - if (err != cudaSuccess) { - throw PythonBackendException(std::string( - "failed to open 
cuda ipc handle: " + - std::string(cudaGetErrorString(err))) - .c_str()); - } - // Adjust the offset. cudaIpcOpenMemHandle will map the base address of a - // GPU pointer and the offset is not preserved when transferring the - // pointer using cudaIpcMemHandle. - data = data + raw_data->offset; - pb_tensor = std::make_shared( - name, std::vector(dims, dims + dims_count), - tensor_shm->dtype, raw_data->memory_type, raw_data->memory_type_id, - data, raw_data->byte_size, nullptr /* DLManaged Tensor */); - pb_tensor->destruct_cuda_ipc_mem_handle_ = true; - } else { - pb_tensor = std::make_shared( - name, std::vector(dims, dims + dims_count), - tensor_shm->dtype, raw_data->memory_type, raw_data->memory_type_id, - data, raw_data->byte_size, nullptr /* DLManaged Tensor */); - pb_tensor->destruct_cuda_ipc_mem_handle_ = false; - pb_tensor->is_cuda_handle_set_ = false; - } - pb_tensor->cuda_ipc_mem_handle_ = cuda_ipc_mem_handle; - pb_tensor->SetCudaIpcMutexes(cuda_ipc_open_mutex, cuda_ipc_close_mutex); -#else - throw PythonBackendException("GPU Tensor is not supported."); -#endif // TRITON_ENABLE_GPU - } - pb_tensor->tensor_shm_ = tensor_shm; - pb_tensor->raw_data_shm_ = raw_data; - pb_tensor->shm_offset_ = tensor_offset; - - return pb_tensor; + return pb_memory_; } #ifdef TRITON_PB_STUB @@ -430,7 +341,7 @@ PbTensor::FromDLPack(const std::string& name, const py::capsule& dlpack_tensor) memory_type_id = 0; break; case DLDeviceType::kDLCUDAHost: - memory_type = TRITONSERVER_MEMORY_CPU_PINNED; + memory_type = TRITONSERVER_MEMORY_CPU; memory_type_id = 0; break; default: @@ -460,28 +371,6 @@ PbTensor::FromDLPack(const std::string& name, const py::capsule& dlpack_tensor) PbTensor::~PbTensor() noexcept(false) { -#ifdef TRITON_ENABLE_GPU - if (!IsCPU() && cuda_ipc_mem_handle_ != nullptr && - destruct_cuda_ipc_mem_handle_) { - // Mutex needs to be used since calls to cudaIpcCloseMemHandle are not - // thread safe. 
- if (cuda_ipc_close_mutex_ != nullptr) - cuda_ipc_close_mutex_->lock(); - - cudaError_t err = cudaIpcCloseMemHandle(GetGPUStartAddress()); - - if (cuda_ipc_close_mutex_ != nullptr) - cuda_ipc_close_mutex_->unlock(); - - cuda_ipc_mem_handle_ = nullptr; - if (err != cudaSuccess) { - throw PythonBackendException(std::string( - "failed to close cuda ipc handle: " + - std::string(cudaGetErrorString(err))) - .c_str()); - } - } -#endif // TRITON_ENABLE_GPU DeleteDLPack(); } @@ -495,7 +384,7 @@ PbTensor::Name() const const py::array& PbTensor::AsNumpy() const { - if (this->IsCPU()) { + if (IsCPU()) { return numpy_array_; } else { throw PythonBackendException( @@ -506,282 +395,127 @@ PbTensor::AsNumpy() const } #endif // TRITON_PB_STUB -#ifdef TRITON_ENABLE_GPU -void* -PbTensor::GetGPUStartAddress() -{ - if (!this->IsCPU()) { - CUDADriverAPI& driver_api = CUDADriverAPI::getInstance(); - CUdeviceptr start_address; - - driver_api.PointerGetAttribute( - &start_address, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, - (CUdeviceptr)this->GetDataPtr()); - - return reinterpret_cast(start_address); - } - - throw PythonBackendException( - "Calling GetGPUStartAddress function on a CPU tensor."); -} - -void -PbTensor::SetCudaIpcMutexes( - std::shared_ptr& cuda_ipc_open_mutex, - std::shared_ptr& cuda_ipc_close_mutex) -{ - cuda_ipc_open_mutex_ = cuda_ipc_open_mutex; - cuda_ipc_close_mutex_ = cuda_ipc_close_mutex; -} - -uint64_t -PbTensor::GetGPUPointerOffset() -{ - if (!this->IsCPU()) { - uint64_t offset = reinterpret_cast(this->GetDataPtr()) - - reinterpret_cast(this->GetGPUStartAddress()); - return offset; - } - - throw PythonBackendException( - "Calling GetGPUPointerOffset function on a CPU tensor."); -} - -const cudaIpcMemHandle_t* -PbTensor::CudaIpcMemHandle() -{ - return cuda_ipc_mem_handle_; -} - -#endif // TRITON_ENABLE_GPU - void PbTensor::SaveToSharedMemory( - std::unique_ptr& shm_pool, Tensor* tensor_shm, bool copy_cpu, - bool copy_gpu) + std::unique_ptr& shm_pool, bool copy_gpu) { - const std::string& tensor_name = this->Name(); - TRITONSERVER_DataType dtype_triton = - static_cast(this->TritonDtype()); - tensor_shm->is_cuda_handle_set = false; - TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU; - int64_t memory_type_id = 0; - tensor_shm_ = tensor_shm; - - if (IsCPU()) { - size_t dims_count = dims_.size(); - memory_type = TRITONSERVER_MEMORY_CPU; - memory_type_id = 0; - - char* data_in_shm; - char* data_ptr; - - data_ptr = static_cast(memory_ptr_); - - uint64_t* ptr_offset; - SaveTensorToSharedMemory( - shm_pool, tensor_shm, data_in_shm, memory_type, memory_type_id, - byte_size_, tensor_name.c_str(), dims_.data(), dims_count, dtype_triton, - &ptr_offset, raw_shm_offset_); - *ptr_offset = 0; - - if (copy_cpu) { - std::copy(data_ptr, data_ptr + byte_size_, data_in_shm); - } else { - memory_ptr_ = reinterpret_cast(data_in_shm); - } - } else { -#ifdef TRITON_ENABLE_GPU - char* cuda_handle; - uint64_t* ptr_offset; - SaveTensorToSharedMemory( - shm_pool, tensor_shm, cuda_handle, this->MemoryType(), - this->MemoryTypeId(), this->ByteSize(), tensor_name.c_str(), - this->Dims().data(), this->Dims().size(), dtype_triton, &ptr_offset, - raw_shm_offset_); - cuda_ipc_mem_handle_ = reinterpret_cast(cuda_handle); - - if (copy_gpu) { - tensor_shm->is_cuda_handle_set = true; - *ptr_offset = this->GetGPUPointerOffset(); - cudaSetDevice(this->MemoryTypeId()); - cudaError_t err = cudaIpcGetMemHandle( - reinterpret_cast(cuda_handle), - this->GetDataPtr()); - if (err != cudaSuccess) { - throw 
PythonBackendException(std::string( - "failed to get cuda ipc handle: " + - std::string(cudaGetErrorString(err))) - .c_str()); - } - } -#else - throw PythonBackendException("GPU tensors are not supported."); -#endif // TRITON_ENABLE_GPU - } + if (!tensor_shm_.data_) { + uint64_t byte_size = sizeof(TensorShm) + sizeof(int64_t) * dims_.size() + + PbString::ShmStructSize(name_) + + PbMemory::ShmStructSize(memory_type_, byte_size_); + tensor_shm_ = shm_pool->Construct(byte_size); - shm_pool->MapOffset((char**)&raw_data_shm_, tensor_shm_->raw_data); -} + tensor_shm_ptr_ = reinterpret_cast(tensor_shm_.data_.get()); + tensor_shm_ptr_->dtype = dtype_; + tensor_shm_ptr_->dims_count = dims_.size(); + shm_handle_ = tensor_shm_.handle_; -#ifdef TRITON_ENABLE_GPU -void -PbTensor::LoadGPUData(std::unique_ptr& shm_pool) -{ - if (!this->IsCPU()) { - if (!tensor_shm_->is_cuda_handle_set) { - throw PythonBackendException( - std::string("Failed to get cudaIpcMemHandle for tensor '") + name_ + - "'."); - } - char* d_buffer; - - // Sync the memory type id. Since it will be updated by the main process - // after providing the GPU buffers. - memory_type_id_ = raw_data_shm_->memory_type_id; - - cudaSetDevice(this->MemoryTypeId()); - shm_pool->MapOffset( - (char**)&cuda_ipc_mem_handle_, raw_data_shm_->memory_ptr); - - // Lock the mutex when using cudaIpcOpenMemHandle. This code is only - // required in the stub process. In the Triton process, we never use - // cudaIpcOpenMemHandle. The mutex is required because cudaIpcOpenMemHandle - // is not thread safe. - if (cuda_ipc_open_mutex_ != nullptr) - cuda_ipc_open_mutex_->lock(); - - cudaError_t err = cudaIpcOpenMemHandle( - (void**)&d_buffer, *cuda_ipc_mem_handle_, - cudaIpcMemLazyEnablePeerAccess); - - if (cuda_ipc_open_mutex_ != nullptr) - cuda_ipc_open_mutex_->unlock(); - - if (err != cudaSuccess) { - throw PythonBackendException(std::string( - "failed to open ipc handle: " + - std::string(cudaGetErrorString(err))) - .c_str()); - } + dims_shm_ptr_ = reinterpret_cast( + reinterpret_cast(tensor_shm_ptr_) + sizeof(TensorShm)); - char* buffer_start = d_buffer + raw_data_shm_->offset; - err = cudaMemcpy( - (void*)buffer_start, memory_ptr_, (size_t)this->ByteSize(), - cudaMemcpyDeviceToDevice); - if (err != cudaSuccess) { - throw PythonBackendException( - std::string( - "failed to copy data: " + std::string(cudaGetErrorString(err))) - .c_str()); + // Write the dimensions data to shared memory. 
+ for (size_t i = 0; i < dims_.size(); i++) { + dims_shm_ptr_[i] = dims_[i]; } - if (cuda_ipc_close_mutex_ != nullptr) - cuda_ipc_close_mutex_->lock(); - - err = cudaIpcCloseMemHandle(d_buffer); - - if (cuda_ipc_close_mutex_ != nullptr) - cuda_ipc_close_mutex_->unlock(); - - if (err != cudaSuccess) { - throw PythonBackendException(std::string( - "failed to close memory handle: " + - std::string(cudaGetErrorString(err))) - .c_str()); - } - } else { - throw PythonBackendException("LoadGPUData called on a CPU tensor."); + std::size_t name_offset = + sizeof(TensorShm) + sizeof(int64_t) * dims_.size(); + name_shm_ = PbString::Create( + name_, reinterpret_cast(tensor_shm_ptr_) + name_offset, + shm_handle_ + name_offset); + std::size_t pb_memory_offset = name_offset + PbString::ShmStructSize(name_); + + pb_memory_ = PbMemory::Create( + memory_type_, memory_type_id_, byte_size_, + reinterpret_cast(memory_ptr_), + reinterpret_cast(tensor_shm_ptr_) + pb_memory_offset, + shm_handle_ + pb_memory_offset, copy_gpu); + + tensor_shm_ptr_->memory = pb_memory_->ShmHandle(); + memory_ptr_ = pb_memory_->DataPtr(); } } -void -PbTensor::CopyToCPU(std::unique_ptr& shm_pool) +std::unique_ptr +PbTensor::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t tensor_handle, bool open_cuda_handle) +{ + AllocatedSharedMemory tensor_shm = shm_pool->Load(tensor_handle); + TensorShm* tensor_shm_ptr = + reinterpret_cast(tensor_shm.data_.get()); + size_t name_offset = + sizeof(TensorShm) + sizeof(int64_t) * tensor_shm_ptr->dims_count; + std::unique_ptr name_shm = PbString::LoadFromSharedMemory( + tensor_handle + name_offset, tensor_shm.data_.get() + name_offset); + + std::size_t pb_memory_offset = name_offset + name_shm->Size(); + std::unique_ptr pb_memory = PbMemory::LoadFromSharedMemory( + pb_memory_offset, tensor_shm.data_.get() + pb_memory_offset, + open_cuda_handle); + return std::unique_ptr( + new PbTensor(tensor_shm, name_shm, pb_memory)); +} + +TRITONSERVER_DataType +PbTensor::TritonDtype() const { - if (!this->IsCPU()) { - char* raw_data_ptr; - uint64_t* offset_ptr; - off_t raw_ptr_offset = 0; - off_t raw_data_offset; - - // Raw Data - SaveRawDataToSharedMemory( - shm_pool, raw_data_offset, raw_data_ptr, - TRITONSERVER_MEMORY_CPU /* memory_type */, 0 /*memory_type_id */, - this->ByteSize(), &offset_ptr, raw_ptr_offset); - tensor_shm_->raw_data = raw_data_offset; - cudaError_t err = cudaMemcpy( - (void*)raw_data_ptr, memory_ptr_, this->ByteSize(), - cudaMemcpyDeviceToHost); - - if (err != cudaSuccess) { - throw PythonBackendException( - std::string( - "failed to copy data: " + std::string(cudaGetErrorString(err))) - .c_str()); - } - } else { - throw PythonBackendException("CopyToCPU can be called on a GPU tensor."); - } + return dtype_; } -#endif // TRITON_ENABLE_GPU -void -PbTensor::SetDataPtr(void* ptr) +void* +PbTensor::DataPtr() { - memory_ptr_ = ptr; + return memory_ptr_; } -void -PbTensor::SetMemoryType(TRITONSERVER_MemoryType memory_type) +bi::managed_external_buffer::handle_t +PbTensor::ShmHandle() { - memory_type_ = memory_type; - raw_data_shm_->memory_type = memory_type; + return shm_handle_; } -void -PbTensor::SetMemoryTypeId(int64_t memory_type_id) -{ - memory_type_id_ = memory_type_id; - raw_data_shm_->memory_type_id = memory_type_id; -} +PbTensor::PbTensor( + AllocatedSharedMemory& tensor_shm, + std::unique_ptr& name_shm, std::unique_ptr& pb_memory) + : tensor_shm_(std::move(tensor_shm)), name_shm_(std::move(name_shm)), + pb_memory_(std::move(pb_memory)) +{ + 
tensor_shm_ptr_ = reinterpret_cast(tensor_shm_.data_.get()); + dims_shm_ptr_ = reinterpret_cast( + reinterpret_cast(tensor_shm_ptr_) + sizeof(TensorShm)); + + name_ = name_shm_->String(); + dims_ = std::vector( + dims_shm_ptr_, dims_shm_ptr_ + tensor_shm_ptr_->dims_count); + dtype_ = tensor_shm_ptr_->dtype; + dl_managed_tensor_ = nullptr; + byte_size_ = pb_memory_->ByteSize(); + memory_ptr_ = pb_memory_->DataPtr(); + memory_type_ = pb_memory_->MemoryType(); + memory_type_id_ = pb_memory_->MemoryTypeId(); + shm_handle_ = tensor_shm_.handle_; -#ifdef TRITON_ENABLE_GPU -#ifndef TRITON_PB_STUB -void -PbTensor::SetBackendMemory( - std::unique_ptr backend_memory, - std::unique_ptr& shm_pool) -{ - tensor_shm_->is_cuda_handle_set = false; - cudaSetDevice(this->MemoryTypeId()); - cudaError_t err = - cudaIpcGetMemHandle(cuda_ipc_mem_handle_, backend_memory->MemoryPtr()); - if (err != cudaSuccess) { - throw PythonBackendException(std::string( - "failed to get cuda ipc handle: " + - std::string(cudaGetErrorString(err))) - .c_str()); +#ifdef TRITON_PB_STUB + if (memory_type_ == TRITONSERVER_MEMORY_CPU || + memory_type_ == TRITONSERVER_MEMORY_CPU_PINNED) { + if (dtype_ != TRITONSERVER_TYPE_BYTES) { + py::object numpy_array = + py::array(triton_to_pybind_dtype(dtype_), dims_, (void*)memory_ptr_); + numpy_array_ = numpy_array.attr("view")(triton_to_numpy_type(dtype_)); + } else { + py::object numpy_array = py::array( + triton_to_pybind_dtype(TRITONSERVER_TYPE_UINT8), {byte_size_}, + (void*)memory_ptr_); + py::module triton_pb_utils = + py::module::import("triton_python_backend_utils"); + numpy_array_ = + triton_pb_utils.attr("deserialize_bytes_tensor")(numpy_array) + .attr("reshape")(dims_); + } + } else { + numpy_array_ = py::none(); } - - memory_ptr_ = backend_memory->MemoryPtr(); - SetMemoryType(backend_memory->MemoryType()); - SetMemoryTypeId(backend_memory->MemoryTypeId()); - backend_memory_ = std::move(backend_memory); - raw_data_shm_->offset = this->GetGPUPointerOffset(); - tensor_shm_->is_cuda_handle_set = true; -} #endif -#endif - -int -PbTensor::TritonDtype() const -{ - return dtype_; -} - -void* -PbTensor::GetDataPtr() const -{ - return memory_ptr_; } }}} // namespace triton::backend::python diff --git a/src/pb_tensor.h b/src/pb_tensor.h index 09ab36f8..1c208cd5 100644 --- a/src/pb_tensor.h +++ b/src/pb_tensor.h @@ -1,5 +1,4 @@ -// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights -// reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -41,6 +40,8 @@ namespace py = pybind11; #endif #include +#include "pb_memory.h" +#include "pb_string.h" #include "pb_utils.h" #include "triton/backend/backend_common.h" #include "triton/backend/backend_memory.h" @@ -48,48 +49,18 @@ namespace py = pybind11; namespace triton { namespace backend { namespace python { -typedef enum PYTHONBACKEND_tensortype_enum { - PYTHONBACKEND_RAW, - PYTHONBACKEND_NUMPY, - PYTHONBACKEND_DLPACK -} PYTHONBACKEND_TensorType; +// +// Represents a Tensor object in shared memory. +// +struct TensorShm { + // Handle for the pointer data in shared memory. + bi::managed_external_buffer::handle_t memory; + TRITONSERVER_DataType dtype; + size_t dims_count; +}; -// PbTensor class is the representation of Triton tensors -// inside Python backend. +// PbTensor class is the representation of Triton tensors inside Python backend. 
class PbTensor { - private: - std::string name_; -#ifdef TRITON_PB_STUB - py::array numpy_array_; - // Storing the serialized version of the numpy array - py::array numpy_array_serialized_; -#endif - TRITONSERVER_DataType dtype_; - void* memory_ptr_; - int64_t memory_type_id_; - std::vector dims_; - TRITONSERVER_MemoryType memory_type_; - PYTHONBACKEND_TensorType tensor_type_; - uint64_t byte_size_; - DLManagedTensor* dl_managed_tensor_; - Tensor* tensor_shm_; - RawData* raw_data_shm_; - -#ifdef TRITON_ENABLE_GPU - bool is_cuda_handle_set_; - cudaIpcMemHandle_t* cuda_ipc_mem_handle_ = nullptr; - std::shared_ptr cuda_ipc_open_mutex_; - std::shared_ptr cuda_ipc_close_mutex_; -#ifndef TRITON_PB_STUB - std::unique_ptr backend_memory_; -#endif // TRITON_PB_STUB -#endif // TRITON_ENABLE_GPU - // bool is_reused_ = false; - uint64_t device_ptr_offset_ = 0; - bool destruct_cuda_ipc_mem_handle_ = false; - off_t raw_shm_offset_ = 0; - off_t shm_offset_ = 0; - public: #ifdef TRITON_PB_STUB /// Create a PbTensor using a numpy array @@ -116,18 +87,29 @@ class PbTensor { /// \param memory_type The memory type of the tensor /// \param memory_type_id The memory type_id of the tensor /// \param memory_ptr Pointer to the location of the data. Data must be - /// contiguous and in C order. + /// contiguous and in C-order. /// \param byte_size Total number of bytes that the tensor uses. - /// \param shm_offset The shared memory offset of the device pointer. + /// \param shm_handle The shared memory handle of pointer if it is stored in + /// shared memory. PbTensor( const std::string& name, const std::vector& dims, TRITONSERVER_DataType dtype, TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, void* memory_ptr, uint64_t byte_size, - DLManagedTensor* dl_managed_tensor = nullptr, off_t shm_offset = 0); + DLManagedTensor* dl_managed_tensor = nullptr, + bi::managed_external_buffer::handle_t shm_handle = 0); + + /// This constructor is used when + /// loading the tensor from shared memory. + /// \param tensor_shm The name of the tensor + /// \param dims_shm Tensor dimensions + /// \param pb_string Triton dtype + PbTensor( + AllocatedSharedMemory& tensor_shm, + std::unique_ptr& name_shm, + std::unique_ptr& pb_memory); // Copying tensor objects is not allowed. - PbTensor(const PbTensor& other) = delete; - PbTensor& operator=(const PbTensor& other) = delete; + DISALLOW_COPY_AND_ASSIGN(PbTensor); #ifdef TRITON_PB_STUB /// Construct a Python backend tensor using a DLPack @@ -151,40 +133,27 @@ class PbTensor { /// Get the name of the tensor /// \return name of the tensor. const std::string& Name() const; - static std::shared_ptr LoadFromSharedMemory( - std::unique_ptr& shm_pool, off_t tensor_offset, - std::shared_ptr& cuda_ipc_open_mutex, - std::shared_ptr& cuda_ipc_close_mutex); -#ifdef TRITON_ENABLE_GPU - /// Get the GPU start address. - /// \return The start address of a device pointer. - /// \throws PythonBackendException if the tensor is stored in CPU. - void* GetGPUStartAddress(); - - /// Get the cuda IPC handle corresponding to this tensor. - /// \return The cudaIpcMemHandle - const cudaIpcMemHandle_t* CudaIpcMemHandle(); - - /// Set cuda IPC open mutex. This mutex will be used for cudaIpcOpenMemHandle - /// and cudaIpcCloseMemHandle calls. - void SetCudaIpcMutexes( - std::shared_ptr& cuda_ipc_open_mutex, - std::shared_ptr& cuda_ipc_close_mutex); - - /// Set the cuda IPC handle corresponding to this tensor. - /// \param cuda_ipc_mem_handle CUDA ipc mem handle. 
- void SetCudaIpcMemHandle(cudaIpcMemHandle_t* cuda_ipc_mem_handle) - { - tensor_shm_->is_cuda_handle_set = true; - *cuda_ipc_mem_handle_ = *cuda_ipc_mem_handle; - } - - /// Get the GPU pointer offset. - /// \return The offset of a device pointer. - /// \throws PythonBackendException if the tensor is stored in CPU. - uint64_t GetGPUPointerOffset(); -#endif // TRITON_ENABLE_GPU + /// Set the name of the tensor + /// \param name Name of the tensor. + void SetName(const std::string& name); + + /// Get the shared memory handle corresponding to this tensor + /// \return returns the shared memory handle. + bi::managed_external_buffer::handle_t ShmHandle(); + + /// Load the tensor object from shared memory. + /// \param shm_pool The shared memory manager object + /// \param tensor_handle The handle of the object in shared memory. + /// \param open_cuda_handle If the tensor is in GPU, setting this option to + /// true will call cudaIpcOpenMemHandle on it. In the main process this option + /// should be set to false because we never want to call cudaIpcOpenMemHandle + /// in the main process. + /// \return returns the tensor loaded from shared memory. + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t tensor_handle, + bool open_cuda_handle); #ifdef TRITON_PB_STUB /// Get NumPy representation of the tensor. @@ -193,37 +162,22 @@ class PbTensor { const py::array& AsNumpy() const; #endif -#ifndef TRITON_PB_STUB - /// Set the backend memory object that holds the data for GPU tensors. - /// \param backend_memory Backend memory. - void SetBackendMemory( - std::unique_ptr backend_memory, - std::unique_ptr& shm_pool); -#endif - /// Save tensor inside shared memory. void SaveToSharedMemory( - std::unique_ptr& shm_pool, Tensor* tensor_shm, - bool copy_cpu, bool copy_gpu); + std::unique_ptr& shm_pool, bool copy_gpu); /// Get the triton dtype /// \return Triton dtype - int TritonDtype() const; + TRITONSERVER_DataType TritonDtype() const; + + /// Get the data ptr + /// \return Get the raw pointer. + void* DataPtr(); /// This function will be automatically called by the stub when the tensor is /// no longer required. void DeleteDLPack(); - /// Shared memory offset of the raw pointer. - off_t RawShmOffset(); - - /// Shared memory offset of the tensor. - off_t ShmOffset() { return shm_offset_; } - - /// Get the type of the tensor - /// \return Type of the tensor. - PYTHONBACKEND_TensorType TensorType() const; - /// Tells whether the Tensor is stored in CPU or not. /// \return A boolean value indicating whether the tensor is stored in CPU /// or not. @@ -236,37 +190,52 @@ class PbTensor { /// \return the memory type of the tensor. TRITONSERVER_MemoryType MemoryType() const; + /// Get a mutable reference to the MemoryType. + /// \return the pointer to the memory type of the tensor. + TRITONSERVER_MemoryType* MutableMemoryType(); + + /// Get the triton memory type of the Tensor. + /// \return the memory type of the tensor. + int64_t MemoryTypeId() const; + /// Get the dimensions of the tensor /// \return A vector containing the tensor dimensions. const std::vector& Dims() const; - /// Get the data pointer. - /// \return The location to the memory where the data is stored. - void* GetDataPtr() const; - - void SetMemoryType(TRITONSERVER_MemoryType memory_type); - void SetMemoryTypeId(int64_t memory_type_id); + /// Get the underlying memory + std::unique_ptr& Memory(); - /// Set the underlying pointer to use. 
This must be only used when the tensor - /// is being reused. - void SetDataPtr(void* ptr); + /// Set the underlying memory + void SetMemory(std::unique_ptr&& memory); - /// After the GPU tensor buffer is provided, copy the data to the output - /// buffers. - void LoadGPUData(std::unique_ptr& shm_pool); - void CopyToCPU(std::unique_ptr& shm_pool); + PbTensor(); - Tensor* SharedMemoryObject() { return tensor_shm_; } - RawData* RawDataShm() { return raw_data_shm_; } + /// Destructor + ~PbTensor() noexcept(false); + private: + std::string name_; +#ifdef TRITON_PB_STUB + py::array numpy_array_; + // Storing the serialized version of the numpy array + py::array numpy_array_serialized_; +#endif + TRITONSERVER_DataType dtype_; + void* memory_ptr_; + int64_t memory_type_id_; + std::vector dims_; + TRITONSERVER_MemoryType memory_type_; + uint64_t byte_size_; + DLManagedTensor* dl_managed_tensor_; - /// Get the memory type id. - /// \return The memory type id of the tensor. - int64_t MemoryTypeId() const; + bi::managed_external_buffer::handle_t shm_handle_; - PbTensor(); + AllocatedSharedMemory tensor_shm_; + TensorShm* tensor_shm_ptr_; + int64_t* dims_shm_ptr_; + std::unique_ptr name_shm_; - /// Destructor - ~PbTensor() noexcept(false); + // The pointer is null when the object is not stored in shared memory. + std::unique_ptr pb_memory_; }; }}} // namespace triton::backend::python diff --git a/src/pb_utils.cc b/src/pb_utils.cc index f9e0ec0e..4de37d33 100644 --- a/src/pb_utils.cc +++ b/src/pb_utils.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -44,7 +44,7 @@ #include #include #include -#include "shm_manager.h" +#include "scoped_defer.h" #ifdef TRITON_ENABLE_GPU #include @@ -53,277 +53,9 @@ namespace triton { namespace backend { namespace python { -#define THROW_IF_ERROR(MSG, X) \ - do { \ - int return__ = (X); \ - if (return__ != 0) { \ - throw PythonBackendException(MSG); \ - } \ - } while (false) - -void -LoadStringFromSharedMemory( - std::unique_ptr& shm_pool, off_t shm_offset, char*& str) -{ - String* string; - shm_pool->MapOffset((char**)&string, shm_offset); - shm_pool->MapOffset((char**)&str, string->data); -} - -void -SaveStringToSharedMemory( - std::unique_ptr& shm_pool, off_t& shm_offset, const char* str) -{ - String* string_shm; - shm_pool->Map((char**)&string_shm, sizeof(String), shm_offset); - string_shm->length = strlen(str) + 1; - - char* string_data; - off_t str_data_offset; - shm_pool->Map((char**)&string_data, string_shm->length, str_data_offset); - string_shm->data = str_data_offset; - strcpy(string_data, str); -} - -void -SaveRawDataToSharedMemory( - std::unique_ptr& shm_pool, off_t& raw_data_offset, - char*& raw_data_ptr, TRITONSERVER_MemoryType memory_type, - int memory_type_id, uint64_t byte_size, uint64_t** offset, - off_t raw_ptr_offset) -{ - // raw data - RawData* raw_data; - shm_pool->Map((char**)&raw_data, sizeof(RawData), raw_data_offset); - - raw_data->memory_type = memory_type; - raw_data->memory_type_id = memory_type_id; - raw_data->byte_size = byte_size; - *offset = &(raw_data->offset); - if (memory_type == TRITONSERVER_MEMORY_CPU) { - // If the raw_ptr_offset is not equal to zero, the user has provided - // the offset for the raw ptr. 
- if (raw_ptr_offset == 0) { - off_t buffer_offset; - shm_pool->Map((char**)&raw_data_ptr, byte_size, buffer_offset); - raw_data->memory_ptr = buffer_offset; - } else { - raw_data->memory_ptr = raw_ptr_offset; - } - } - - if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU - off_t buffer_offset; - shm_pool->Map( - (char**)&raw_data_ptr, sizeof(cudaIpcMemHandle_t), buffer_offset); - raw_data->memory_ptr = buffer_offset; - -#else - throw PythonBackendException( - "Python backend does not support GPU tensors."); -#endif // TRITON_ENABLE_GPU - } -} - -void -SaveMapToSharedMemory( - std::unique_ptr& shm_pool, off_t& shm_offset, - const std::unordered_map& map) -{ - Dict* dict; - shm_pool->Map((char**)&dict, sizeof(Dict), shm_offset); - dict->length = map.size(); - - Pair* pairs; - shm_pool->Map((char**)&pairs, sizeof(Pair) * map.size(), dict->values); - - size_t i = 0; - for (const auto& pair : map) { - SaveStringToSharedMemory(shm_pool, pairs[i].key, pair.first.c_str()); - SaveStringToSharedMemory(shm_pool, pairs[i].value, pair.second.c_str()); - i += 1; - } -} - -void -LoadMapFromSharedMemory( - std::unique_ptr& shm_pool, off_t shm_offset, - std::unordered_map& map) -{ - Dict* dict; - shm_pool->MapOffset((char**)&dict, shm_offset); - - Pair* pairs; - shm_pool->MapOffset((char**)&pairs, dict->values); - for (size_t i = 0; i < dict->length; i++) { - char* key; - LoadStringFromSharedMemory(shm_pool, pairs[i].key, key); - char* value; - LoadStringFromSharedMemory(shm_pool, pairs[i].value, value); - map.emplace(std::make_pair(key, value)); - } -} - -void -SaveTensorToSharedMemory( - std::unique_ptr& shm_pool, Tensor* tensor, - char*& raw_data_ptr, TRITONSERVER_MemoryType memory_type, - int64_t memory_type_id, uint64_t byte_size, const char* name, - const int64_t* dims, size_t dims_count, TRITONSERVER_DataType dtype, - uint64_t** offset_ptr, off_t raw_ptr_offset) -{ - off_t raw_data_offset; - // Raw Data - SaveRawDataToSharedMemory( - shm_pool, raw_data_offset, raw_data_ptr, memory_type, memory_type_id, - byte_size, offset_ptr, raw_ptr_offset); - tensor->raw_data = raw_data_offset; - - // name - off_t name_offset; - SaveStringToSharedMemory(shm_pool, name_offset, name); - tensor->name = name_offset; - - // input dtype - tensor->dtype = dtype; - - // input dims - int64_t* tensor_dims; - tensor->dims_count = dims_count; - off_t tensor_dims_offset; - shm_pool->Map( - (char**)&tensor_dims, sizeof(int64_t) * dims_count, tensor_dims_offset); - tensor->dims = tensor_dims_offset; - - for (size_t j = 0; j < dims_count; ++j) { - tensor_dims[j] = dims[j]; - } -} - -void -CopySingleArchiveEntry(archive* input_archive, archive* output_archive) -{ - const void* buff; - size_t size; -#if ARCHIVE_VERSION_NUMBER >= 3000000 - int64_t offset; -#else - off_t offset; -#endif - - for (;;) { - int return_status; - return_status = - archive_read_data_block(input_archive, &buff, &size, &offset); - if (return_status == ARCHIVE_EOF) - break; - if (return_status != ARCHIVE_OK) - throw PythonBackendException( - "archive_read_data_block() failed with error code = " + - std::to_string(return_status)); - - return_status = - archive_write_data_block(output_archive, buff, size, offset); - if (return_status != ARCHIVE_OK) { - throw PythonBackendException( - "archive_write_data_block() failed with error code = " + - std::to_string(return_status) + ", error message is " + - archive_error_string(output_archive)); - } - } -} - -void -ExtractTarFile(std::string& archive_path, std::string& dst_path) -{ - char 
current_directory[PATH_MAX]; - if (getcwd(current_directory, PATH_MAX) == nullptr) { - throw PythonBackendException( - (std::string("Failed to get the current working directory. Error: ") + - std::strerror(errno))); - } - if (chdir(dst_path.c_str()) == -1) { - throw PythonBackendException( - (std::string("Failed to change the directory to ") + dst_path + - " Error: " + std::strerror(errno)) - .c_str()); - } - - struct archive_entry* entry; - int flags = ARCHIVE_EXTRACT_TIME; - - struct archive* input_archive = archive_read_new(); - struct archive* output_archive = archive_write_disk_new(); - archive_write_disk_set_options(output_archive, flags); - - archive_read_support_filter_gzip(input_archive); - archive_read_support_format_tar(input_archive); - - if (archive_path.size() == 0) { - throw PythonBackendException("The archive path is empty."); - } - - THROW_IF_ERROR( - "archive_read_open_filename() failed.", - archive_read_open_filename( - input_archive, archive_path.c_str(), 10240 /* block_size */)); - - while (true) { - int read_status = archive_read_next_header(input_archive, &entry); - if (read_status == ARCHIVE_EOF) - break; - if (read_status != ARCHIVE_OK) { - throw PythonBackendException( - std::string("archive_read_next_header() failed with error code = ") + - std::to_string(read_status) + std::string(" error message is ") + - archive_error_string(input_archive)); - } - - read_status = archive_write_header(output_archive, entry); - if (read_status != ARCHIVE_OK) { - throw PythonBackendException(std::string( - "archive_write_header() failed with error code = " + - std::to_string(read_status) + std::string(" error message is ") + - archive_error_string(output_archive))); - } - - CopySingleArchiveEntry(input_archive, output_archive); - - read_status = archive_write_finish_entry(output_archive); - if (read_status != ARCHIVE_OK) { - throw PythonBackendException(std::string( - "archive_write_finish_entry() failed with error code = " + - std::to_string(read_status) + std::string(" error message is ") + - archive_error_string(output_archive))); - } - } - - archive_read_close(input_archive); - archive_read_free(input_archive); - - archive_write_close(output_archive); - archive_write_free(output_archive); - - // Revert the directory change. - if (chdir(current_directory) == -1) { - throw PythonBackendException( - (std::string("Failed to change the directory to ") + current_directory) - .c_str()); - } -} - -bool -FileExists(std::string& path) -{ - struct stat buffer; - return stat(path.c_str(), &buffer) == 0; -} - -#ifdef TRITON_ENABLE_GPU - -CUDADriverAPI::CUDADriverAPI() +CUDAHandler::CUDAHandler() { dl_open_handle_ = dlopen("libcuda.so", RTLD_LAZY); @@ -350,7 +82,7 @@ CUDADriverAPI::CUDADriverAPI() } void -CUDADriverAPI::PointerGetAttribute( +CUDAHandler::PointerGetAttribute( CUdeviceptr* start_address, CUpointer_attribute attribute, CUdeviceptr dev_ptr) { @@ -368,12 +100,108 @@ CUDADriverAPI::PointerGetAttribute( } bool -CUDADriverAPI::IsAvailable() +CUDAHandler::IsAvailable() { return dl_open_handle_ != nullptr; } -CUDADriverAPI::~CUDADriverAPI() noexcept(false) +void +CUDAHandler::OpenCudaHandle( + int64_t memory_type_id, cudaIpcMemHandle_t* cuda_mem_handle, + void** data_ptr) +{ + std::lock_guard guard{mu_}; + int current_device; + + // Save the previous device + cudaError_t err = cudaGetDevice(¤t_device); + if (err != cudaSuccess) { + throw PythonBackendException( + std::string("Failed to get the current CUDA device. 
error: ") + + cudaGetErrorString(err)); + } + + bool overridden = (current_device != memory_type_id); + + // Restore the previous device before returning from the function. + ScopedDefer _(std::bind([&overridden, ¤t_device] { + if (overridden) { + cudaError_t err = cudaSetDevice(current_device); + if (err != cudaSuccess) { + throw PythonBackendException( + "Failed to set the CUDA device to " + + std::to_string(current_device) + + ". error: " + cudaGetErrorString(err)); + } + } + })); + + if (overridden) { + err = cudaSetDevice(memory_type_id); + if (err != cudaSuccess) { + throw PythonBackendException( + "Failed to set the CUDA device to " + std::to_string(memory_type_id) + + ". error: " + cudaGetErrorString(err)); + } + } + + err = cudaIpcOpenMemHandle( + data_ptr, *cuda_mem_handle, cudaIpcMemLazyEnablePeerAccess); + if (err != cudaSuccess) { + throw PythonBackendException( + std::string("Failed to open the cudaIpcHandle. error: ") + + cudaGetErrorString(err)); + } +} + +void +CUDAHandler::CloseCudaHandle(int64_t memory_type_id, void* data_ptr) +{ + std::lock_guard guard{mu_}; + int current_device; + + // Save the previous device + cudaError_t err = cudaGetDevice(¤t_device); + if (err != cudaSuccess) { + throw PythonBackendException( + std::string("Failed to get the current CUDA device. error: ") + + cudaGetErrorString(err)); + } + + bool overridden = (current_device != memory_type_id); + + // Restore the previous device before returning from the function. + ScopedDefer _(std::bind([&overridden, ¤t_device] { + if (overridden) { + cudaError_t err = cudaSetDevice(current_device); + if (err != cudaSuccess) { + throw PythonBackendException( + "Failed to set the CUDA device to " + + std::to_string(current_device) + + ". error: " + cudaGetErrorString(err)); + } + } + })); + + if (overridden) { + err = cudaSetDevice(memory_type_id); + if (err != cudaSuccess) { + throw PythonBackendException( + std::string("Failed to set the CUDA device to ") + + std::to_string(memory_type_id) + + ". error: " + cudaGetErrorString(err)); + } + } + + err = cudaIpcCloseMemHandle(data_ptr); + if (err != cudaSuccess) { + throw PythonBackendException( + std::string("Failed to close the cudaIpcHandle. error: ") + + cudaGetErrorString(err)); + } +} + +CUDAHandler::~CUDAHandler() noexcept(false) { if (dl_open_handle_ != nullptr) { int status = dlclose(dl_open_handle_); @@ -383,5 +211,4 @@ CUDADriverAPI::~CUDADriverAPI() noexcept(false) } } #endif - }}} // namespace triton::backend::python diff --git a/src/pb_utils.h b/src/pb_utils.h index 8bb76193..a947222c 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -31,11 +31,12 @@ #endif // TRITON_ENABLE_GPU #include #include -#include #include +#include #include #include #include +#include "pb_exception.h" #include "shm_manager.h" #include "triton/backend/backend_common.h" #include "triton/core/tritonserver.h" @@ -50,12 +51,12 @@ namespace bi = boost::interprocess; (X); \ } \ catch (cont PythonBackendException & pb_exception) { \ - off_t string_offset__; \ + bi::managed_external_buffer::handle_t string_handle__; \ try { \ SaveStringToSharedMemory( \ - SHM_POOL, string_offset__, pb_exception.what()); \ + SHM_POOL, string_handle__, pb_exception.what()); \ RESPONSE->has_error = true; \ - RESPONSE->error = string_offset__; \ + RESPONSE->error = string_handle__; \ if (R) \ return; \ } \ @@ -76,191 +77,103 @@ namespace bi = boost::interprocess; } \ } while (false) +#define THROW_IF_CUDA_ERROR(X) \ + do { \ + cudaError_t cuda_err__ = (X); \ + if (cuda_err__ != cudaSuccess) { \ + throw PythonBackendException( \ + std::string(cudaGetErrorString(cuda_err__))); \ + } \ + } while (false) + +#define THROW_IF_ERROR(MSG, X) \ + do { \ + int return__ = (X); \ + if (return__ != 0) { \ + throw PythonBackendException(MSG); \ + } \ + } while (false) -struct InitializeResponse { + +#define DISALLOW_COPY(TypeName) TypeName(const TypeName&) = delete; +#define DISALLOW_ASSIGN(TypeName) void operator=(const TypeName&) = delete; +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + DISALLOW_COPY(TypeName) \ + DISALLOW_ASSIGN(TypeName) + +struct InitializeResponseShm { // Indicates whether the response has an error or not. bool response_has_error; // Indicates whether the response error is set or not. bool response_is_error_set; // Contains the error message. - off_t response_error; + bi::managed_external_buffer::handle_t response_error; }; // Control data structure for the communication between the Python stub and the // main stub. -struct IPCControl { +struct IPCControlShm { bool stub_health; bool parent_health; bool uses_env; - off_t parent_health_mutex; - off_t stub_health_mutex; - off_t stub_message_queue; - off_t parent_message_queue; -}; - -// -// Represents a raw data -// -struct RawData { - off_t memory_ptr; - // offset represents the pointer offset. - uint64_t offset; - TRITONSERVER_MemoryType memory_type; - int64_t memory_type_id; - uint64_t byte_size; -}; - -// -// Represents a Tensor object that will be passed to Python code. -// -struct Tensor { - // Offset for raw data field. - off_t raw_data; - // Offset for name field. - off_t name; - TRITONSERVER_DataType dtype; - // Shared memory offset for the dimensions. - off_t dims; - size_t dims_count; - bool is_cuda_handle_set; -}; - -struct String { - off_t data; - size_t length; -}; - -// -// Inference Request -// -struct Request { - // Offset for the id field. - off_t id; - uint64_t correlation_id; - // Offset for input field. - off_t inputs; - uint32_t requested_input_count; - // Offset for the requested output names - off_t requested_output_names; - uint32_t requested_output_count; - off_t model_name; - int64_t model_version; - uint32_t flags; -}; - -struct Response { - // Offset for Tensor output. - off_t outputs; - uint32_t outputs_size; - off_t error; - bool has_error; - // Indicates whether this error has a message or not. 
- bool is_error_set; + bi::interprocess_mutex parent_health_mutex; + bi::interprocess_mutex stub_health_mutex; + bi::managed_external_buffer::handle_t stub_message_queue; + bi::managed_external_buffer::handle_t parent_message_queue; }; struct ResponseBatch { - // Offset for response object. - off_t responses; uint32_t batch_size; - off_t error; + bi::managed_external_buffer::handle_t error; bool has_error; + // Indicates whether an additional call to stub is required for the clean up // of the resources. bool cleanup; + // Indicates whether this error has a message or not. bool is_error_set; }; struct RequestBatch { - // Offset for request object. - off_t requests; uint32_t batch_size; -}; -// Representing a key value pair -struct Pair { - off_t key; - off_t value; -}; + // GPU Buffers handle + bi::managed_external_buffer::handle_t gpu_buffers_handle; -struct Dict { - uint32_t length; - // Values point to the location where there are `length` pairs. - off_t values; + // GPU buffers count + uint32_t gpu_buffers_count; }; -// -// PythonBackendException -// -// Exception thrown if error occurs in PythonBackend. -// -struct PythonBackendException : std::exception { - PythonBackendException(std::string message) : message_(message) {} - - const char* what() const throw() { return message_.c_str(); } - - std::string message_; -}; - -void SaveMapToSharedMemory( - std::unique_ptr& shm_pool, off_t& shm_offset, - const std::unordered_map& map); - -void LoadMapFromSharedMemory( - std::unique_ptr& shm_pool, off_t shm_offset, - std::unordered_map& map); - -void SaveStringToSharedMemory( - std::unique_ptr& shm_pool, off_t& shm_offset, - const char* str); -void LoadStringFromSharedMemory( - std::unique_ptr& shm_pool, off_t shm_offset, char*& str); - -void SaveRawDataToSharedMemory( - std::unique_ptr& shm_pool, off_t& raw_data_offset, - char*& raw_data_ptr, TRITONSERVER_MemoryType memory_type, - int memory_type_id, uint64_t byte_size, uint64_t** offset_ptr, - off_t raw_ptr_offset = 0); - -void SaveTensorToSharedMemory( - std::unique_ptr& shm_pool, Tensor* tensor, - char*& raw_data_ptr, TRITONSERVER_MemoryType memory_type, - int64_t memory_type_id, uint64_t byte_size, const char* name, - const int64_t* dims, size_t dims_count, TRITONSERVER_DataType dtype, - uint64_t** offset_ptr, off_t raw_ptr_offset); - -void LoadTensorFromSharedMemory( - std::unique_ptr& shm_pool, off_t tensor_shm_offset, - Tensor& tensor); - -void ExtractTarFile(std::string& archive_path, std::string& dst_path); - -bool FileExists(std::string& path); - #ifdef TRITON_ENABLE_GPU -class CUDADriverAPI { +class CUDAHandler { public: - static CUDADriverAPI& getInstance() + static CUDAHandler& getInstance() { - static CUDADriverAPI instance; + static CUDAHandler instance; return instance; } private: + std::mutex mu_; void* dl_open_handle_ = nullptr; CUresult (*cu_pointer_get_attribute_fn_)( CUdeviceptr*, CUpointer_attribute, CUdeviceptr) = nullptr; CUresult (*cu_get_error_string_fn_)(CUresult, const char**) = nullptr; - CUDADriverAPI(); - ~CUDADriverAPI() noexcept(false); + CUDAHandler(); + ~CUDAHandler() noexcept(false); public: - CUDADriverAPI(CUDADriverAPI const&) = delete; - void operator=(CUDADriverAPI const&) = delete; + CUDAHandler(CUDAHandler const&) = delete; + void operator=(CUDAHandler const&) = delete; bool IsAvailable(); void PointerGetAttribute( CUdeviceptr* start_address, CUpointer_attribute attr, CUdeviceptr device_ptr); + void OpenCudaHandle( + int64_t memory_type_id, cudaIpcMemHandle_t* cuda_mem_handle, + void** data_ptr); + 
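+  // Closes a CUDA IPC memory handle that was previously opened with
+  // OpenCudaHandle(). Like OpenCudaHandle(), the implementation temporarily
+  // switches to the device given by memory_type_id and restores the
+  // previously active CUDA device before returning.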
void CloseCudaHandle(int64_t memory_type_id, void* data_ptr); }; #endif // TRITON_ENABLE_GPU diff --git a/src/python.cc b/src/python.cc index 698e392b..fbec5b67 100644 --- a/src/python.cc +++ b/src/python.cc @@ -55,16 +55,18 @@ #include "ipc_message.h" #include "message_queue.h" #include "pb_env.h" -#include "pb_main_utils.h" +#include "pb_map.h" #include "pb_metric_reporter.h" -#include "pb_tensor.h" #include "pb_utils.h" +#include "request_executor.h" +#include "scoped_defer.h" #include "shm_manager.h" #include "triton/backend/backend_common.h" #include "triton/backend/backend_input_collector.h" #include "triton/backend/backend_memory.h" #include "triton/backend/backend_model.h" #include "triton/backend/backend_model_instance.h" +#include "triton/common/nvtx.h" #include "triton/common/triton_json.h" #include "triton/core/tritonbackend.h" #include "triton/core/tritonserver.h" @@ -225,7 +227,6 @@ class ModelState : public BackendModel { bool force_cpu_only_input_tensors_; }; - class ModelInstanceState : public BackendModelInstance { ModelInstanceState( ModelState* model_state, TRITONBACKEND_ModelInstance* model_instance); @@ -234,18 +235,16 @@ class ModelInstanceState : public BackendModelInstance { bi::interprocess_mutex* health_mutex_; std::unique_ptr stub_message_queue_; std::unique_ptr parent_message_queue_; - off_t ipc_control_offset_; std::string model_path_; - IPCControl* ipc_control_; + std::unique_ptr> + ipc_control_; + bi::managed_external_buffer::handle_t ipc_control_handle_; std::vector> bls_futures_; std::vector bls_inference_responses_; std::mutex bls_responses_mutex_; - std::unique_ptr shm_pool_; + std::unique_ptr shm_pool_; std::string shm_region_name_; - off_t shm_reset_offset_; - std::vector> infer_responses_; std::vector> request_executors_; - std::vector> handles_; // Stub process pid pid_t stub_pid_; @@ -265,21 +264,9 @@ class ModelInstanceState : public BackendModelInstance { ~ModelInstanceState(); - // Load Triton inputs to the appropriate Protobufs - TRITONSERVER_Error* GetInputTensor( - const uint32_t input_idx, Tensor* input_tensor_shm, - std::shared_ptr& input_tensor, TRITONBACKEND_Request* request, - std::shared_ptr>& responses); - - void ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count, - bool& restart, std::atomic& cleanup); - // Create the stub process. TRITONSERVER_Error* SetupStubProcess(); TRITONSERVER_Error* SendMessageToStub(off_t message); - void CleanupBLSResponses(); - void WaitForBLSRequestsToFinish(); // Checks whether the stub process is live bool IsStubProcessAlive(); @@ -305,11 +292,27 @@ class ModelInstanceState : public BackendModelInstance { // Start stub process TRITONSERVER_Error* StartStubProcess(); - // Reset the shared memory offset - void ResetSharedMemoryOffset(); + // Convert TRITONBACKEND_Input to Python backend tensors. + TRITONSERVER_Error* GetInputTensor( + const uint32_t input_idx, std::shared_ptr& input_tensor, + TRITONBACKEND_Request* request, + std::shared_ptr>& responses); + + // Process all the requests obtained from Triton. 
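+  // `restart` is set to true when sending a message to the stub or receiving
+  // its response fails (see SendMessageAndReceiveResponse), indicating that
+  // the stub process needs to be restarted.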
+ void ProcessRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count, + bool& restart); + + // Execute a BLS Request void ExecuteBLSRequest( - std::unique_ptr& shm_pool, off_t message_offset, - std::atomic& cleanup); + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t message_offset); + + // Cleanup BLS responses + void CleanupBLSResponses(); + + // Wait for BLS requests to complete + void WaitForBLSRequestsToFinish(); }; ModelInstanceState::ModelInstanceState( @@ -345,15 +348,10 @@ ModelInstanceState::KillStubProcess() stub_pid_ = 0; } -void -ModelInstanceState::WaitForBLSRequestsToFinish() -{ - bls_futures_.clear(); -} - void ModelInstanceState::SendMessageAndReceiveResponse( - off_t message, off_t& response, bool& restart, + bi::managed_external_buffer::handle_t message, + bi::managed_external_buffer::handle_t& response, bool& restart, std::shared_ptr>& responses, TRITONBACKEND_Request** requests, const uint32_t request_count) { @@ -366,7 +364,7 @@ ModelInstanceState::SendMessageAndReceiveResponse( return; } - off_t response_message; + bi::managed_external_buffer::handle_t response_message; error = ReceiveMessageFromStub(response_message); if (error != nullptr) { restart = true; @@ -380,7 +378,8 @@ ModelInstanceState::SendMessageAndReceiveResponse( } TRITONSERVER_Error* -ModelInstanceState::SendMessageToStub(off_t message) +ModelInstanceState::SendMessageToStub( + bi::managed_external_buffer::handle_t message) { bool success = false; while (!success) { @@ -416,7 +415,8 @@ ModelInstanceState::SendMessageToStub(off_t message) } TRITONSERVER_Error* -ModelInstanceState::ReceiveMessageFromStub(off_t& message) +ModelInstanceState::ReceiveMessageFromStub( + bi::managed_external_buffer::handle_t& message) { bool success = false; while (!success) { @@ -480,209 +480,696 @@ ModelInstanceState::RespondErrorToAllRequests( } void -ModelInstanceState::ResetSharedMemoryOffset() +ModelInstanceState::CleanupBLSResponses() { - shm_pool_->SetOffset(shm_reset_offset_); + for (auto& bls_inference_response : bls_inference_responses_) { + LOG_IF_ERROR( + TRITONSERVER_InferenceResponseDelete(bls_inference_response), + " failed to release BLS inference response."); + } + + bls_inference_responses_.clear(); + request_executors_.clear(); } void -ModelInstanceState::ExecuteBLSRequest( - std::unique_ptr& shm_pool, off_t message_offset, - std::atomic& cleanup) +ModelInstanceState::WaitForBLSRequestsToFinish() { - ModelState* model_state = reinterpret_cast(Model()); - auto request_executor = - std::make_unique(model_state->TritonServer()); - std::unique_ptr ipc_message = - IPCMessage::LoadFromSharedMemory(shm_pool, message_offset); - bool is_response_batch_set = false; - ResponseBatch* response_batch; - TRITONSERVER_InferenceResponse* inference_response = nullptr; - try { - std::unique_ptr bls_response = - std::make_unique(shm_pool_, false /* inline_response */); - RequestBatch* request_batch; - shm_pool_->MapOffset((char**)&request_batch, ipc_message->Args()); - - bls_response->Command() = PYTHONSTUB_InferExecResponse; - ipc_message->RequestOffset() = bls_response->SharedMemoryOffset(); - - shm_pool_->Map( - (char**)&response_batch, sizeof(ResponseBatch), bls_response->Args()); - response_batch->batch_size = 1; - response_batch->has_error = false; - response_batch->is_error_set = false; - response_batch->cleanup = false; - is_response_batch_set = true; - bool has_gpu_tensor = false; - PythonBackendException pb_exception(std::string{}); + bls_futures_.clear(); +} - if 
(request_batch->batch_size == 1) { - std::unique_ptr infer_request; - std::shared_ptr cuda_ipc_mutex; - infer_request = InferRequest::LoadFromSharedMemory( - shm_pool_, request_batch->requests, cuda_ipc_mutex, cuda_ipc_mutex); - std::unique_ptr infer_response; +bool +ModelInstanceState::IsStubProcessAlive() +{ + boost::posix_time::ptime timeout = + boost::get_system_time() + boost::posix_time::seconds(1); + bi::scoped_lock lock(*health_mutex_, timeout); - // If the BLS inputs are in GPU an additional round trip between the - // stub process and the main process is required. The reason is that we - // need to first allocate the GPU memory from the memory pool and then - // ask the stub process to fill in those allocated buffers. - for (auto& input_tensor : infer_request->Inputs()) { - if (!input_tensor->IsCPU()) { -#ifdef TRITON_ENABLE_GPU - BackendMemory* backend_memory; - std::unique_ptr lbackend_memory; - has_gpu_tensor = true; - TRITONSERVER_Error* error = BackendMemory::Create( - Model()->TritonMemoryManager(), - {BackendMemory::AllocationType::GPU_POOL, - BackendMemory::AllocationType::GPU}, - input_tensor->MemoryTypeId(), input_tensor->ByteSize(), - &backend_memory); - if (error != nullptr) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, TRITONSERVER_ErrorMessage(error)); - break; - } - lbackend_memory.reset(backend_memory); - input_tensor->SetBackendMemory(std::move(lbackend_memory), shm_pool_); -#endif // TRITON_ENABLE_GPU - } - } + // Check if lock has been acquired. + if (lock) { + return ipc_control_->stub_health; + } else { + // If It failed to obtain the lock, it means that the stub has been + // stuck or exited while holding the health mutex lock. + return false; + } +} - // Wait for the extra round trip to complete. The stub process will fill - // in the data for the GPU tensors. If there is an error, the extra round - // trip must be still completed, otherwise the stub process will always be - // waiting for a message from the parent process. 
- if (has_gpu_tensor) { - bi::scoped_lock lock{ - *(ipc_message->ResponseMutex())}; - ipc_message->ResponseCondition()->notify_all(); - ipc_message->ResponseCondition()->wait(lock); - } +TRITONSERVER_Error* +ModelInstanceState::StartStubProcess() +{ + new (&(ipc_control_->stub_health_mutex)) bi::interprocess_mutex; + health_mutex_ = &(ipc_control_->stub_health_mutex); + stub_message_queue_->ResetSemaphores(); + parent_message_queue_->ResetSemaphores(); - if (pb_exception.what() != nullptr) { - infer_response = request_executor->Infer( - infer_request, shm_pool_, &inference_response); + std::string kind = TRITONSERVER_InstanceGroupKindString(kind_); - if (inference_response != nullptr) { - std::lock_guard lock{bls_responses_mutex_}; - bls_inference_responses_.push_back(inference_response); - } + ModelState* model_state = reinterpret_cast(Model()); + int64_t shm_growth_size = + model_state->StateForBackend()->shm_growth_byte_size; + int64_t shm_default_size = + model_state->StateForBackend()->shm_default_byte_size; + const char* model_path = model_state->RepositoryPath().c_str(); - Response* response; - shm_pool_->Map( - (char**)&response, sizeof(Response), response_batch->responses); - infer_response->SaveToSharedMemory( - shm_pool_, response, false /* copy_cpu */, true /* copy_gpu */); - } else { - throw pb_exception; - } - } - } - catch (const PythonBackendException& pb_exception) { - if (is_response_batch_set) { - response_batch->has_error = true; - off_t string_offset = 0; - LOG_IF_EXCEPTION(SaveStringToSharedMemory( - shm_pool_, string_offset, pb_exception.what())); - if (string_offset != 0) { - response_batch->is_error_set = true; - response_batch->error = string_offset; - } - } else { - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, pb_exception.what()); - } - } + initialized_ = false; - { - bi::scoped_lock lock{ - *(ipc_message->ResponseMutex())}; - ipc_message->ResponseCondition()->notify_all(); + pid_t pid = fork(); + if (pid < 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, "Failed to fork the stub process."); } - request_executors_.emplace_back(std::move(request_executor)); -} - -void -ModelInstanceState::ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count, - bool& restart, std::atomic& cleanup) -{ - ModelState* model_state = reinterpret_cast(Model()); - int max_batch_size = model_state->MaxBatchSize(); - std::string name = model_state->Name(); - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("model ") + model_state->Name() + ", instance " + Name() + - ", executing " + std::to_string(request_count) + " requests") - .c_str()); + // Stub process + if (pid == 0) { + const char* stub_args[4]; + stub_args[0] = "bash"; + stub_args[1] = "-c"; + stub_args[3] = nullptr; // Last argument must be nullptr - uint64_t exec_start_ns = 0; - SET_TIMESTAMP(exec_start_ns); + // Default Python backend stub + std::string python_backend_stub = + model_state->StateForBackend()->python_lib + + "/triton_python_backend_stub"; - // For each request collect the total batch size for this inference - // execution. The batch-size, number of inputs, and size of each - // input has already been checked so don't need to do that here. + // Path to alternative Python backend stub + std::string model_python_backend_stub = + std::string(model_path) + "/triton_python_backend_stub"; - size_t total_batch_size = 0; - for (size_t i = 0; i < request_count; i++) { - // If we get a nullptr request then something is badly wrong. Fail - // and release all requests. 
- if (requests[i] == nullptr) { - RequestsRespondWithError( - requests, request_count, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "null request given to Python backend for '" + name + "'") - .c_str())); - return; + if (FileExists(model_python_backend_stub)) { + python_backend_stub = model_python_backend_stub; } - } - // We take the responsibility of the responses. - std::shared_ptr> responses( - new std::vector()); - responses->reserve(request_count); - PbMetricReporter reporter( - TritonModelInstance(), requests, request_count, responses); - reporter.SetExecStartNs(exec_start_ns); + std::string bash_argument; - for (size_t i = 0; i < request_count; i++) { - TRITONBACKEND_Response* response; - auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); - if (err == nullptr) { - responses->emplace_back(response); - } else { - responses->emplace_back(nullptr); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); - TRITONSERVER_ErrorDelete(err); - } - } + // This shared memory variable indicates whether the stub process should + // revert the LD_LIBRARY_PATH changes to avoid shared library issues in + // executables and libraries. + ipc_control_->uses_env = false; + if (model_state->PythonExecutionEnv() != "") { + std::stringstream ss; - for (size_t i = 0; i < request_count; i++) { - if (max_batch_size > 0) { - // Retrieve the batch size from one of the inputs, if the model - // supports batching, the first dimension size is batch size - TRITONBACKEND_Input* input; - TRITONSERVER_Error* err = - TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); - if (err == nullptr) { - const int64_t* shape; - err = TRITONBACKEND_InputProperties( - input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); - total_batch_size += shape[0]; - } - if (err != nullptr) { - RESPOND_ALL_AND_RETURN_IF_ERROR(responses, request_count, err); - } + // Need to properly set the LD_LIBRARY_PATH so that Python environments + // using different python versions load properly. + ss << "source " << path_to_activate_ + << " && exec env LD_LIBRARY_PATH=" << path_to_libpython_ + << ":$LD_LIBRARY_PATH " << python_backend_stub << " " << model_path_ + << " " << shm_region_name_ << " " << shm_default_size << " " + << shm_growth_size << " " << parent_pid_ << " " + << model_state->StateForBackend()->python_lib << " " + << ipc_control_handle_ << " " << Name(); + ipc_control_->uses_env = true; + bash_argument = ss.str(); } else { - ++total_batch_size; + std::stringstream ss; + ss << " exec " << python_backend_stub << " " << model_path_ << " " + << shm_region_name_ << " " << shm_default_size << " " + << shm_growth_size << " " << parent_pid_ << " " + << model_state->StateForBackend()->python_lib << " " + << ipc_control_handle_ << " " << Name(); + bash_argument = ss.str(); } - } - + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("Starting Python backend stub: ") + bash_argument) + .c_str()); + + stub_args[2] = bash_argument.c_str(); + + int stub_status_code = + system((python_backend_stub + "> /dev/null 2>&1").c_str()); + + // If running stub process without any arguments returns any status code, + // other than 1, it can indicate a permission issue as a result of + // downloading the stub process from a cloud object storage service. + if (WEXITSTATUS(stub_status_code) != 1) { + // Give the execute permission for the triton_python_backend_stub to the + // owner. 
+      int error = chmod(python_backend_stub.c_str(), S_IXUSR);
+      if (error != 0) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INTERNAL,
+            (std::string("Failed to give execute permission to "
+                         "triton_python_backend_stub in ") +
+             python_backend_stub + " " + Name() +
+             " Error No.: " + std::to_string(error))
+                .c_str());
+      }
+    }
+
+    if (execvp("bash", (char**)stub_args) != 0) {
+      std::stringstream ss;
+      ss << "Failed to run python backend stub. Errno = " << errno << '\n'
+         << "Python backend stub path: " << python_backend_stub << '\n'
+         << "Shared Memory Region Name: " << shm_region_name_ << '\n'
+         << "Shared Memory Default Byte Size: " << shm_default_size << '\n'
+         << "Shared Memory Growth Byte Size: " << shm_growth_size << '\n';
+      std::string log_message = ss.str();
+      LOG_MESSAGE(TRITONSERVER_LOG_ERROR, log_message.c_str());
+
+      return TRITONSERVER_ErrorNew(
+          TRITONSERVER_ERROR_INTERNAL,
+          (std::string("Failed to initialize model instance ") + Name())
+              .c_str());
+    }
+
+  } else {
+    ScopedDefer _(std::bind([this] {
+      // Push a dummy message to the message queue so that the stub
+      // process is notified that it can release the object stored in
+      // shared memory.
+      stub_message_queue_->Push(1000);
+    }));
+
+    stub_pid_ = pid;
+    triton::common::TritonJson::WriteBuffer buffer;
+    Model()->ModelConfig().Write(&buffer);
+
+    std::unordered_map<std::string, std::string> initialize_map = {
+        {"model_config", buffer.MutableContents()},
+        {"model_instance_kind", TRITONSERVER_InstanceGroupKindString(kind_)},
+        {"model_instance_name", name_},
+        {"model_instance_device_id", std::to_string(device_id_)},
+        {"model_repository", model_state->RepositoryPath()},
+        {"model_version", std::to_string(model_state->Version())},
+        {"model_name", model_state->Name()}};
+
+    std::unique_ptr<IPCMessage> initialize_message =
+        IPCMessage::Create(shm_pool_, false /* inline_response */);
+    initialize_message->Command() = PYTHONSTUB_InitializeRequest;
+
+    std::unique_ptr<PbMap> pb_map = PbMap::Create(shm_pool_, initialize_map);
+    bi::managed_external_buffer::handle_t initialize_map_handle =
+        pb_map->ShmHandle();
+
+    initialize_message->Args() = initialize_map_handle;
+    stub_message_queue_->Push(initialize_message->ShmHandle());
+
+    std::unique_ptr<IPCMessage> initialize_response_message =
+        IPCMessage::LoadFromSharedMemory(
+            shm_pool_, parent_message_queue_->Pop());
+
+    if (initialize_response_message->Command() !=
+        PYTHONSTUB_InitializeResponse) {
+      return TRITONSERVER_ErrorNew(
+          TRITONSERVER_ERROR_INTERNAL,
+          (std::string(
+               "Received unexpected response from Python backend stub: ") +
+           name_)
+              .c_str());
+    }
+
+    auto initialize_response =
+        std::move((shm_pool_->Load<InitializeResponseShm>(
+                      initialize_response_message->Args())))
+            .data_;
+
+    if (initialize_response->response_has_error) {
+      if (initialize_response->response_is_error_set) {
+        std::unique_ptr<PbString> error_message =
+            PbString::LoadFromSharedMemory(
+                shm_pool_, initialize_response->response_error);
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INTERNAL, error_message->String().c_str());
+      } else {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INTERNAL,
+            (std::string("Initialize() failed for ") + model_state->Name())
+                .c_str());
+      }
+    }
+
+    initialized_ = true;
+  }
+
+  return nullptr; // success
+}
+
+TRITONSERVER_Error*
+ModelInstanceState::SetupStubProcess()
+{
+  std::string kind = TRITONSERVER_InstanceGroupKindString(kind_);
+  ModelState* model_state = reinterpret_cast<ModelState*>(Model());
+
+  // Increase the stub process count to avoid shared memory region name
+  // collision
model_state->StateForBackend()->number_of_instance_inits++; + shm_region_name_ = + model_state->StateForBackend()->shared_memory_region_prefix + + std::to_string(model_state->StateForBackend()->number_of_instance_inits); + int64_t shm_default_size = + model_state->StateForBackend()->shm_default_byte_size; + int64_t shm_growth_byte_size = + model_state->StateForBackend()->shm_growth_byte_size; + + try { + shm_pool_ = std::make_unique( + shm_region_name_, shm_default_size, shm_growth_byte_size, + true /* create */); + } + catch (const PythonBackendException& pb_exception) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); + } + + AllocatedSharedMemory ipc_control = + shm_pool_->Construct(); + ipc_control_ = std::move(ipc_control.data_); + ipc_control_handle_ = ipc_control.handle_; + + uint64_t model_version = model_state->Version(); + const char* model_path = model_state->RepositoryPath().c_str(); + + std::stringstream ss; + std::string artifact_name; + RETURN_IF_ERROR(model_state->ModelConfig().MemberAsString( + "default_model_filename", &artifact_name)); + ss << model_path << "/" << model_version << "/"; + + if (artifact_name.size() > 0) { + ss << artifact_name; + } else { + // Default artifact name. + ss << "model.py"; + } + + model_path_ = ss.str(); + struct stat buffer; + + // Check if model.py exists + if (stat(model_path_.c_str(), &buffer) != 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("model.py does not exist in the model repository path: " + model_path_) + .c_str()); + } + + // Path to the extracted Python env + std::string python_execution_env = ""; + if (model_state->PythonExecutionEnv() != "") { + try { + python_execution_env = + model_state->StateForBackend()->env_manager->ExtractIfNotExtracted( + model_state->PythonExecutionEnv()); + } + catch (PythonBackendException& pb_exception) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); + } + + path_to_activate_ = python_execution_env + "/bin/activate"; + path_to_libpython_ = python_execution_env + "/lib"; + if (python_execution_env.length() > 0 && !FileExists(path_to_activate_)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("Path ") + path_to_activate_ + + " does not exist. 
The Python environment should contain an " + "'activate' script.") + .c_str()); + } + } + + parent_pid_ = getpid(); + + auto message_queue_size = + model_state->StateForBackend()->shm_message_queue_size; + + RETURN_IF_EXCEPTION( + stub_message_queue_ = + MessageQueue::Create(shm_pool_, message_queue_size)); + RETURN_IF_EXCEPTION( + parent_message_queue_ = + MessageQueue::Create(shm_pool_, message_queue_size)); + ipc_control_->parent_message_queue = parent_message_queue_->ShmHandle(); + ipc_control_->stub_message_queue = stub_message_queue_->ShmHandle(); + + RETURN_IF_ERROR(StartStubProcess()); + + return nullptr; +} + + +TRITONSERVER_Error* +ModelInstanceState::GetInputTensor( + const uint32_t input_idx, std::shared_ptr& input_tensor, + TRITONBACKEND_Request* request, + std::shared_ptr>& responses) +{ + NVTX_RANGE(nvtx_, "GetInputTensor " + Name()); + const char* input_name; + // Load iidx'th input name + RETURN_IF_ERROR( + TRITONBACKEND_RequestInputName(request, input_idx, &input_name)); + + // Load iidx'th input + TRITONBACKEND_Input* in; + RETURN_IF_ERROR(TRITONBACKEND_RequestInput(request, input_name, &in)); + + // Load input properties + TRITONSERVER_DataType input_dtype; + const int64_t* input_shape; + uint32_t input_dims_count; + uint64_t input_byte_size; + uint32_t input_buffer_count; + + RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy( + in, HostPolicyName().c_str(), &input_name, &input_dtype, &input_shape, + &input_dims_count, &input_byte_size, &input_buffer_count)); + + BackendInputCollector collector( + &request, 1, responses.get(), Model()->TritonMemoryManager(), + false /* pinned_enable */, CudaStream(), nullptr, nullptr, 0, + HostPolicyName().c_str()); + + ModelState* model_state = reinterpret_cast(Model()); + bool cpu_only_tensors = model_state->ForceCPUOnlyInputTensors(); + if (input_dtype == TRITONSERVER_TYPE_BYTES) { + cpu_only_tensors = true; + } + +#ifdef TRITON_ENABLE_GPU + CUDAHandler& cuda_handler = CUDAHandler::getInstance(); + // If CUDA driver API is not available, the input tensors will be moved to + // CPU. + if (!cuda_handler.IsAvailable()) { + cpu_only_tensors = true; + } +#endif + + TRITONSERVER_MemoryType src_memory_type; + int64_t src_memory_type_id; + size_t src_byte_size; + const void* src_ptr; + RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( + in, 0 /* input buffer index */, &src_ptr, &src_byte_size, + &src_memory_type, &src_memory_type_id)); + +// If TRITON_ENABLE_GPU is false, we need to copy the tensors +// to the CPU. 
+#ifndef TRITON_ENABLE_GPU + cpu_only_tensors = true; +#endif // TRITON_ENABLE_GPU + + if (cpu_only_tensors || src_memory_type != TRITONSERVER_MEMORY_GPU) { + input_tensor = std::make_shared( + std::string(input_name), + std::vector(input_shape, input_shape + input_dims_count), + input_dtype, TRITONSERVER_MEMORY_CPU /* memory_type */, + 0 /* memory_type_id */, nullptr /* buffer ptr*/, input_byte_size, + nullptr /* DLManagedTensor */); + RETURN_IF_EXCEPTION( + input_tensor->SaveToSharedMemory(shm_pool_, false /* copy_gpu */)); + char* input_buffer = reinterpret_cast(input_tensor->DataPtr()); + collector.ProcessTensor( + input_name, input_buffer, input_byte_size, + TRITONSERVER_MEMORY_CPU /* memory_type */, 0 /* memory_type_id */); + } else { +#ifdef TRITON_ENABLE_GPU + + // Retreiving GPU input tensors + const void* buffer = nullptr; + std::vector> alloc_perference; + alloc_perference = {{TRITONSERVER_MEMORY_GPU, src_memory_type_id}}; + RETURN_IF_ERROR(collector.ProcessTensor( + input_name, nullptr, 0, alloc_perference, + reinterpret_cast(&buffer), &input_byte_size, + &src_memory_type, &src_memory_type_id)); + + // If the tensor is using the cuda shared memory, we need to extract the + // handle that was used to create the device pointer. This is because of a + // limitation in the legacy CUDA IPC API that doesn't allow getting the + // handle of an exported pointer. If the cuda handle exists, it indicates + // that the cuda shared memory was used and the input is in a single buffer. + // [FIXME] for the case where the input is in cuda shared memory and uses + // multiple input buffers this needs to be changed. + TRITONSERVER_BufferAttributes* buffer_attributes; + + // This value is not used. + const void* buffer_p; + RETURN_IF_ERROR(TRITONBACKEND_InputBufferAttributes( + in, 0, &buffer_p, &buffer_attributes)); + + input_tensor = std::make_shared( + std::string(input_name), + std::vector(input_shape, input_shape + input_dims_count), + input_dtype, src_memory_type, src_memory_type_id, + const_cast(buffer), input_byte_size, + nullptr /* DLManagedTensor */); + + cudaIpcMemHandle_t* cuda_ipc_handle; + RETURN_IF_ERROR(TRITONSERVER_BufferAttributesCudaIpcHandle( + buffer_attributes, reinterpret_cast(&cuda_ipc_handle))); + if (cuda_ipc_handle != nullptr) { + RETURN_IF_EXCEPTION( + input_tensor->SaveToSharedMemory(shm_pool_, false /* copy_gpu */)); + RETURN_IF_EXCEPTION( + input_tensor->Memory()->SetCudaIpcHandle(cuda_ipc_handle)); + } else { + RETURN_IF_EXCEPTION( + input_tensor->SaveToSharedMemory(shm_pool_, true /* copy_gpu */)); + } +#else + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "Python backend does not support GPU tensors."); +#endif // TRITON_ENABLE_GPU + } + + return nullptr; +} + +void +ModelInstanceState::ExecuteBLSRequest( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t message_offset) +{ + ModelState* model_state = reinterpret_cast(Model()); + auto request_executor = + std::make_unique(model_state->TritonServer()); + std::unique_ptr ipc_message = + IPCMessage::LoadFromSharedMemory(shm_pool, message_offset); + bool is_response_batch_set = false; + std::unique_ptr infer_response; + ResponseBatch* response_batch; + TRITONSERVER_InferenceResponse* inference_response = nullptr; + std::unique_ptr pb_error_message; + std::unique_ptr bls_response; + AllocatedSharedMemory response_batch_shm; + try { + bls_response = IPCMessage::Create(shm_pool_, false /* inline_response */); + + AllocatedSharedMemory request_batch = + 
shm_pool_->Load(ipc_message->Args()); + RequestBatch* request_batch_shm_ptr = + reinterpret_cast(request_batch.data_.get()); + + bls_response->Command() = PYTHONSTUB_InferExecResponse; + ipc_message->ResponseHandle() = bls_response->ShmHandle(); + + // The response batch of the handle will contain a ResponseBatch + response_batch_shm = shm_pool_->Construct( + sizeof(ResponseBatch) + sizeof(bi::managed_external_buffer::handle_t)); + response_batch = + reinterpret_cast(response_batch_shm.data_.get()); + bi::managed_external_buffer::handle_t* response_handle = + reinterpret_cast( + response_batch_shm.data_.get() + sizeof(ResponseBatch)); + bls_response->Args() = response_batch_shm.handle_; + + response_batch->batch_size = 1; + response_batch->has_error = false; + response_batch->is_error_set = false; + response_batch->cleanup = false; + is_response_batch_set = true; + bool has_gpu_tensor = false; + + PythonBackendException pb_exception(std::string{}); + + uint32_t gpu_buffers_count = 0; + if (request_batch_shm_ptr->batch_size == 1) { + std::shared_ptr infer_request; + bi::managed_external_buffer::handle_t* request_handle = + reinterpret_cast( + request_batch.data_.get() + sizeof(RequestBatch)); + infer_request = InferRequest::LoadFromSharedMemory( + shm_pool_, *request_handle, false /* open_cuda_handle */); + + // If the BLS inputs are in GPU an additional round trip between the + // stub process and the main process is required. The reason is that we + // need to first allocate the GPU memory from the memory pool and then + // ask the stub process to fill in those allocated buffers. + for (auto& input_tensor : infer_request->Inputs()) { + if (!input_tensor->IsCPU()) { +#ifdef TRITON_ENABLE_GPU + gpu_buffers_count++; + BackendMemory* backend_memory; + std::unique_ptr lbackend_memory; + has_gpu_tensor = true; + TRITONSERVER_Error* error = BackendMemory::Create( + Model()->TritonMemoryManager(), + {BackendMemory::AllocationType::GPU_POOL, + BackendMemory::AllocationType::GPU}, + input_tensor->MemoryTypeId(), input_tensor->ByteSize(), + &backend_memory); + if (error != nullptr) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, TRITONSERVER_ErrorMessage(error)); + break; + } + lbackend_memory.reset(backend_memory); + input_tensor->SetMemory(std::move( + PbMemory::Create(shm_pool_, std::move(lbackend_memory)))); +#endif // TRITON_ENABLE_GPU + } + } + AllocatedSharedMemory gpu_handles; + + // Wait for the extra round trip to complete. The stub process will fill + // in the data for the GPU tensors. If there is an error, the extra round + // trip must be still completed, otherwise the stub process will always be + // waiting for a message from the parent process. 
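+      // Each GPU input's memory handle is written into a shared-memory array
+      // referenced by gpu_buffers_handle; the condition variable below first
+      // wakes the stub and then blocks until the stub has filled those
+      // buffers with the input data.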
+ if (has_gpu_tensor) { + gpu_handles = + shm_pool_->Construct( + gpu_buffers_count); + request_batch_shm_ptr->gpu_buffers_count = gpu_buffers_count; + request_batch_shm_ptr->gpu_buffers_handle = gpu_handles.handle_; + + size_t i = 0; + for (auto& input_tensor : infer_request->Inputs()) { + if (!input_tensor->IsCPU()) { + gpu_handles.data_.get()[i] = input_tensor->Memory()->ShmHandle(); + ++i; + } + } + + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + ipc_message->ResponseCondition()->notify_all(); + ipc_message->ResponseCondition()->wait(lock); + } + + if (pb_exception.what() != nullptr) { + infer_response = request_executor->Infer( + infer_request, shm_pool_, &inference_response); + + if (inference_response != nullptr) { + std::lock_guard lock{bls_responses_mutex_}; + bls_inference_responses_.push_back(inference_response); + } + + infer_response->SaveToSharedMemory(shm_pool_); + *response_handle = infer_response->ShmHandle(); + } else { + throw pb_exception; + } + } + } + catch (const PythonBackendException& pb_exception) { + if (is_response_batch_set) { + response_batch->has_error = true; + LOG_IF_EXCEPTION( + pb_error_message = PbString::Create(shm_pool, pb_exception.what())); + + if (pb_error_message != nullptr) { + response_batch->is_error_set = true; + response_batch->error = pb_error_message->ShmHandle(); + } + } else { + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, pb_exception.what()); + } + } + + { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + ipc_message->ResponseCondition()->notify_all(); + ipc_message->ResponseCondition()->wait(lock); + } + + request_executors_.emplace_back(std::move(request_executor)); +} + +void +ModelInstanceState::ProcessRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count, + bool& restart) +{ + NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); + ModelState* model_state = reinterpret_cast(Model()); + int max_batch_size = model_state->MaxBatchSize(); + std::string name = model_state->Name(); + + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("model ") + model_state->Name() + ", instance " + Name() + + ", executing " + std::to_string(request_count) + " requests") + .c_str()); + + uint64_t exec_start_ns = 0; + SET_TIMESTAMP(exec_start_ns); + + // For each request collect the total batch size for this inference + // execution. The batch-size, number of inputs, and size of each + // input has already been checked so don't need to do that here. + + size_t total_batch_size = 0; + for (size_t i = 0; i < request_count; i++) { + // If we get a nullptr request then something is badly wrong. Fail + // and release all requests. + if (requests[i] == nullptr) { + RequestsRespondWithError( + requests, request_count, + TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "null request given to Python backend for '" + name + "'") + .c_str())); + return; + } + } + + // We take the responsibility of the responses. 
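+  // One TRITONBACKEND_Response is created per request below; when creating a
+  // response fails, a nullptr placeholder is stored so that the entries in
+  // `responses` stay aligned with `requests`.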
+ std::shared_ptr> responses( + new std::vector()); + responses->reserve(request_count); + PbMetricReporter reporter( + TritonModelInstance(), requests, request_count, responses); + reporter.SetExecStartNs(exec_start_ns); + + for (size_t i = 0; i < request_count; i++) { + TRITONBACKEND_Response* response; + auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); + if (err == nullptr) { + responses->emplace_back(response); + } else { + responses->emplace_back(nullptr); + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); + TRITONSERVER_ErrorDelete(err); + } + } + + for (size_t i = 0; i < request_count; i++) { + if (max_batch_size > 0) { + // Retrieve the batch size from one of the inputs, if the model + // supports batching, the first dimension size is batch size + TRITONBACKEND_Input* input; + TRITONSERVER_Error* err = + TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input); + if (err == nullptr) { + const int64_t* shape; + err = TRITONBACKEND_InputProperties( + input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr); + total_batch_size += shape[0]; + } + if (err != nullptr) { + RESPOND_ALL_AND_RETURN_IF_ERROR(responses, request_count, err); + } + } else { + ++total_batch_size; + } + } + // If there are no valid payloads then no need to run the inference. if (total_batch_size == 0) { return; @@ -705,31 +1192,28 @@ ModelInstanceState::ProcessRequests( } std::unique_ptr ipc_message = - std::make_unique(shm_pool_, false /*inline_response*/); + IPCMessage::Create(shm_pool_, false /*inline_response*/); ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_ExecuteRequest; - RequestBatch* request_batch; - off_t request_batch_offset; + AllocatedSharedMemory request_batch; RESPOND_ALL_AND_RETURN_IF_EXCEPTION( responses, request_count, - shm_pool_->Map( - (char**)&request_batch, sizeof(RequestBatch), request_batch_offset)); + request_batch = shm_pool_->Construct( + sizeof(RequestBatch) + + request_count * sizeof(bi::managed_external_buffer::handle_t))); - ipc_message->Args() = request_batch_offset; - request_batch->batch_size = request_count; + RequestBatch* request_batch_shm_ptr = + reinterpret_cast(request_batch.data_.get()); + request_batch_shm_ptr->batch_size = request_count; + ipc_message->Args() = request_batch.handle_; - Request* requests_shm; - off_t requests_shm_offset; - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - responses, request_count, - shm_pool_->Map( - (char**)&requests_shm, sizeof(Request) * request_count, - requests_shm_offset)); - request_batch->requests = requests_shm_offset; + bi::managed_external_buffer::handle_t* requests_shm = + reinterpret_cast( + request_batch.data_.get() + sizeof(RequestBatch)); + std::vector> pb_inference_requests; for (uint32_t r = 0; r < request_count; ++r) { TRITONBACKEND_Request* request = requests[r]; - Request* python_infer_request = &requests_shm[r]; uint32_t requested_input_count = 0; RESPOND_ALL_AND_RETURN_IF_ERROR( responses, request_count, @@ -739,27 +1223,14 @@ ModelInstanceState::ProcessRequests( RESPOND_ALL_AND_RETURN_IF_ERROR( responses, request_count, TRITONBACKEND_RequestOutputCount(request, &requested_output_count)); - python_infer_request->requested_output_count = requested_output_count; - - Tensor* input_tensors; - off_t input_tensors_offset; - - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - responses, request_count, - shm_pool_->Map( - (char**)&input_tensors, sizeof(Tensor) * requested_input_count, - input_tensors_offset)); - python_infer_request->inputs = input_tensors_offset; std::vector> 
pb_input_tensors; for (size_t iidx = 0; iidx < requested_input_count; ++iidx) { - Tensor* input_tensor = &input_tensors[iidx]; std::shared_ptr pb_input_tensor; RESPOND_ALL_AND_RETURN_IF_ERROR( responses, request_count, - GetInputTensor( - iidx, input_tensor, pb_input_tensor, request, responses)); + GetInputTensor(iidx, pb_input_tensor, request, responses)); pb_input_tensors.emplace_back(std::move(pb_input_tensor)); } @@ -788,12 +1259,15 @@ ModelInstanceState::ProcessRequests( RESPOND_ALL_AND_RETURN_IF_ERROR( responses, request_count, TRITONBACKEND_RequestFlags(request, &flags)); - InferRequest infer_request( - id, correlation_id, pb_input_tensors, requested_output_names, - model_state->Name(), model_state->Version(), flags); + std::unique_ptr infer_request = + std::make_unique( + id, correlation_id, pb_input_tensors, requested_output_names, + model_state->Name(), model_state->Version(), flags); + RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - responses, request_count, - infer_request.SaveToSharedMemory(shm_pool_, python_infer_request)); + responses, request_count, infer_request->SaveToSharedMemory(shm_pool_)); + requests_shm[r] = infer_request->ShmHandle(); + pb_inference_requests.emplace_back(std::move(infer_request)); } uint64_t compute_start_ns = 0; @@ -809,10 +1283,23 @@ ModelInstanceState::ProcessRequests( return; } - off_t response_message; - SendMessageAndReceiveResponse( - ipc_message->SharedMemoryOffset(), response_message, restart, responses, - requests, request_count); + bi::managed_external_buffer::handle_t response_message; + { + NVTX_RANGE(nvtx_, "StubProcessing " + Name()); + SendMessageAndReceiveResponse( + ipc_message->ShmHandle(), response_message, restart, responses, + requests, request_count); + } + + + ScopedDefer execute_finalize(std::bind([this, &restart] { + // Push a dummy message to the message queue so that + // the stub process is notified that it can release + // the object stored in shared memory. + NVTX_RANGE(nvtx_, "RequestExecuteFinalize " + Name()); + if (!restart) + stub_message_queue_->Push(1000); + })); if (restart) { return; } @@ -822,15 +1309,18 @@ ModelInstanceState::ProcessRequests( ipc_message = IPCMessage::LoadFromSharedMemory(shm_pool_, response_message)); - // If the stub command is no longer PYTHONSTUB_InferExecRequest, it - // indicates that inference request exeuction has finished. + // If the stub command is no longer PYTHONSTUB_InferExecRequest, it indicates + // that inference request exeuction has finished and there are no more BLS + // requests to execute. Otherwise, the Python backend will continuosly execute + // BLS requests pushed to the message queue. while (ipc_message->Command() == PYTHONSTUB_CommandType::PYTHONSTUB_InferExecRequest) { - off_t current_message = response_message; + bi::managed_external_buffer::handle_t current_message = response_message; + // Launch the BLS request in a future. 
bls_futures_.emplace_back( - std::async(std::launch::async, [this, current_message, &cleanup]() { - this->ExecuteBLSRequest(this->shm_pool_, current_message, cleanup); + std::async(std::launch::async, [this, current_message]() { + this->ExecuteBLSRequest(this->shm_pool_, current_message); })); auto error = ReceiveMessageFromStub(response_message); @@ -852,23 +1342,27 @@ ModelInstanceState::ProcessRequests( reporter.SetComputeEndNs(compute_end_ns); // Parsing the request response - ResponseBatch* response_batch; + AllocatedSharedMemory response_batch; RESPOND_ALL_AND_RETURN_IF_EXCEPTION( responses, request_count, - shm_pool_->MapOffset((char**)&response_batch, ipc_message->Args())); + response_batch = shm_pool_->Load(ipc_message->Args())); + + ResponseBatch* response_batch_shm_ptr = + reinterpret_cast(response_batch.data_.get()); // If inference fails, release all the requests and send an error response. // If inference fails at this stage, it usually indicates a bug in the model // code - if (response_batch->has_error) { - if (response_batch->is_error_set) { - char* error_message; + if (response_batch_shm_ptr->has_error) { + if (response_batch_shm_ptr->is_error_set) { + std::unique_ptr error_message_shm; RESPOND_ALL_AND_RETURN_IF_EXCEPTION( responses, request_count, - LoadStringFromSharedMemory( - shm_pool_, response_batch->error, error_message)); + error_message_shm = PbString::LoadFromSharedMemory( + shm_pool_, response_batch_shm_ptr->error)); RespondErrorToAllRequests( - error_message, responses, requests, request_count); + error_message_shm->String().c_str(), responses, requests, + request_count); } else { const char* error_message = "Failed to fetch the error in response batch."; @@ -878,58 +1372,41 @@ ModelInstanceState::ProcessRequests( return; } - Response* responses_shm; - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - responses, request_count, - shm_pool_->MapOffset((char**)&responses_shm, response_batch->responses)); - + bi::managed_external_buffer::handle_t* response_shm_handle = + reinterpret_cast( + response_batch.data_.get() + sizeof(ResponseBatch)); + // If the output provided by the model is in GPU, we will pass the list of + // buffers provided by Triton to the stub process. bool has_gpu_output = false; - // The vector that stores the tensor pairs for the tensors that the stub - // provides in GPU but the output buffer provided by Triton is in CPU. 
- std::vector, std::pair>> - tensor_buffer_pairs; + // GPU output buffers + std::vector, std::pair>> + gpu_output_buffers; for (uint32_t r = 0; r < request_count; ++r) { + NVTX_RANGE(nvtx_, "LoadingResponse " + Name()); TRITONBACKEND_Response* response = (*responses)[r]; TRITONBACKEND_Request* request = requests[r]; uint32_t requested_output_count = 0; std::unique_ptr infer_response; try { - std::shared_ptr cuda_ipc_mutex; infer_response = InferResponse::LoadFromSharedMemory( - shm_pool_, response_batch->responses + sizeof(Response) * r, - cuda_ipc_mutex /* cuda_ipc_open_mutex */, - cuda_ipc_mutex /* cuda_ipc_close_mutex */); + shm_pool_, response_shm_handle[r], false /* open_cuda_handle */); if (infer_response->HasError()) { - if (infer_response->IsErrorMessageSet()) { - TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - infer_response->Error()->Message().c_str()); - - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), - "failed sending response"); - TRITONSERVER_ErrorDelete(err); - } else { - const char* err_string = "Failed to process response."; - TRITONSERVER_Error* err = - TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, err_string); - - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), - "failed sending response"); - TRITONSERVER_ErrorDelete(err); - } + TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + infer_response->Error()->Message().c_str()); + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), + "failed sending response"); + TRITONSERVER_ErrorDelete(err); (*responses)[r] = nullptr; - // If has_error is true, we do not look at the response even if the - // response is set. + // If has_error is true, we do not look at the response tensors. 
continue; } } @@ -996,453 +1473,151 @@ ModelInstanceState::ProcessRequests( TRITONBACKEND_OutputBufferAttributes( response_output, &output_buffer_attributes)); + std::unique_ptr output_buffer; if (src_memory_type == TRITONSERVER_MEMORY_GPU && actual_memory_type == TRITONSERVER_MEMORY_GPU) { -#ifdef TRITON_ENABLE_GPU - if ((*responses)[r] != nullptr) { +#ifdef TRITON_ENABLE_GPU cudaIpcMemHandle_t* cuda_ipc_mem_handle_p; GUARDED_RESPOND_IF_ERROR( responses, r, TRITONSERVER_BufferAttributesCudaIpcHandle( - output_buffer_attributes, - reinterpret_cast(&cuda_ipc_mem_handle_p))); - - cudaSetDevice(output_tensor->MemoryTypeId()); - output_tensor->SetMemoryType(actual_memory_type); - output_tensor->SetMemoryTypeId(actual_memory_type_id); - output_tensor->SetDataPtr(buffer); - - if (cuda_ipc_mem_handle_p != nullptr) { - output_tensor->SetCudaIpcMemHandle(cuda_ipc_mem_handle_p); - output_tensor->RawDataShm()->offset = - output_tensor->GetGPUPointerOffset(); - } else { - cudaIpcMemHandle_t cuda_ipc_mem_handle; - cudaError_t err = cudaIpcGetMemHandle(&cuda_ipc_mem_handle, buffer); - output_tensor->SetCudaIpcMemHandle(&cuda_ipc_mem_handle); - if (err != cudaSuccess) { - GUARDED_RESPOND_IF_ERROR( - responses, r, - TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "failed to get cuda ipc handle: " + - std::string(cudaGetErrorString(err))) - .c_str())); - } else { - output_tensor->RawDataShm()->offset = - output_tensor->GetGPUPointerOffset(); - } - } - } -#endif - } - - if (src_memory_type == TRITONSERVER_MEMORY_GPU && - (actual_memory_type == TRITONSERVER_MEMORY_CPU || - actual_memory_type == TRITONSERVER_MEMORY_CPU_PINNED)) { - tensor_buffer_pairs.push_back({output_tensor, {buffer, r}}); - - // Set the memory type to CPU in shared memory. The stubs notices the - // change in the memory type and should copy the input tensors to CPU. - output_tensor->RawDataShm()->memory_type = TRITONSERVER_MEMORY_CPU; - output_tensor->RawDataShm()->memory_type_id = actual_memory_type_id; - } - - if (src_memory_type != TRITONSERVER_MEMORY_GPU) { - GUARDED_RESPOND_IF_ERROR( - responses, r, - CopyBuffer( - "Failed to copy the output tensor to buffer.", src_memory_type, - src_memory_type_id, actual_memory_type, actual_memory_type_id, - output_tensor->ByteSize(), output_tensor->GetDataPtr(), buffer, - CudaStream(), &cuda_used)); - } - cuda_copy |= cuda_used; - } -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); - } -#endif // TRITON_ENABLE_GPU - } - - // If the output tensor is in GPU, there will be a second round trip - // required for filling the GPU buffers provided by the main process. - if (has_gpu_output) { - ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_LoadGPUBuffers; - SendMessageAndReceiveResponse( - ipc_message->SharedMemoryOffset(), response_message, restart, responses, - requests, 0); - - bool cuda_copy = false; - for (auto& tensor_buffer_pair : tensor_buffer_pairs) { - bool cuda_used = false; - auto& tensor = tensor_buffer_pair.first; - - // Reload the tensor from shared memory so that the memory data is - // updated. 
- std::shared_ptr cuda_ipc_mutex; - std::shared_ptr reloaded_tensor = - PbTensor::LoadFromSharedMemory( - shm_pool_, tensor->ShmOffset(), - cuda_ipc_mutex /* cuda_ipc_open_mutex */, - cuda_ipc_mutex /* cuda_ipc_close_mutex */); - auto& buffer = tensor_buffer_pair.second.first; - auto& response_index = tensor_buffer_pair.second.second; - GUARDED_RESPOND_IF_ERROR( - responses, response_index, - CopyBuffer( - "Failed to copy the output tensor to buffer.", - TRITONSERVER_MEMORY_CPU, 0, TRITONSERVER_MEMORY_CPU, 0, - reloaded_tensor->ByteSize(), reloaded_tensor->GetDataPtr(), - buffer, CudaStream(), &cuda_used)); - cuda_copy |= cuda_used; - } -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); - } -#endif // TRITON_ENABLE_GPU - } - - for (uint32_t r = 0; r < request_count; ++r) { - // If error happens at this stage, we can only log it - GUARDED_RESPOND_IF_ERROR( - responses, r, - TRITONBACKEND_ResponseSend( - (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr)); - } - - uint64_t exec_end_ns = 0; - SET_TIMESTAMP(exec_end_ns); - reporter.SetExecEndNs(exec_end_ns); - reporter.SetBatchStatistics(total_batch_size); - - return; -} - -void -ModelInstanceState::CleanupBLSResponses() -{ - for (auto& bls_inference_response : bls_inference_responses_) { - LOG_IF_ERROR( - TRITONSERVER_InferenceResponseDelete(bls_inference_response), - " failed to release BLS inference response."); - } - - bls_inference_responses_.clear(); - request_executors_.clear(); -} - -bool -ModelInstanceState::IsStubProcessAlive() -{ - boost::posix_time::ptime timeout = - boost::get_system_time() + boost::posix_time::seconds(1); - bi::scoped_lock lock(*health_mutex_, timeout); - - // Check if lock has been acquired. - if (lock) { - return ipc_control_->stub_health; - } else { - // If It failed to obtain the lock, it means that the stub has been - // stuck or exited while holding the health mutex lock. - return false; - } -} - -TRITONSERVER_Error* -ModelInstanceState::StartStubProcess() -{ - new (health_mutex_) bi::interprocess_mutex; - stub_message_queue_->ResetSemaphores(); - parent_message_queue_->ResetSemaphores(); - - std::string kind = TRITONSERVER_InstanceGroupKindString(kind_); - - ModelState* model_state = reinterpret_cast(Model()); - int64_t shm_growth_size = - model_state->StateForBackend()->shm_growth_byte_size; - int64_t shm_default_size = - model_state->StateForBackend()->shm_default_byte_size; - const char* model_path = model_state->RepositoryPath().c_str(); - - initialized_ = false; - - pid_t pid = fork(); - if (pid < 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, "Failed to fork the stub process."); - } - - // Stub process - if (pid == 0) { - const char* stub_args[4]; - stub_args[0] = "bash"; - stub_args[1] = "-c"; - stub_args[3] = nullptr; // Last argument must be nullptr - - // Default Python backend stub - std::string python_backend_stub = - model_state->StateForBackend()->python_lib + - "/triton_python_backend_stub"; - - // Path to alternative Python backend stub - std::string model_python_backend_stub = - std::string(model_path) + "/triton_python_backend_stub"; - - if (FileExists(model_python_backend_stub)) { - python_backend_stub = model_python_backend_stub; - } - - std::string bash_argument; - - // This shared memory variable indicates whether the - // stub process should revert the LD_LIBRARY_PATH changes to avoid - // shared library issues in executables and libraries. 
- ipc_control_->uses_env = false; - if (model_state->PythonExecutionEnv() != "") { - std::stringstream ss; - ss << "source " << path_to_activate_ - << " && exec env LD_LIBRARY_PATH=" << path_to_libpython_ - << ":$LD_LIBRARY_PATH " << python_backend_stub << " " << model_path_ - << " " << shm_region_name_ << " " << shm_default_size << " " - << shm_growth_size << " " << parent_pid_ << " " - << model_state->StateForBackend()->python_lib << " " - << ipc_control_offset_ << " " << Name(); - ipc_control_->uses_env = true; - // Need to properly set the LD_LIBRARY_PATH so that Python environments - // using different python versions load properly. - bash_argument = ss.str(); - } else { - std::stringstream ss; - ss << " exec " << python_backend_stub << " " << model_path_ << " " - << shm_region_name_ << " " << shm_default_size << " " - << shm_growth_size << " " << parent_pid_ << " " - << model_state->StateForBackend()->python_lib << " " - << ipc_control_offset_ << " " << Name(); - bash_argument = ss.str(); - } - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("Starting Python backend stub: ") + bash_argument) - .c_str()); - - stub_args[2] = bash_argument.c_str(); - - int stub_status_code = - system((python_backend_stub + "> /dev/null 2>&1").c_str()); - - // If running stub process without any arguments returns any status code, - // other than 1, it can indicate a permission issue as a result of - // downloading the stub process from a cloud object storage service. - if (WEXITSTATUS(stub_status_code) != 1) { - // Give the execute permission for the triton_python_backend_stub to the - // owner. - int error = chmod(python_backend_stub.c_str(), S_IXUSR); - if (error != 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("Failed to give execute permission to " - "triton_python_backend_stub in ") + - python_backend_stub + " " + Name() + - " Error No.: " + std::to_string(error)) - .c_str()); - } - } - - if (execvp("bash", (char**)stub_args) != 0) { - std::stringstream ss; - ss << "Failed to run python backend stub. 
Errno = " << errno << '\n' - << "Python backend stub path: " << python_backend_stub << '\n' - << "Shared Memory Region Name: " << shm_region_name_ << '\n' - << "Shared Memory Default Byte Size: " << shm_default_size << '\n' - << "Shared Memory Growth Byte Size: " << shm_growth_size << '\n'; - std::string log_message = ss.str(); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, log_message.c_str()); - - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("Failed to initialize model instance ") + Name()) - .c_str()); - } - - } else { - stub_pid_ = pid; - triton::common::TritonJson::WriteBuffer buffer; - Model()->ModelConfig().Write(&buffer); - - std::unordered_map initialize_map = { - {"model_config", buffer.MutableContents()}, - {"model_instance_kind", TRITONSERVER_InstanceGroupKindString(kind_)}, - {"model_instance_name", name_}, - {"model_instance_device_id", std::to_string(device_id_)}, - {"model_repository", model_state->RepositoryPath()}, - {"model_version", std::to_string(model_state->Version())}, - {"model_name", model_state->Name()}}; - - std::unique_ptr initialize_message = - std::make_unique(shm_pool_, false /* inline_response */); - initialize_message->Command() = PYTHONSTUB_InitializeRequest; - - // TODO: Fix restart during initialize - off_t initialize_map_offset; - RETURN_IF_EXCEPTION(SaveMapToSharedMemory( - shm_pool_, initialize_map_offset, initialize_map)); - initialize_message->Args() = initialize_map_offset; - stub_message_queue_->Push(initialize_message->SharedMemoryOffset()); - - std::unique_ptr init_msg_response_mapped = - IPCMessage::LoadFromSharedMemory( - shm_pool_, parent_message_queue_->Pop()); - - if (init_msg_response_mapped->Command() != PYTHONSTUB_InitializeResponse) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string( - "Received unexpected resposne from Python backend stub: ") + - name_) - .c_str()); - } - - InitializeResponse* initialize_response; - RETURN_IF_EXCEPTION(shm_pool_->MapOffset( - (char**)&initialize_response, init_msg_response_mapped->Args())); + output_buffer_attributes, + reinterpret_cast(&cuda_ipc_mem_handle_p))); - if (initialize_response->response_has_error) { - if (initialize_response->response_is_error_set) { - char* err_message; - RETURN_IF_EXCEPTION(LoadStringFromSharedMemory( - shm_pool_, initialize_response->response_error, err_message)); - return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, err_message); - } else { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("Initialize() failed for ") + model_state->Name()) - .c_str()); + if (cuda_ipc_mem_handle_p != nullptr) { + GUARDED_RESPOND_IF_EXCEPTION( + responses, r, + output_buffer = PbMemory::Create( + shm_pool_, actual_memory_type, actual_memory_type_id, + output_tensor->ByteSize(), reinterpret_cast(buffer), + false /* copy_gpu */)); + output_buffer->SetCudaIpcHandle(cuda_ipc_mem_handle_p); + } else { + GUARDED_RESPOND_IF_EXCEPTION( + responses, r, + output_buffer = PbMemory::Create( + shm_pool_, actual_memory_type, actual_memory_type_id, + output_tensor->ByteSize(), reinterpret_cast(buffer), + true /* copy_gpu */)); + } + gpu_output_buffers.push_back({std::move(output_buffer), {buffer, r}}); +#endif + } } - } - - initialized_ = true; - } - return nullptr; // success -} + // When we requested a GPU buffer but received a CPU buffer. 
+ if (src_memory_type == TRITONSERVER_MEMORY_GPU && + (actual_memory_type == TRITONSERVER_MEMORY_CPU || + actual_memory_type == TRITONSERVER_MEMORY_CPU_PINNED)) { + GUARDED_RESPOND_IF_EXCEPTION( + responses, r, + output_buffer = PbMemory::Create( + shm_pool_, actual_memory_type, actual_memory_type_id, + 0 /* byte size */, nullptr /* data ptr */)); -TRITONSERVER_Error* -ModelInstanceState::SetupStubProcess() -{ - std::string kind = TRITONSERVER_InstanceGroupKindString(kind_); - ModelState* model_state = reinterpret_cast(Model()); + gpu_output_buffers.push_back({std::move(output_buffer), {buffer, r}}); + } - // Increase the stub process count to avoid shared memory region name - // collision - model_state->StateForBackend()->number_of_instance_inits++; - shm_region_name_ = - model_state->StateForBackend()->shared_memory_region_prefix + - std::to_string(model_state->StateForBackend()->number_of_instance_inits); - int64_t shm_growth_size = - model_state->StateForBackend()->shm_growth_byte_size; - int64_t shm_default_size = - model_state->StateForBackend()->shm_default_byte_size; + if (src_memory_type != TRITONSERVER_MEMORY_GPU) { + GUARDED_RESPOND_IF_ERROR( + responses, r, + CopyBuffer( + "Failed to copy the output tensor to buffer.", src_memory_type, + src_memory_type_id, actual_memory_type, actual_memory_type_id, + output_tensor->ByteSize(), output_tensor->DataPtr(), buffer, + CudaStream(), &cuda_used)); + } - try { - shm_pool_ = std::make_unique( - shm_region_name_, shm_default_size, shm_growth_size, - true /* truncate */); - } - catch (const PythonBackendException& pb_exception) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); + cuda_copy |= cuda_used; + } +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(stream_); + } +#endif // TRITON_ENABLE_GPU } - IPCControl* ipc_control; - shm_pool_->Map((char**)&ipc_control, sizeof(IPCControl), ipc_control_offset_); - ipc_control_ = ipc_control; - - bi::interprocess_mutex* health_mutex; - off_t health_mutex_offset; - RETURN_IF_EXCEPTION(shm_pool_->Map( - (char**)&health_mutex, sizeof(bi::interprocess_mutex), - health_mutex_offset)); - ipc_control_->stub_health_mutex = health_mutex_offset; - - health_mutex_ = health_mutex; - - uint64_t model_version = model_state->Version(); - const char* model_path = model_state->RepositoryPath().c_str(); + // Finalize the execute. + execute_finalize.Complete(); - std::stringstream ss; - std::string artifact_name; - RETURN_IF_ERROR(model_state->ModelConfig().MemberAsString( - "default_model_filename", &artifact_name)); - ss << model_path << "/" << model_version << "/"; + // If the output tensor is in GPU, there will be a second round trip + // required for filling the GPU buffers provided by the main process. + if (has_gpu_output) { + AllocatedSharedMemory gpu_buffers_handle = shm_pool_->Construct( + sizeof(uint64_t) + gpu_output_buffers.size() * + sizeof(bi::managed_external_buffer::handle_t)); + uint64_t* gpu_buffer_count = + reinterpret_cast(gpu_buffers_handle.data_.get()); + *gpu_buffer_count = gpu_output_buffers.size(); + bi::managed_external_buffer::handle_t* gpu_buffers_handle_shm = + reinterpret_cast( + gpu_buffers_handle.data_.get() + sizeof(uint64_t)); + + for (size_t i = 0; i < gpu_output_buffers.size(); i++) { + gpu_buffers_handle_shm[i] = gpu_output_buffers[i].first->ShmHandle(); + } - if (artifact_name.size() > 0) { - ss << artifact_name; - } else { - // Default artifact name. 
- ss << "model.py"; - } + ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_LoadGPUBuffers; + ipc_message->Args() = gpu_buffers_handle.handle_; + SendMessageAndReceiveResponse( + ipc_message->ShmHandle(), response_message, restart, responses, + requests, 0); - model_path_ = ss.str(); - struct stat buffer; + bool cuda_copy = false; - // Check if model.py exists - if (stat(model_path_.c_str(), &buffer) != 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("model.py does not exist in the model repository path: " + model_path_) - .c_str()); - } + // CPU tensors require an additional notification to the stub process. + // This is to ask the stub process to release the tensor. + bool has_cpu_tensor = false; + for (size_t i = 0; i < gpu_output_buffers.size(); i++) { + std::unique_ptr& memory = gpu_output_buffers[i].first; + if (memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { + bool cuda_used; + has_cpu_tensor = true; + std::unique_ptr pb_cpu_memory = + PbMemory::LoadFromSharedMemory( + shm_pool_, gpu_buffers_handle_shm[i], + false /* open cuda handle */); + uint32_t response_index = gpu_output_buffers[i].second.second; + void* pointer = gpu_output_buffers[i].second.first; - // Path to the extracted Python env - std::string python_execution_env = ""; - if (model_state->PythonExecutionEnv() != "") { - try { - python_execution_env = - model_state->StateForBackend()->env_manager->ExtractIfNotExtracted( - model_state->PythonExecutionEnv()); + GUARDED_RESPOND_IF_ERROR( + responses, response_index, + CopyBuffer( + "Failed to copy the output tensor to buffer.", + TRITONSERVER_MEMORY_CPU, 0, TRITONSERVER_MEMORY_CPU, 0, + pb_cpu_memory->ByteSize(), pb_cpu_memory->DataPtr(), pointer, + CudaStream(), &cuda_used)); + cuda_copy |= cuda_used; + } } - catch (PythonBackendException& pb_exception) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); + + if (has_cpu_tensor) { + // Any number would work here. + stub_message_queue_->Push(1000); } - path_to_activate_ = python_execution_env + "/bin/activate"; - path_to_libpython_ = python_execution_env + "/lib"; - if (python_execution_env.length() > 0 && !FileExists(path_to_activate_)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("Path ") + path_to_activate_ + - " does not exist. The Python environment should contain an " - "'activate' script.") - .c_str()); +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(stream_); } +#endif // TRITON_ENABLE_GPU } - parent_pid_ = getpid(); - - auto message_queue_size = - model_state->StateForBackend()->shm_message_queue_size; - - RETURN_IF_EXCEPTION( - stub_message_queue_ = - std::make_unique(shm_pool_, message_queue_size)); - RETURN_IF_EXCEPTION( - parent_message_queue_ = - std::make_unique(shm_pool_, message_queue_size)); - ipc_control_->parent_message_queue = parent_message_queue_->ShmOffset(); - ipc_control_->stub_message_queue = stub_message_queue_->ShmOffset(); - + for (uint32_t r = 0; r < request_count; ++r) { + // If error happens at this stage, we can only log it + GUARDED_RESPOND_IF_ERROR( + responses, r, + TRITONBACKEND_ResponseSend( + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr)); + } - // Offset that must be used for resetting the shared memory usage. 
- shm_reset_offset_ = shm_pool_->Offset(); - RETURN_IF_ERROR(StartStubProcess()); + uint64_t exec_end_ns = 0; + SET_TIMESTAMP(exec_end_ns); + reporter.SetExecEndNs(exec_end_ns); + reporter.SetBatchStatistics(total_batch_size); - return nullptr; + return; } ModelInstanceState::~ModelInstanceState() @@ -1467,10 +1642,10 @@ ModelInstanceState::~ModelInstanceState() if (healthy) { // Finalize command does not have any arguments. std::unique_ptr ipc_message = - std::make_unique(shm_pool_, false /* inline_response */); + IPCMessage::Create(shm_pool_, false /* inline_response */); ipc_message->Command() = PYTHONSTUB_FinalizeRequest; - stub_message_queue_->Push(ipc_message->SharedMemoryOffset()); + stub_message_queue_->Push(ipc_message->ShmHandle()); parent_message_queue_->Pop(); stub_message_queue_.reset(); @@ -1486,135 +1661,12 @@ ModelInstanceState::~ModelInstanceState() } waitpid(stub_pid_, &status, 0); } -} - -TRITONSERVER_Error* -ModelInstanceState::GetInputTensor( - const uint32_t input_idx, Tensor* input_tensor_shm, - std::shared_ptr& input_tensor, TRITONBACKEND_Request* request, - std::shared_ptr>& responses) -{ - const char* input_name; - // Load iidx'th input name - RETURN_IF_ERROR( - TRITONBACKEND_RequestInputName(request, input_idx, &input_name)); - - // Load iidx'th input - TRITONBACKEND_Input* in; - RETURN_IF_ERROR(TRITONBACKEND_RequestInput(request, input_name, &in)); - - // Load input properties - TRITONSERVER_DataType input_dtype; - const int64_t* input_shape; - uint32_t input_dims_count; - uint64_t input_byte_size; - uint32_t input_buffer_count; - - RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy( - in, HostPolicyName().c_str(), &input_name, &input_dtype, &input_shape, - &input_dims_count, &input_byte_size, &input_buffer_count)); - - BackendInputCollector collector( - &request, 1, responses.get(), Model()->TritonMemoryManager(), - false /* pinned_enable */, CudaStream(), nullptr, nullptr, 0, - HostPolicyName().c_str()); - - ModelState* model_state = reinterpret_cast(Model()); - bool cpu_only_tensors = model_state->ForceCPUOnlyInputTensors(); - if (input_dtype == TRITONSERVER_TYPE_BYTES) { - cpu_only_tensors = true; - } - -#ifdef TRITON_ENABLE_GPU - CUDADriverAPI& cuda_driver_api = CUDADriverAPI::getInstance(); - // If CUDA driver API is not available, the input tensors will be moved to - // CPU. - if (!cuda_driver_api.IsAvailable()) { - cpu_only_tensors = true; - } -#endif - - TRITONSERVER_MemoryType src_memory_type; - int64_t src_memory_type_id; - size_t src_byte_size; - const void* src_ptr; - RETURN_IF_ERROR(TRITONBACKEND_InputBuffer( - in, 0 /* input buffer index */, &src_ptr, &src_byte_size, - &src_memory_type, &src_memory_type_id)); - -// If TRITON_ENABLE_GPU is false, we need to copy the tensors -// to the CPU. 
-#ifndef TRITON_ENABLE_GPU - cpu_only_tensors = true; -#endif // TRITON_ENABLE_GPU - - if (cpu_only_tensors || src_memory_type != TRITONSERVER_MEMORY_GPU) { - input_tensor = std::make_unique( - std::string(input_name), - std::vector(input_shape, input_shape + input_dims_count), - input_dtype, TRITONSERVER_MEMORY_CPU /* memory_type */, - 0 /* memory_type_id */, nullptr /* buffer ptr*/, input_byte_size, - nullptr /* DLManagedTensor */); - RETURN_IF_EXCEPTION(input_tensor->SaveToSharedMemory( - shm_pool_, input_tensor_shm, false /* copy_cpu */, - true /* copy_gpu */)); - char* input_buffer = reinterpret_cast(input_tensor->GetDataPtr()); - collector.ProcessTensor( - input_name, input_buffer, input_byte_size, - TRITONSERVER_MEMORY_CPU /* memory_type */, 0 /* memory_type_id */); - } else { -#ifdef TRITON_ENABLE_GPU - // Retreiving GPU input tensors - const void* buffer = nullptr; - std::vector> alloc_perference; - alloc_perference = {{TRITONSERVER_MEMORY_GPU, src_memory_type_id}}; - RETURN_IF_ERROR(collector.ProcessTensor( - input_name, nullptr, 0, alloc_perference, - reinterpret_cast(&buffer), &input_byte_size, - &src_memory_type, &src_memory_type_id)); - input_tensor = std::make_unique( - std::string(input_name), - std::vector(input_shape, input_shape + input_dims_count), - input_dtype, src_memory_type, src_memory_type_id, - const_cast(buffer), input_byte_size, - nullptr /* DLManagedTensor */); - - // If the tensor is using the cuda shared memory, we need to extract the - // handle that was used to create the device pointer. This is because of a - // limitation in the legacy CUDA IPC API that doesn't allow getting the - // handle of an exported pointer. If the cuda handle exists, it indicates - // that the cuda shared memory was used and the input is in a single buffer. - // [FIXME] for the case where the input is in cuda shared memory and uses - // multiple input buffers this needs to be changed. - TRITONSERVER_BufferAttributes* buffer_attributes; - - // This value is not used. - const void* buffer_p; - RETURN_IF_ERROR(TRITONBACKEND_InputBufferAttributes( - in, 0, &buffer_p, &buffer_attributes)); - - cudaIpcMemHandle_t* cuda_ipc_handle; - RETURN_IF_ERROR(TRITONSERVER_BufferAttributesCudaIpcHandle( - buffer_attributes, reinterpret_cast(&cuda_ipc_handle))); - if (cuda_ipc_handle != nullptr) { - RETURN_IF_EXCEPTION(input_tensor->SaveToSharedMemory( - shm_pool_, input_tensor_shm, true /* copy_cpu */, - false /* copy_gpu */)); - RETURN_IF_EXCEPTION(input_tensor->SetCudaIpcMemHandle(cuda_ipc_handle)); - input_tensor->RawDataShm()->offset = input_tensor->GetGPUPointerOffset(); - } else { - RETURN_IF_EXCEPTION(input_tensor->SaveToSharedMemory( - shm_pool_, input_tensor_shm, true /* copy_cpu */, - true /* copy_gpu */)); - } -#else - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "Python backend does not support GPU tensors."); -#endif // TRITON_ENABLE_GPU - } - return nullptr; + // First destroy the IPCControl. This makes sure that IPCControl is + // destroyed before the shared memory manager goes out of scope. + ipc_control_.reset(); + stub_message_queue_.reset(); + parent_message_queue_.reset(); } TRITONSERVER_Error* @@ -1992,8 +2044,7 @@ TRITONBACKEND_ModelInstanceExecute( // If restart is equal to true, it indicates that the stub process is // unhealthy and needs a restart. 
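Whether the batch succeeds, fails, or ends with the stub flagged for restart, the execute entry point below still owns the incoming `TRITONBACKEND_Request` handles and must hand every one of them back to Triton. A minimal sketch of that release obligation using the standard backend API (the helper function name is ours; `LOG_IF_ERROR` is the usual backend utility macro):

```
#include "triton/backend/backend_common.h"
#include "triton/core/tritonbackend.h"

// Minimal sketch: release every request back to Triton once processing ends.
static void
ReleaseAllRequests(TRITONBACKEND_Request** requests, const uint32_t request_count)
{
  for (uint32_t r = 0; r < request_count; ++r) {
    LOG_IF_ERROR(
        TRITONBACKEND_RequestRelease(
            requests[r], TRITONSERVER_REQUEST_RELEASE_ALL),
        "failed releasing request");
  }
}
```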
bool restart = false; - std::atomic cleanup{false}; - instance_state->ProcessRequests(requests, request_count, restart, cleanup); + instance_state->ProcessRequests(requests, request_count, restart); // Wait for all the pending BLS requests to be completed. instance_state->WaitForBLSRequestsToFinish(); @@ -2023,11 +2074,6 @@ TRITONBACKEND_ModelInstanceExecute( "Failed to restart the stub process."); } - // We should return the shared memory offset before returning from this - // function. Otherwise there will be shared memory leaks if there is an - // error when processing the requests - instance_state->ResetSharedMemoryOffset(); - return nullptr; } diff --git a/src/pb_main_utils.cc b/src/request_executor.cc similarity index 88% rename from src/pb_main_utils.cc rename to src/request_executor.cc index dbbfc929..10f94f36 100644 --- a/src/pb_main_utils.cc +++ b/src/request_executor.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -24,7 +24,7 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#include "pb_main_utils.h" +#include "request_executor.h" #include #include "pb_utils.h" @@ -72,7 +72,7 @@ ResponseAlloc( void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type, int64_t* actual_memory_type_id) { - SharedMemory* shm_pool = reinterpret_cast(userp); + SharedMemoryManager* shm_pool = reinterpret_cast(userp); *actual_memory_type = preferred_memory_type; *actual_memory_type_id = preferred_memory_type_id; @@ -90,17 +90,30 @@ ResponseAlloc( case TRITONSERVER_MEMORY_CPU_PINNED: { *actual_memory_type = TRITONSERVER_MEMORY_CPU; *actual_memory_type_id = 0; - off_t tensor_offset; + bi::managed_external_buffer::handle_t tensor_handle; try { - shm_pool->Map((char**)buffer, byte_size, tensor_offset); + AllocatedSharedMemory memory = + shm_pool->Construct(byte_size); + *buffer = memory.data_.get(); + tensor_handle = memory.handle_; + + // Release the ownership to avoid deallocation. The buffer + // will be deallocated in ResponseRelease function. 
+ memory.data_.release(); } catch (const PythonBackendException& pb_exception) { TRITONSERVER_Error* err = CreateTritonErrorFromException(pb_exception); return err; } - // Store the buffer offset in the userp; - *buffer_userp = new off_t(tensor_offset); + // Store the buffer offset in the userp; The userp is large enough to + // hold the shared memory offset and the address of the Shared memory + // manager + AllocationInfo* allocation_info = new AllocationInfo; + *buffer_userp = allocation_info; + + allocation_info->handle_ = tensor_handle; + allocation_info->shm_manager_ = shm_pool; } break; #ifdef TRITON_ENABLE_GPU case TRITONSERVER_MEMORY_GPU: { @@ -110,7 +123,7 @@ ResponseAlloc( return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, std::string( - "unable to recover current CUDA device: " + + "unable to set current CUDA device: " + std::string(cudaGetErrorString(err))) .c_str()); } @@ -141,8 +154,15 @@ ResponseRelease( switch (memory_type) { case TRITONSERVER_MEMORY_CPU: case TRITONSERVER_MEMORY_CPU_PINNED: { - off_t* offset = reinterpret_cast(buffer_userp); - delete offset; + AllocationInfo* allocation_info = + reinterpret_cast(buffer_userp); + { + // Load the data so that it is deallocated automatically. + auto result = allocation_info->shm_manager_->Load( + allocation_info->handle_, true /* unsafe */); + } + + delete allocation_info; } break; case TRITONSERVER_MEMORY_GPU: { #ifdef TRITON_ENABLE_GPU @@ -171,8 +191,8 @@ RequestExecutor::RequestExecutor(TRITONSERVER_Server* server) : server_(server) std::unique_ptr RequestExecutor::Infer( - const std::unique_ptr& infer_request, - const std::unique_ptr& shm_pool, + const std::shared_ptr& infer_request, + const std::unique_ptr& shm_pool, TRITONSERVER_InferenceResponse** triton_response) { std::unique_ptr infer_response; @@ -221,7 +241,7 @@ RequestExecutor::Infer( infer_input->Dims().data(), infer_input->Dims().size())); THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestAppendInputData( - irequest, infer_input->Name().c_str(), infer_input->GetDataPtr(), + irequest, infer_input->Name().c_str(), infer_input->DataPtr(), infer_input->ByteSize(), infer_input->MemoryType(), infer_input->MemoryTypeId())); } diff --git a/src/pb_main_utils.h b/src/request_executor.h similarity index 86% rename from src/pb_main_utils.h rename to src/request_executor.h index a0967f2c..7d988efd 100644 --- a/src/pb_main_utils.h +++ b/src/request_executor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -32,14 +32,20 @@ namespace triton { namespace backend { namespace python { TRITONSERVER_Error* CreateTritonErrorFromException( const PythonBackendException& pb_exception); + +struct AllocationInfo { + bi::managed_external_buffer::handle_t handle_; + SharedMemoryManager* shm_manager_; +}; + class RequestExecutor { TRITONSERVER_ResponseAllocator* response_allocator_ = nullptr; TRITONSERVER_Server* server_; public: std::unique_ptr Infer( - const std::unique_ptr& infer_request, - const std::unique_ptr& shm_pool, + const std::shared_ptr& infer_request, + const std::unique_ptr& shm_pool, TRITONSERVER_InferenceResponse** response); RequestExecutor(TRITONSERVER_Server* server); ~RequestExecutor(); diff --git a/src/scoped_defer.cc b/src/scoped_defer.cc new file mode 100644 index 00000000..9c33bfd2 --- /dev/null +++ b/src/scoped_defer.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "scoped_defer.h" + +namespace triton { namespace backend { namespace python { +ScopedDefer::ScopedDefer(std::function task) +{ + task_ = task; + done_ = false; +} + +void +ScopedDefer::Complete() +{ + if (!done_) { + task_(); + done_ = true; + } +} + +ScopedDefer::~ScopedDefer() +{ + if (!done_) { + task_(); + } +} + +}}}; // namespace triton::backend::python diff --git a/src/scoped_defer.h b/src/scoped_defer.h new file mode 100644 index 00000000..eb52d6b6 --- /dev/null +++ b/src/scoped_defer.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once +#include + +namespace triton { namespace backend { namespace python { +class ScopedDefer { + public: + ScopedDefer(std::function task); + ~ScopedDefer(); + void Complete(); + + private: + std::function task_; + bool done_; +}; + +}}} // namespace triton::backend::python diff --git a/src/shm_manager.cc b/src/shm_manager.cc index b636d099..03ebdb40 100644 --- a/src/shm_manager.cc +++ b/src/shm_manager.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -24,165 +24,132 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#include "shm_manager.h" +#include +#include +#include +#include -#include -#include -#include -#include -#include -#include -#include #include "pb_utils.h" +#include "shm_manager.h" namespace triton { namespace backend { namespace python { -namespace bi = boost::interprocess; - -SharedMemory::SharedMemory( - const std::string& shm_key, int64_t default_byte_size, - int64_t shm_growth_bytes, bool truncate) +SharedMemoryManager::SharedMemoryManager( + const std::string& shm_region_name, size_t shm_size, + size_t shm_growth_bytes, bool create) { - // `truncate` variable indicates whether the shared memory region has been - // created before the other process starts using it or not. 
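The `ScopedDefer` class added above is a small RAII guard: the stored task runs exactly once, either when `Complete()` is called or, if that never happens, in the destructor. A minimal usage sketch follows; the printed message stands in for the real work, such as pushing a dummy message to the stub queue.

```
#include <cstdio>

#include "scoped_defer.h"  // added by this patch

void Example(bool early_exit)
{
  // The deferred task runs exactly once: at Complete() or in the destructor.
  triton::backend::python::ScopedDefer cleanup(
      [] { std::puts("notify the stub / release the shared object"); });

  if (early_exit) {
    return;  // the destructor fires the deferred task here
  }

  cleanup.Complete();  // fire it now; the destructor then does nothing
}
```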
- if (truncate) { - shm_obj_ = bi::shared_memory_object( - bi::open_or_create, shm_key.c_str(), bi::read_write); - } else { - shm_obj_ = bi::shared_memory_object( - bi::open_only, shm_key.c_str(), bi::read_write); - } - + shm_region_name_ = shm_region_name; + create_ = create; shm_growth_bytes_ = shm_growth_bytes; + try { - shm_obj_.truncate(default_byte_size); + if (create) { + shm_obj_ = std::make_unique( + bi::open_or_create, shm_region_name.c_str(), bi::read_write); + shm_obj_->truncate(shm_size); + } else { + shm_obj_ = std::make_unique( + bi::open_only, shm_region_name.c_str(), bi::read_write); + } + + current_capacity_ = shm_size; + shm_map_ = std::make_shared(*shm_obj_, bi::read_write); + old_shm_maps_.push_back(shm_map_); + + // Only create the managed external buffer for the stub process. + if (create) { + managed_buffer_ = std::make_unique( + bi::create_only, shm_map_->get_address(), shm_size); + } else { + int64_t shm_size = 0; + shm_obj_->get_size(shm_size); + managed_buffer_ = std::make_unique( + bi::open_only, shm_map_->get_address(), shm_size); + current_capacity_ = shm_size; + } } catch (bi::interprocess_exception& ex) { std::string error_message = - ("Unable to initialize shared memory key '" + shm_key + - "' to requested size (" + std::to_string(default_byte_size) + + ("Unable to initialize shared memory key '" + shm_region_name + + "' to requested size (" + std::to_string(shm_size) + " bytes). If you are running Triton inside docker, use '--shm-size' " "flag to control the shared memory region size. Each Python backend " "model instance requires at least 64MBs of shared memory. Error: " + ex.what()); - // Remove the shared memory region if there was an error. - bi::shared_memory_object::remove(shm_key_.c_str()); + bi::shared_memory_object::remove(shm_region_name.c_str()); throw PythonBackendException(std::move(error_message)); } - shm_map_ = std::make_unique(shm_obj_, bi::read_write); - shm_addr_ = (char*)shm_map_->get_address(); - - capacity_ = (size_t*)shm_addr_; - *capacity_ = default_byte_size; - current_capacity_ = *capacity_; - - if (truncate) { - // Create the shared memory mutex. - shm_mutex_ = new ((char*)shm_addr_ + sizeof(size_t)) bi::interprocess_mutex; - } else { - // If the shared memory is already created, just fix the pointer. - char* shm_mutex = (char*)shm_addr_ + sizeof(size_t); - shm_mutex_ = reinterpret_cast(shm_mutex); - } - - // Set offset address - offset_ = - (off_t*)((char*)shm_addr_ + sizeof(size_t) + sizeof(bi::interprocess_mutex)); - - if (truncate) { - *offset_ = 0; - *offset_ += sizeof(off_t); - *offset_ += sizeof(size_t); - *offset_ += sizeof(bi::interprocess_mutex); + // Construct a mutex in shared memory. + shm_mutex_ = + managed_buffer_->find_or_construct("shm_mutex")(); + total_size_ = managed_buffer_->find_or_construct("total size")(); + if (create) { + *total_size_ = current_capacity_; + new (shm_mutex_) bi::interprocess_mutex; } - - shm_key_ = shm_key; -} - -SharedMemory::~SharedMemory() noexcept(false) -{ - bi::shared_memory_object::remove(shm_key_.c_str()); } void -SharedMemory::Map(char** shm_addr, size_t byte_size, off_t& offset) +SharedMemoryManager::GrowIfNeeded(uint64_t byte_size) { - bi::scoped_lock gaurd{*shm_mutex_}; - - size_t shm_bytes_added = 0; - while (*offset_ + byte_size >= *capacity_) { - // Increase the shared memory pool size by the amount of bytes available. 
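The new `GrowIfNeeded()` above rounds a failed allocation up to the next whole multiple of the configured growth step before truncating and remapping the region, so the pool always grows by at least one full increment. A small worked example of that arithmetic (the 1 MiB step and 2.5 MiB request are assumed values):

```
#include <cstdint>

// Illustrative arithmetic only: a failed 2.5 MiB allocation against a 1 MiB
// growth step is rounded up to three whole increments.
constexpr uint64_t kGrowthBytes = uint64_t{1} << 20;            // shm_growth_bytes_
constexpr uint64_t kRequestedBytes = 5u * (uint64_t{1} << 19);  // 2.5 MiB
constexpr uint64_t kBytesAdded =
    kGrowthBytes * (kRequestedBytes / kGrowthBytes + 1);
static_assert(kBytesAdded == 3 * kGrowthBytes, "grows by whole increments");
```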
- *capacity_ += shm_growth_bytes_; - shm_bytes_added += shm_growth_bytes_; + if (*total_size_ != current_capacity_) { + shm_map_ = std::make_shared(*shm_obj_, bi::read_write); + managed_buffer_ = std::make_unique( + bi::open_only, shm_map_->get_address(), *total_size_); + old_shm_maps_.push_back(shm_map_); + current_capacity_ = *total_size_; } - if (shm_bytes_added > 0) { + if (byte_size != 0) { + uint64_t bytes_to_be_added = + shm_growth_bytes_ * (byte_size / shm_growth_bytes_ + 1); + uint64_t new_size = *total_size_ + bytes_to_be_added; try { - shm_obj_.truncate(*capacity_); + shm_obj_->truncate(new_size); } catch (bi::interprocess_exception& ex) { - *capacity_ -= shm_bytes_added; std::string error_message = ("Failed to increase the shared memory pool size for key '" + - shm_key_ + "' to " + std::to_string(*capacity_) + + shm_region_name_ + "' to " + std::to_string(*total_size_) + " bytes. If you are running Triton inside docker, use '--shm-size' " "flag to control the shared memory region size. Error: " + ex.what()); throw PythonBackendException(error_message); } - } - - UpdateSharedMemory(); - - *shm_addr = shm_addr_ + *offset_; - offset = *offset_; - *offset_ += byte_size; -} -void -SharedMemory::UpdateSharedMemory() -{ - if (current_capacity_ != *capacity_) { - std::unique_ptr new_map; try { - new_map = std::make_unique(shm_obj_, bi::read_write); + shm_obj_->truncate(new_size); + shm_map_ = std::make_shared(*shm_obj_, bi::read_write); + old_shm_maps_.push_back(shm_map_); + managed_buffer_ = std::make_unique( + bi::open_only, shm_map_->get_address(), new_size); + managed_buffer_->grow(new_size - current_capacity_); + current_capacity_ = managed_buffer_->get_size(); + *total_size_ = new_size; } catch (bi::interprocess_exception& ex) { - std::string error_message = std::string( - "unable to process address space or " - "shared-memory descriptor, err:") + - ex.what(); + shm_obj_->truncate(*total_size_); + std::string error_message = + ("Failed to create new mapped region for the grown shared memory " + "region '" + + shm_region_name_ + "'. " + ex.what()); throw PythonBackendException(error_message); } - - old_shm_maps_.emplace_back(std::move(shm_map_)); - current_capacity_ = *capacity_; - shm_map_ = std::move(new_map); - shm_addr_ = (char*)shm_map_->get_address(); } } -void -SharedMemory::MapOffset(char** shm_addr, off_t offset) +size_t +SharedMemoryManager::FreeMemory() { - bi::scoped_lock gaurd(*shm_mutex_); - // Update shared memory pointer and capacity if necessary. - UpdateSharedMemory(); - *shm_addr = shm_addr_ + offset; + return managed_buffer_->get_free_memory(); } -void -SharedMemory::SetOffset(off_t offset) -{ - *offset_ = offset; -} -off_t -SharedMemory::Offset() +SharedMemoryManager::~SharedMemoryManager() noexcept(false) { - return *offset_; + bi::shared_memory_object::remove(shm_region_name_.c_str()); } }}} // namespace triton::backend::python diff --git a/src/shm_manager.h b/src/shm_manager.h index 5e46bfc6..e990cbf5 100644 --- a/src/shm_manager.h +++ b/src/shm_manager.h @@ -1,4 +1,4 @@ -// Copyright 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,49 +26,171 @@ #pragma once -#include -#include -#include -#include +#include +#include +#include +#include +#include #include -#include -#include +#include +#include #include - +#include "pb_exception.h" namespace triton { namespace backend { namespace python { +namespace bi = boost::interprocess; -class SharedMemory { - std::string shm_key_; - size_t* capacity_; - off_t* offset_; - char* shm_addr_; - boost::interprocess::interprocess_mutex* shm_mutex_; +template +struct AllocatedSharedMemory { + AllocatedSharedMemory() = default; + AllocatedSharedMemory( + std::unique_ptr>& data, + bi::managed_external_buffer::handle_t handle) + : data_(std::move(data)), handle_(handle) + { + } - // Current capcity, local to each process. - size_t current_capacity_; + std::unique_ptr> data_; + bi::managed_external_buffer::handle_t handle_; +}; - // Amount of bytes to grow the shared memory when the pool is completely used. - int64_t shm_growth_bytes_; +struct AllocatedShmOwnership { + uint32_t ref_count_; +}; - // Get the amount of shared memory available. - size_t GetAvailableSharedMemory(); - boost::interprocess::shared_memory_object shm_obj_; - std::unique_ptr shm_map_; - std::vector> - old_shm_maps_; +class SharedMemoryManager { + public: + SharedMemoryManager( + const std::string& shm_region_name, size_t shm_size, + size_t shm_growth_bytes, bool create); - void UpdateSharedMemory(); + template + AllocatedSharedMemory Construct(uint64_t count = 1, bool aligned = false) + { + T* obj = nullptr; + AllocatedShmOwnership* shm_ownership_data = nullptr; + bi::managed_external_buffer::handle_t handle = 0; - public: - SharedMemory( - const std::string& shm_key, int64_t default_byte_size, - int64_t shm_growth_bytes, bool truncate = false); - void MapOffset(char** shm_addr, off_t offset); - void Map(char** shm_addr, size_t byte_size, off_t& offset); - off_t Offset(); - void SetOffset(off_t offset); - ~SharedMemory() noexcept(false); -}; + { + bi::scoped_lock gaurd{*shm_mutex_}; + std::size_t requested_bytes = + sizeof(T) * count + sizeof(AllocatedShmOwnership); + GrowIfNeeded(0); + + void* allocated_data; + try { + allocated_data = Allocate(requested_bytes, aligned); + } + catch (bi::bad_alloc& ex) { + // Try to grow the shared memory region if the allocate failed. 
+ GrowIfNeeded(requested_bytes); + allocated_data = Allocate(requested_bytes, aligned); + } + + shm_ownership_data = + reinterpret_cast(allocated_data); + obj = reinterpret_cast( + (reinterpret_cast(shm_ownership_data)) + + sizeof(AllocatedShmOwnership)); + shm_ownership_data->ref_count_ = 1; + + handle = managed_buffer_->get_handle_from_address( + reinterpret_cast(shm_ownership_data)); + } + + return WrapObjectInUniquePtr(obj, shm_ownership_data, handle); + } + + template + AllocatedSharedMemory Load( + bi::managed_external_buffer::handle_t handle, bool unsafe = false) + { + T* object_ptr; + AllocatedShmOwnership* shm_ownership_data; + + { + bi::scoped_lock gaurd{*shm_mutex_}; + GrowIfNeeded(0); + shm_ownership_data = reinterpret_cast( + managed_buffer_->get_address_from_handle(handle)); + object_ptr = reinterpret_cast( + reinterpret_cast(shm_ownership_data) + + sizeof(AllocatedShmOwnership)); + if (!unsafe) { + shm_ownership_data->ref_count_ += 1; + } + } + return WrapObjectInUniquePtr(object_ptr, shm_ownership_data, handle); + } + + size_t FreeMemory(); + + void Deallocate(bi::managed_external_buffer::handle_t handle) + { + bi::scoped_lock gaurd{*shm_mutex_}; + GrowIfNeeded(0); + void* ptr = managed_buffer_->get_address_from_handle(handle); + managed_buffer_->deallocate(ptr); + } + + void DeallocateUnsafe(bi::managed_external_buffer::handle_t handle) + { + void* ptr = managed_buffer_->get_address_from_handle(handle); + managed_buffer_->deallocate(ptr); + } + + void GrowIfNeeded(uint64_t bytes); + bi::interprocess_mutex* Mutex() { return shm_mutex_; } + + ~SharedMemoryManager() noexcept(false); + + private: + std::string shm_region_name_; + std::unique_ptr managed_buffer_; + std::unique_ptr shm_obj_; + std::shared_ptr shm_map_; + std::vector> old_shm_maps_; + uint64_t current_capacity_; + bi::interprocess_mutex* shm_mutex_; + size_t shm_growth_bytes_; + uint64_t* total_size_; + bool create_; + + template + AllocatedSharedMemory WrapObjectInUniquePtr( + T* object, AllocatedShmOwnership* shm_ownership_data, + const bi::managed_external_buffer::handle_t& handle) + { + // Custom deleter to conditionally deallocate the object + std::function deleter = [this, handle, + shm_ownership_data](T* memory) { + bool destroy = false; + bi::scoped_lock gaurd{*shm_mutex_}; + shm_ownership_data->ref_count_ -= 1; + if (shm_ownership_data->ref_count_ == 0) { + destroy = true; + } + if (destroy) { + DeallocateUnsafe(handle); + } + }; + + auto data = std::unique_ptr(object, deleter); + return AllocatedSharedMemory(data, handle); + } + + void* Allocate(uint64_t requested_bytes, bool aligned) + { + void* ptr; + if (aligned) { + const std::size_t alignment = 32; + ptr = managed_buffer_->allocate_aligned(requested_bytes, alignment); + } else { + ptr = managed_buffer_->allocate(requested_bytes); + } + + return ptr; + } +}; }}} // namespace triton::backend::python From 7be41725ad502e2b95cb2ae89e0fb711c4c843f0 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Fri, 25 Mar 2022 12:31:24 -0400 Subject: [PATCH 018/216] Make sure there are no shared memory leaks after restart (#129) --- src/pb_stub.cc | 5 +++ src/python.cc | 83 ++++++++++++++++++++++++++------------------------ 2 files changed, 49 insertions(+), 39 deletions(-) diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 9d6f3450..c8dae887 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -417,6 +417,7 @@ Stub::Initialize(bi::managed_external_buffer::handle_t map_handle) py::setattr( python_backend_utils, "InferenceResponse", 
c_python_backend_utils.attr("InferenceResponse")); + c_python_backend_utils.attr("shared_memory") = py::cast(shm_pool_.get()); py::object TritonPythonModel = py::module_::import( @@ -752,6 +753,10 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) .def("has_error", &InferResponse::HasError) .def("error", &InferResponse::Error); + // This class is not part of the public API for Python backend. This is only + // used for internal testing purposes. + py::class_(module, "SharedMemory") + .def("free_memory", &SharedMemoryManager::FreeMemory); py::register_exception( module, "TritonModelException"); diff --git a/src/python.cc b/src/python.cc index fbec5b67..a4c5cb1d 100644 --- a/src/python.cc +++ b/src/python.cc @@ -518,18 +518,53 @@ ModelInstanceState::IsStubProcessAlive() TRITONSERVER_Error* ModelInstanceState::StartStubProcess() { + // Destruct any in-use shared memory object before starting the stub process. + ipc_control_ = nullptr; + stub_message_queue_ = nullptr; + parent_message_queue_ = nullptr; + ModelState* model_state = reinterpret_cast(Model()); + int64_t shm_default_size = + model_state->StateForBackend()->shm_default_byte_size; + int64_t shm_growth_byte_size = + model_state->StateForBackend()->shm_growth_byte_size; + + try { + // It is necessary for restart to make sure that the previous shared memory + // pool is destructed before the new pool is created. + shm_pool_ = nullptr; + shm_pool_ = std::make_unique( + shm_region_name_, shm_default_size, shm_growth_byte_size, + true /* create */); + } + catch (const PythonBackendException& pb_exception) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); + } + + AllocatedSharedMemory ipc_control = + shm_pool_->Construct(); + ipc_control_ = std::move(ipc_control.data_); + ipc_control_handle_ = ipc_control.handle_; + + auto message_queue_size = + model_state->StateForBackend()->shm_message_queue_size; + + RETURN_IF_EXCEPTION( + stub_message_queue_ = + MessageQueue::Create(shm_pool_, message_queue_size)); + RETURN_IF_EXCEPTION( + parent_message_queue_ = + MessageQueue::Create(shm_pool_, message_queue_size)); + ipc_control_->parent_message_queue = parent_message_queue_->ShmHandle(); + ipc_control_->stub_message_queue = stub_message_queue_->ShmHandle(); + new (&(ipc_control_->stub_health_mutex)) bi::interprocess_mutex; health_mutex_ = &(ipc_control_->stub_health_mutex); + stub_message_queue_->ResetSemaphores(); parent_message_queue_->ResetSemaphores(); std::string kind = TRITONSERVER_InstanceGroupKindString(kind_); - - ModelState* model_state = reinterpret_cast(Model()); - int64_t shm_growth_size = - model_state->StateForBackend()->shm_growth_byte_size; - int64_t shm_default_size = - model_state->StateForBackend()->shm_default_byte_size; const char* model_path = model_state->RepositoryPath().c_str(); initialized_ = false; @@ -575,7 +610,7 @@ ModelInstanceState::StartStubProcess() << " && exec env LD_LIBRARY_PATH=" << path_to_libpython_ << ":$LD_LIBRARY_PATH " << python_backend_stub << " " << model_path_ << " " << shm_region_name_ << " " << shm_default_size << " " - << shm_growth_size << " " << parent_pid_ << " " + << shm_growth_byte_size << " " << parent_pid_ << " " << model_state->StateForBackend()->python_lib << " " << ipc_control_handle_ << " " << Name(); ipc_control_->uses_env = true; @@ -584,7 +619,7 @@ ModelInstanceState::StartStubProcess() std::stringstream ss; ss << " exec " << python_backend_stub << " " << model_path_ << " " << shm_region_name_ << " " << shm_default_size << " " - << 
shm_growth_size << " " << parent_pid_ << " " + << shm_growth_byte_size << " " << parent_pid_ << " " << model_state->StateForBackend()->python_lib << " " << ipc_control_handle_ << " " << Name(); bash_argument = ss.str(); @@ -623,7 +658,7 @@ ModelInstanceState::StartStubProcess() << "Python backend stub path: " << python_backend_stub << '\n' << "Shared Memory Region Name: " << shm_region_name_ << '\n' << "Shared Memory Default Byte Size: " << shm_default_size << '\n' - << "Shared Memory Growth Byte Size: " << shm_growth_size << '\n'; + << "Shared Memory Growth Byte Size: " << shm_growth_byte_size << '\n'; std::string log_message = ss.str(); LOG_MESSAGE(TRITONSERVER_LOG_ERROR, log_message.c_str()); @@ -717,25 +752,6 @@ ModelInstanceState::SetupStubProcess() shm_region_name_ = model_state->StateForBackend()->shared_memory_region_prefix + std::to_string(model_state->StateForBackend()->number_of_instance_inits); - int64_t shm_default_size = - model_state->StateForBackend()->shm_default_byte_size; - int64_t shm_growth_byte_size = - model_state->StateForBackend()->shm_growth_byte_size; - - try { - shm_pool_ = std::make_unique( - shm_region_name_, shm_default_size, shm_growth_byte_size, - true /* create */); - } - catch (const PythonBackendException& pb_exception) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); - } - - AllocatedSharedMemory ipc_control = - shm_pool_->Construct(); - ipc_control_ = std::move(ipc_control.data_); - ipc_control_handle_ = ipc_control.handle_; uint64_t model_version = model_state->Version(); const char* model_path = model_state->RepositoryPath().c_str(); @@ -791,17 +807,6 @@ ModelInstanceState::SetupStubProcess() parent_pid_ = getpid(); - auto message_queue_size = - model_state->StateForBackend()->shm_message_queue_size; - - RETURN_IF_EXCEPTION( - stub_message_queue_ = - MessageQueue::Create(shm_pool_, message_queue_size)); - RETURN_IF_EXCEPTION( - parent_message_queue_ = - MessageQueue::Create(shm_pool_, message_queue_size)); - ipc_control_->parent_message_queue = parent_message_queue_->ShmHandle(); - ipc_control_->stub_message_queue = stub_message_queue_->ShmHandle(); RETURN_IF_ERROR(StartStubProcess()); From 4e3c580932c41b9f682f254d6ffb7484a1c14793 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Thu, 31 Mar 2022 23:13:53 -0400 Subject: [PATCH 019/216] Add shared memory monitor (#130) * Add shared memory monitor * Make sure there are no pending shared memory operations --- CMakeLists.txt | 2 ++ src/infer_request.cc | 4 +-- src/pb_stub.cc | 19 +++++++------ src/python.cc | 19 +++++++------ src/shm_manager.cc | 32 ++++++++++++++++++++-- src/shm_manager.h | 3 +++ src/shm_monitor/CMakeLists.txt | 49 ++++++++++++++++++++++++++++++++++ src/shm_monitor/shm_monitor.cc | 40 +++++++++++++++++++++++++++ 8 files changed, 146 insertions(+), 22 deletions(-) create mode 100644 src/shm_monitor/CMakeLists.txt create mode 100644 src/shm_monitor/shm_monitor.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 157f27bc..fe09828e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -251,6 +251,8 @@ set_target_properties( LINK_FLAGS "-Wl,--version-script libtriton_python.ldscript" ) +add_subdirectory(./src/shm_monitor) + # # Install # diff --git a/src/infer_request.cc b/src/infer_request.cc index 7db7f5bf..d31028e2 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -292,11 +292,11 @@ InferRequest::Exec() std::unique_ptr ipc_message; AllocatedSharedMemory request_batch; - ScopedDefer data_load_complete(std::bind([&ipc_message] { + 
ScopedDefer data_load_complete([&ipc_message] { bi::scoped_lock lock{ *(ipc_message->ResponseMutex())}; ipc_message->ResponseCondition()->notify_all(); - })); + }); try { py::gil_scoped_release release; diff --git a/src/pb_stub.cc b/src/pb_stub.cc index c8dae887..9fedd517 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -248,10 +248,10 @@ Stub::RunCommand() // parent process sends a message to the stub process asking the stub // process to release any objects it has held in shared memory. ScopedDefer receive_initialize_finalize( - std::bind([this] { stub_message_queue_->Pop(); })); - ScopedDefer _(std::bind([this, &initialize_response_msg] { + [this] { stub_message_queue_->Pop(); }); + ScopedDefer _([this, &initialize_response_msg] { SendIPCMessage(initialize_response_msg); - })); + }); initialize_response.data_->response_has_error = false; initialize_response.data_->response_is_error_set = false; @@ -314,10 +314,9 @@ Stub::RunCommand() execute_response->Args() = response_batch.handle_; - ScopedDefer execute_finalize( - std::bind([this] { stub_message_queue_->Pop(); })); - ScopedDefer _(std::bind( - [this, &execute_response] { SendIPCMessage(execute_response); })); + ScopedDefer execute_finalize([this] { stub_message_queue_->Pop(); }); + ScopedDefer _( + [this, &execute_response] { SendIPCMessage(execute_response); }); response_batch_shm_ptr->has_error = false; response_batch_shm_ptr->is_error_set = false; @@ -498,14 +497,14 @@ Stub::LoadGPUBuffers(std::unique_ptr& ipc_message) // Pop a dummy message from the stub message queue indicating that the parent // has finished copying the tensors. - ScopedDefer _(std::bind([this, has_cpu_buffer] { + ScopedDefer _([this, has_cpu_buffer] { if (has_cpu_buffer) { stub_message_queue_->Pop(); } - })); + }); ScopedDefer load_gpu_buffer_response( - std::bind([this, has_cpu_buffer] { parent_message_queue_->Push(1000); })); + [this, has_cpu_buffer] { parent_message_queue_->Push(1000); }); for (size_t i = 0; i < gpu_tensors_.size(); i++) { std::shared_ptr& src_buffer = gpu_tensors_[i]; diff --git a/src/python.cc b/src/python.cc index a4c5cb1d..048c4a54 100644 --- a/src/python.cc +++ b/src/python.cc @@ -669,12 +669,12 @@ ModelInstanceState::StartStubProcess() } } else { - ScopedDefer _(std::bind([this] { + ScopedDefer _([this] { // Push a dummy message to the message queue so that the stub // process is notified that it can release the object stored in // shared memory. stub_message_queue_->Push(1000); - })); + }); stub_pid_ = pid; triton::common::TritonJson::WriteBuffer buffer; @@ -1154,6 +1154,12 @@ ModelInstanceState::ProcessRequests( } } + // Wait for all the pending BLS requests to be completed. + ScopedDefer bls_defer([this] { + WaitForBLSRequestsToFinish(); + CleanupBLSResponses(); + }); + for (size_t i = 0; i < request_count; i++) { if (max_batch_size > 0) { // Retrieve the batch size from one of the inputs, if the model @@ -1297,14 +1303,14 @@ ModelInstanceState::ProcessRequests( } - ScopedDefer execute_finalize(std::bind([this, &restart] { + ScopedDefer execute_finalize([this, &restart] { // Push a dummy message to the message queue so that // the stub process is notified that it can release // the object stored in shared memory. 
NVTX_RANGE(nvtx_, "RequestExecuteFinalize " + Name()); if (!restart) stub_message_queue_->Push(1000); - })); + }); if (restart) { return; } @@ -1609,6 +1615,7 @@ ModelInstanceState::ProcessRequests( #endif // TRITON_ENABLE_GPU } + bls_defer.Complete(); for (uint32_t r = 0; r < request_count; ++r) { // If error happens at this stage, we can only log it GUARDED_RESPOND_IF_ERROR( @@ -2051,10 +2058,6 @@ TRITONBACKEND_ModelInstanceExecute( bool restart = false; instance_state->ProcessRequests(requests, request_count, restart); - // Wait for all the pending BLS requests to be completed. - instance_state->WaitForBLSRequestsToFinish(); - instance_state->CleanupBLSResponses(); - for (uint32_t r = 0; r < request_count; ++r) { TRITONBACKEND_Request* request = requests[r]; LOG_IF_ERROR( diff --git a/src/shm_manager.cc b/src/shm_manager.cc index 03ebdb40..4cc5d98d 100644 --- a/src/shm_manager.cc +++ b/src/shm_manager.cc @@ -29,7 +29,6 @@ #include #include -#include "pb_utils.h" #include "shm_manager.h" namespace triton { namespace backend { namespace python { @@ -85,12 +84,38 @@ SharedMemoryManager::SharedMemoryManager( shm_mutex_ = managed_buffer_->find_or_construct("shm_mutex")(); total_size_ = managed_buffer_->find_or_construct("total size")(); + read_only_ = false; if (create) { *total_size_ = current_capacity_; new (shm_mutex_) bi::interprocess_mutex; } } +SharedMemoryManager::SharedMemoryManager(const std::string& shm_region_name) +{ + shm_region_name_ = shm_region_name; + create_ = false; + shm_growth_bytes_ = 1024; + + shm_obj_ = std::make_unique( + bi::open_only, shm_region_name.c_str(), bi::read_write); + + shm_map_ = std::make_shared(*shm_obj_, bi::read_write); + old_shm_maps_.push_back(shm_map_); + + int64_t shm_size = 0; + shm_obj_->get_size(shm_size); + managed_buffer_ = std::make_unique( + bi::open_only, shm_map_->get_address(), shm_size); + current_capacity_ = shm_size; + + // Construct a mutex in shared memory. + shm_mutex_ = + managed_buffer_->find_or_construct("shm_mutex")(); + total_size_ = managed_buffer_->find_or_construct("total size")(); + read_only_ = true; +} + void SharedMemoryManager::GrowIfNeeded(uint64_t byte_size) { @@ -143,13 +168,16 @@ SharedMemoryManager::GrowIfNeeded(uint64_t byte_size) size_t SharedMemoryManager::FreeMemory() { + GrowIfNeeded(0); return managed_buffer_->get_free_memory(); } SharedMemoryManager::~SharedMemoryManager() noexcept(false) { - bi::shared_memory_object::remove(shm_region_name_.c_str()); + if (!read_only_) { + bi::shared_memory_object::remove(shm_region_name_.c_str()); + } } }}} // namespace triton::backend::python diff --git a/src/shm_manager.h b/src/shm_manager.h index e990cbf5..93072c07 100644 --- a/src/shm_manager.h +++ b/src/shm_manager.h @@ -64,6 +64,8 @@ class SharedMemoryManager { const std::string& shm_region_name, size_t shm_size, size_t shm_growth_bytes, bool create); + SharedMemoryManager(const std::string& shm_region_name); + template AllocatedSharedMemory Construct(uint64_t count = 1, bool aligned = false) { @@ -156,6 +158,7 @@ class SharedMemoryManager { size_t shm_growth_bytes_; uint64_t* total_size_; bool create_; + bool read_only_; template AllocatedSharedMemory WrapObjectInUniquePtr( diff --git a/src/shm_monitor/CMakeLists.txt b/src/shm_monitor/CMakeLists.txt new file mode 100644 index 00000000..0f7d4b86 --- /dev/null +++ b/src/shm_monitor/CMakeLists.txt @@ -0,0 +1,49 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cmake_minimum_required (VERSION 3.18) + +pybind11_add_module( + triton-shm-monitor + EXCLUDE_FROM_ALL + ./shm_monitor.cc + ../shm_manager.h + ../shm_manager.cc +) + +target_link_libraries( + triton-shm-monitor + PRIVATE + -lrt # shared memory +) + +set_property(TARGET triton-shm-monitor PROPERTY OUTPUT_NAME triton_shm_monitor) + +install( + TARGETS + triton-shm-monitor + LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/python OPTIONAL +) diff --git a/src/shm_monitor/shm_monitor.cc b/src/shm_monitor/shm_monitor.cc new file mode 100644 index 00000000..dfeb1fbb --- /dev/null +++ b/src/shm_monitor/shm_monitor.cc @@ -0,0 +1,40 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include +#include "../shm_manager.h" + +namespace triton { namespace backend { namespace python { +namespace py = pybind11; + +PYBIND11_MODULE(triton_shm_monitor, m) +{ + py::class_(m, "SharedMemoryManager") + .def(py::init()) + .def("free_memory", &SharedMemoryManager::FreeMemory); +} + +}}} // namespace triton::backend::python From 18cade6cc49813bccb5c51964481ec53765ac699 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Wed, 6 Apr 2022 09:42:46 -0400 Subject: [PATCH 020/216] Fix message queue on ARM (#132) * Fix message queue on ARM * Add comment --- src/message_queue.h | 6 +++--- src/shm_manager.h | 9 ++++++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/message_queue.h b/src/message_queue.h index 19811a55..140dbe9d 100644 --- a/src/message_queue.h +++ b/src/message_queue.h @@ -42,13 +42,13 @@ namespace bi = boost::interprocess; /// \param sem_empty Semaphore object counting the number of empty buffer slots. /// \param sem_full Semaphore object counting the number of used buffer slots. struct MessageQueueShm { + bi::interprocess_semaphore sem_empty{0}; + bi::interprocess_semaphore sem_full{0}; + bi::interprocess_mutex mutex; std::size_t size; bi::managed_external_buffer::handle_t buffer; - bi::interprocess_mutex mutex; int head; int tail; - bi::interprocess_semaphore sem_empty{0}; - bi::interprocess_semaphore sem_full{0}; }; class MessageQueue { diff --git a/src/shm_manager.h b/src/shm_manager.h index 93072c07..332f7ee3 100644 --- a/src/shm_manager.h +++ b/src/shm_manager.h @@ -54,9 +54,16 @@ struct AllocatedSharedMemory { bi::managed_external_buffer::handle_t handle_; }; +// The alignment here is used to extend the size of the shared memory allocation +// struct to 16 bytes. The reason for this change is that when an aligned shared +// memory location is requested using the `Construct` method, the memory +// alignment of the object will be incorrect since the shared memory ownership +// info is placed in the beginning and the actual object is placed after that +// (i.e. 4 plus the aligned address is not 16-bytes aligned). The aligned memory +// is required by semaphore otherwise it may lead to SIGBUS error on ARM. 
struct AllocatedShmOwnership { uint32_t ref_count_; -}; +} __attribute__((aligned(16))); class SharedMemoryManager { public: From 0e6c78a225f1dd80e4163fee10940e44c65d1003 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Thu, 7 Apr 2022 13:50:45 -0400 Subject: [PATCH 021/216] Improve the lifetime of BLS output tensors (#131) * Improve the lifetime of BLS output tensors * Add a note about the improved lifecycle of BLS output tensors * Fix stub shutdown if there is an error during initialization * Fix CPU only build * Fix memory manager destruction --- CMakeLists.txt | 3 +- README.md | 12 +- src/infer_request.cc | 11 ++ src/memory_manager.cc | 119 ++++++++++++++++++++ src/memory_manager.h | 81 ++++++++++++++ src/message_queue.cc | 243 ---------------------------------------- src/message_queue.h | 200 +++++++++++++++++++++++++++++---- src/pb_memory.cc | 27 +++++ src/pb_memory.h | 11 ++ src/pb_stub.cc | 28 +++-- src/pb_stub.h | 11 +- src/pb_tensor.h | 1 + src/pb_utils.h | 1 + src/python.cc | 159 ++++++++++++++++---------- src/request_executor.cc | 11 +- src/shm_manager.cc | 12 +- src/shm_manager.h | 4 +- 17 files changed, 581 insertions(+), 353 deletions(-) create mode 100644 src/memory_manager.cc create mode 100644 src/memory_manager.h delete mode 100644 src/message_queue.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index fe09828e..2ee13244 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -132,7 +132,6 @@ set( src/infer_response.h src/infer_request.cc src/infer_request.h - src/message_queue.cc src/message_queue.h src/ipc_message.cc src/ipc_message.h @@ -162,6 +161,8 @@ set( src/pb_env.h src/pb_metric_reporter.cc src/pb_metric_reporter.h + src/memory_manager.cc + src/memory_manager.h src/request_executor.cc src/request_executor.h ) diff --git a/README.md b/README.md index 4fa1a816..6a48906e 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ + +# Decoupled Model Examples + +In this section we demonstrate an end-to-end examples for developing and +serving [decoupled models](../../README.md#decoupled-mode-beta) in Python backend. + +[repeat_model.py](repeat_model.py) and [square_model.py](square_model.py) demonstrate +how to write a decoupled model where each request can generate 0 to many responses. +These files are heavily commented to describe each fuinction call. +These example models are designed to show the flexibility available to decoupled models +and in no way should be used in production. These examples circumvents +the restriction placed by the [instance count](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#instance-groups) +and allows multiple requests to be in process even for single instance. In +real deployment, the model should not allow the caller thread to return from +`execute` until that instance is ready to handle another set of requests. + +## Deploying the Decoupled Models + +1. Create the model repository: + +```console +$ mkdir -p models/repeat_int32/1 +$ mkdir -p models/square_int32/1 + +# Copy the Python models +$ cp examples/decoupled/repeat_model.py models/repeat_int32/1/model.py +$ cp examples/decoupled/repeat_config.pbtxt models/repeat_int32/config.pbtxt +$ cp examples/decoupled/square_model.py models/square_int32/1/model.py +$ cp examples/decoupled/square_config.pbtxt models/square_int32/config.pbtxt +``` + +2. 
Start the tritonserver: + +``` +tritonserver --model-repository `pwd`/models +``` + +## Running inference on Repeat model: + +Send inference requests to repeat model using [repeat_client.py](repeat_client.py). + +``` +python3 examples/decoupled/repeat_client.py +``` + +You should see an output similar to the output below: + +``` +stream started... +async_stream_infer +model_name: "repeat_int32" +id: "0" +inputs { + name: "IN" + datatype: "INT32" + shape: 4 +} +inputs { + name: "DELAY" + datatype: "UINT32" + shape: 4 +} +inputs { + name: "WAIT" + datatype: "UINT32" + shape: 1 +} +outputs { + name: "OUT" +} +outputs { + name: "IDX" +} +raw_input_contents: "\004\000\000\000\002\000\000\000\000\000\000\000\001\000\000\000" +raw_input_contents: "\001\000\000\000\002\000\000\000\003\000\000\000\004\000\000\000" +raw_input_contents: "\005\000\000\000" + +enqueued request 0 to stream... +infer_response { + model_name: "repeat_int32" + model_version: "1" + id: "0" + outputs { + name: "IDX" + datatype: "UINT32" + shape: 1 + } + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\000\000\000\000" + raw_output_contents: "\004\000\000\000" +} + +infer_response { + model_name: "repeat_int32" + model_version: "1" + id: "0" + outputs { + name: "IDX" + datatype: "UINT32" + shape: 1 + } + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\001\000\000\000" + raw_output_contents: "\002\000\000\000" +} + +infer_response { + model_name: "repeat_int32" + model_version: "1" + id: "0" + outputs { + name: "IDX" + datatype: "UINT32" + shape: 1 + } + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\002\000\000\000" + raw_output_contents: "\000\000\000\000" +} + +infer_response { + model_name: "repeat_int32" + model_version: "1" + id: "0" + outputs { + name: "IDX" + datatype: "UINT32" + shape: 1 + } + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\003\000\000\000" + raw_output_contents: "\001\000\000\000" +} + +PASS: repeat_int32 +stream stopped... + +``` + +Look how a single request generated 4 responses. + +## Running inference on Square model: + +Send inference requests to square model using [square_client.py](square_client.py). + +``` +python3 examples/decoupled/square_client.py +``` + +You should see an output similar to the output below: + +``` +stream started... +async_stream_infer +model_name: "square_int32" +id: "0" +inputs { + name: "IN" + datatype: "INT32" + shape: 1 +} +outputs { + name: "OUT" +} +raw_input_contents: "\004\000\000\000" + +enqueued request 0 to stream... +async_stream_infer +model_name: "square_int32" +id: "1" +inputs { + name: "IN" + datatype: "INT32" + shape: 1 +} +outputs { + name: "OUT" +} +raw_input_contents: "\002\000\000\000" + +enqueued request 1 to stream... +async_stream_infer +model_name: "square_int32" +id: "2" +inputs { + name: "IN" + datatype: "INT32" + shape: 1 +} +outputs { + name: "OUT" +} +raw_input_contents: "\000\000\000\000" + +enqueued request 2 to stream... +async_stream_infer +model_name: "square_int32" +id: "3" +inputs { + name: "IN" + datatype: "INT32" + shape: 1 +} +outputs { + name: "OUT" +} +raw_input_contents: "\001\000\000\000" + +enqueued request 3 to stream... 
+infer_response { + model_name: "square_int32" + model_version: "1" + id: "0" + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\004\000\000\000" +} + +infer_response { + model_name: "square_int32" + model_version: "1" + id: "1" + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\002\000\000\000" +} + +infer_response { + model_name: "square_int32" + model_version: "1" + id: "0" + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\004\000\000\000" +} + +infer_response { + model_name: "square_int32" + model_version: "1" + id: "3" + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\001\000\000\000" +} + +infer_response { + model_name: "square_int32" + model_version: "1" + id: "1" + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\002\000\000\000" +} + +infer_response { + model_name: "square_int32" + model_version: "1" + id: "0" + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\004\000\000\000" +} + +infer_response { + model_name: "square_int32" + model_version: "1" + id: "0" + outputs { + name: "OUT" + datatype: "INT32" + shape: 1 + } + raw_output_contents: "\004\000\000\000" +} + +PASS: square_int32 +stream stopped... + +``` + +Look how responses were delivered out-of-order of requests. +The generated responses can be tracked to their request using +the `id` field. \ No newline at end of file From 2b98842559055b4d2837dd972e2392683c846079 Mon Sep 17 00:00:00 2001 From: Tanmay Verma Date: Wed, 11 May 2022 14:06:48 -0700 Subject: [PATCH 036/216] Fix the hyperlink to model transaction policy (#153) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a6200486..77abf8b7 100644 --- a/README.md +++ b/README.md @@ -342,7 +342,7 @@ This mode allows user to send multiple responses for a request or not send any responses for a request. A model may also send responses out-of-order relative to the order that the request batches are executed. Such models are called *decoupled* models. In -order to use this mode, the [transaction policy](https://github.com/triton-inference-server/server/docs/model_configuration.md#model-transaction-policy) +order to use this mode, the [transaction policy](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#model-transaction-policy) in the model configuration must be set to decoupled. 
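
The interleaved client output above is produced through the decoupled response API that these patches exercise. As a condensed illustration only, the sketch below shows the core pattern from the shipped `square_model.py` example in single-threaded form (the shipped example delegates the sending loop to a separate thread per request); the `IN`/`OUT` tensor names follow that example, and the snippet assumes the model's `config.pbtxt` sets `model_transaction_policy { decoupled: true }` as described above.

```python
import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    """Condensed decoupled-style sketch (not the shipped example): for an
    input value n, send n responses for the request, then close the stream
    with the FINAL flag."""

    def execute(self, requests):
        for request in requests:
            # In decoupled mode, each request exposes its own response sender.
            sender = request.get_response_sender()
            n = int(pb_utils.get_input_tensor_by_name(request, "IN").as_numpy()[0])

            # Zero or many responses may be sent for a single request.
            for _ in range(n):
                out_tensor = pb_utils.Tensor("OUT", np.array([n], dtype=np.int32))
                sender.send(pb_utils.InferenceResponse(output_tensors=[out_tensor]))

            # The FINAL flag tells Triton (and the client) that no more
            # responses will be produced for this request.
            sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

        # Decoupled models return None from execute(); all responses travel
        # through the per-request senders instead.
        return None
```

Because responses are pushed through per-request senders rather than returned from `execute`, they can reach the client interleaved across requests, which is why the client transcripts above match each response to its request by the `id` field.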
From 92245a771aca9be9986ae7ed71c4ae90ccafdfdb Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Sat, 21 May 2022 16:59:34 -0400 Subject: [PATCH 037/216] Add GPU tensor support to decoupled API (#154) --- examples/bls/async_model.py | 4 +- src/infer_response.cc | 139 ++++++++++--- src/infer_response.h | 33 +++- src/ipc_message.h | 2 +- src/pb_stub.cc | 100 +--------- src/pb_stub.h | 67 ++++++- src/pb_utils.cc | 19 ++ src/pb_utils.h | 12 ++ src/python.cc | 383 ++++++++++++++++++------------------ src/request_executor.cc | 2 +- src/response_sender.cc | 67 +++++-- 11 files changed, 489 insertions(+), 339 deletions(-) diff --git a/examples/bls/async_model.py b/examples/bls/async_model.py index d070069d..ef287fdd 100644 --- a/examples/bls/async_model.py +++ b/examples/bls/async_model.py @@ -120,11 +120,11 @@ async def execute(self, requests): raise pb_utils.TritonModelException( infer_response.error().message()) - # Get the OUTPUT0 from the "pytorch" model inference resposne + # Get the OUTPUT0 from the "pytorch" model inference response pytorch_output0_tensor = pb_utils.get_output_tensor_by_name( inference_responses[0], "OUTPUT0") - # Get the OUTPUT1 from the "addsub" model inference resposne + # Get the OUTPUT1 from the "addsub" model inference response addsub_output1_tensor = pb_utils.get_output_tensor_by_name( inference_responses[1], "OUTPUT1") diff --git a/src/infer_response.cc b/src/infer_response.cc index facb2da0..f897307f 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -164,40 +164,68 @@ InferResponse::Error() } #ifndef TRITON_PB_STUB -TRITONSERVER_Error* +std::shared_ptr InferResponse::Send( TRITONBACKEND_ResponseFactory* response_factory, void* cuda_stream, - const uint32_t flags) + bool& requires_deferred_callback, const uint32_t flags, + std::unique_ptr& shm_pool, + std::vector, void*>>& output_buffers, + const std::set& requested_output_names, + TRITONBACKEND_Response* response) { - // [FIXME] Use this code to send responses in non-decoupled mode. - TRITONBACKEND_Response* response = nullptr; - TRITONSERVER_Error* response_error = nullptr; - ScopedDefer response_error_handling([&response, &response_error, flags, - response_factory] { - if (response != nullptr) { - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend(response, flags, response_error), - "failed to send the response."); - if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { - std::unique_ptr< - TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> - response_factory_ptr( - reinterpret_cast(response_factory)); - } - } - }); + std::shared_ptr response_error = + WrapTritonErrorInSharedPtr(nullptr); + std::unique_ptr response_error_handling; + requires_deferred_callback = false; + + // Should only destruct the response factory whenever a response factory is + // being created. + bool destruct_response_factor = (response == nullptr); + + if (response == nullptr) { + SET_ERROR_AND_RETURN( + response_error, + TRITONBACKEND_ResponseNewFromFactory(&response, response_factory)); + } - SET_ERROR_AND_RETURN( - response_error, - TRITONBACKEND_ResponseNewFromFactory(&response, response_factory)); + // This lambda expression will be called when this function exits, if the + // inference response doesn't have any GPU tensors. Otherwise, it will be + // called when the object is destructed or DeferredSendCallback is called. 
+ response_error_handling = std::make_unique( + [response, response_error, flags, response_factory, + destruct_response_factor] { + if (response != nullptr) { + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend(response, flags, *response_error), + "failed to send the response."); + if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL && + destruct_response_factor) { + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + response_factory_ptr( + reinterpret_cast( + response_factory)); + } + } + }); + + // Moves the response sending callback so that it is not called until the stub + // process fills in the GPU buffers. + ScopedDefer deferred_task( + [this, &requires_deferred_callback, &response_error_handling] { + if (requires_deferred_callback) { + deferred_send_callback_ = std::move(response_error_handling); + } + }); if (HasError()) { - response_error = TRITONSERVER_ErrorNew( + *response_error = TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, Error()->Message().c_str()); return nullptr; } bool cuda_copy = false; + for (auto& output_tensor : OutputTensors()) { TRITONSERVER_MemoryType src_memory_type = output_tensor->MemoryType(); int64_t src_memory_type_id = output_tensor->MemoryTypeId(); @@ -205,12 +233,8 @@ InferResponse::Send( TRITONSERVER_MemoryType actual_memory_type = src_memory_type; int64_t actual_memory_type_id = src_memory_type_id; - // [FIXME] GPU tensors are not supported in the decoupled API mode. if (actual_memory_type == TRITONSERVER_MEMORY_GPU) { - response_error = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "GPU tensors are not supported in decoupled API."); - return response_error; + requires_deferred_callback = true; } TRITONBACKEND_Output* response_output; @@ -222,12 +246,61 @@ InferResponse::Send( output_tensor->Dims().data(), output_tensor->Dims().size())); void* buffer; - bool cuda_used = false; SET_ERROR_AND_RETURN( response_error, TRITONBACKEND_OutputBuffer( response_output, &buffer, output_tensor->ByteSize(), &actual_memory_type, &actual_memory_type_id)); + bool cuda_used = false; + TRITONSERVER_BufferAttributes* output_buffer_attributes; + SET_ERROR_AND_RETURN( + response_error, TRITONBACKEND_OutputBufferAttributes( + response_output, &output_buffer_attributes)); + + std::unique_ptr output_buffer; + if (src_memory_type == TRITONSERVER_MEMORY_GPU && + actual_memory_type == TRITONSERVER_MEMORY_GPU) { +#ifdef TRITON_ENABLE_GPU + cudaIpcMemHandle_t* cuda_ipc_mem_handle_p; + SET_ERROR_AND_RETURN( + response_error, + TRITONSERVER_BufferAttributesCudaIpcHandle( + output_buffer_attributes, + reinterpret_cast(&cuda_ipc_mem_handle_p))); + + if (cuda_ipc_mem_handle_p != nullptr) { + SET_ERROR_AND_RETURN_IF_EXCEPTION( + response_error, + output_buffer = PbMemory::Create( + shm_pool, actual_memory_type, actual_memory_type_id, + output_tensor->ByteSize(), reinterpret_cast(buffer), + false /* copy_gpu */)); + output_buffer->SetCudaIpcHandle(cuda_ipc_mem_handle_p); + } else { + SET_ERROR_AND_RETURN_IF_EXCEPTION( + response_error, + output_buffer = PbMemory::Create( + shm_pool, actual_memory_type, actual_memory_type_id, + output_tensor->ByteSize(), reinterpret_cast(buffer), + true /* copy_gpu */)); + } + output_buffers.push_back({std::move(output_buffer), buffer}); +#endif + } + + // When we requested a GPU buffer but received a CPU buffer. 
+ if (src_memory_type == TRITONSERVER_MEMORY_GPU && + (actual_memory_type == TRITONSERVER_MEMORY_CPU || + actual_memory_type == TRITONSERVER_MEMORY_CPU_PINNED)) { + SET_ERROR_AND_RETURN_IF_EXCEPTION( + response_error, + output_buffer = PbMemory::Create( + shm_pool, actual_memory_type, actual_memory_type_id, + output_tensor->ByteSize(), nullptr /* data ptr */)); + + output_buffers.push_back({std::move(output_buffer), buffer}); + } + if (src_memory_type != TRITONSERVER_MEMORY_GPU) { SET_ERROR_AND_RETURN( response_error, @@ -251,4 +324,12 @@ InferResponse::Send( } #endif +#ifndef TRITON_PB_STUB +void +InferResponse::DeferredSendCallback() +{ + deferred_send_callback_.reset(); +} +#endif + }}} // namespace triton::backend::python diff --git a/src/infer_response.h b/src/infer_response.h index 69a22d04..5e994eaa 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -29,6 +29,7 @@ #include "pb_error.h" #include "pb_tensor.h" #include "pb_utils.h" +#include "scoped_defer.h" namespace triton { namespace backend { namespace python { @@ -44,11 +45,24 @@ struct ResponseShm { do { \ TRITONSERVER_Error* raasnie_err__ = (X); \ if (raasnie_err__ != nullptr) { \ - E = raasnie_err__; \ + *E = raasnie_err__; \ return E; \ } \ } while (false) +#define SET_ERROR_AND_RETURN_IF_EXCEPTION(E, X) \ + do { \ + try { \ + (X); \ + } \ + catch (const PythonBackendException& pb_exception) { \ + TRITONSERVER_Error* rarie_err__ = TRITONSERVER_ErrorNew( \ + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); \ + *E = rarie_err__; \ + return E; \ + } \ + } while (false) + class InferResponse { public: InferResponse( @@ -66,10 +80,19 @@ class InferResponse { bi::managed_external_buffer::handle_t ShmHandle(); #ifndef TRITON_PB_STUB - /// Send an inference response - TRITONSERVER_Error* Send( + /// Send an inference response. If the response has a GPU tensor, sending the + /// response needs to be done in two step. The boolean + /// 'requires_deferred_callback' indicates whether DeferredSendCallback method + /// should be called or not. + std::shared_ptr Send( TRITONBACKEND_ResponseFactory* response_factory, void* cuda_stream, - const uint32_t flags); + bool& requires_deferred_callback, const uint32_t flags, + std::unique_ptr& shm_pool, + std::vector, void*>>& output_buffers, + const std::set& requested_output_names = {}, + TRITONBACKEND_Response* response = nullptr); + + void DeferredSendCallback(); #endif // Disallow copying the inference response object. @@ -84,5 +107,7 @@ class InferResponse { std::shared_ptr error_; bi::managed_external_buffer::handle_t shm_handle_; AllocatedSharedMemory response_shm_; + std::vector, void*>> gpu_output_buffers_; + std::unique_ptr deferred_send_callback_; }; }}} // namespace triton::backend::python diff --git a/src/ipc_message.h b/src/ipc_message.h index eb89c57f..7cf9ede0 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -37,7 +37,7 @@ namespace bi = boost::interprocess; typedef enum PYTHONSTUB_commandtype_enum { PYTHONSTUB_ExecuteRequest, - PYTHONSTUB_ExecuteResposne, + PYTHONSTUB_ExecuteResponse, PYTHONSTUB_InitializeRequest, PYTHONSTUB_InitializeResponse, PYTHONSTUB_FinalizeRequest, diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 37c7f10b..a57503b7 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -25,7 +25,6 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "pb_stub.h" - #include #include #include @@ -59,65 +58,6 @@ using namespace pybind11::literals; namespace bi = boost::interprocess; namespace triton { namespace backend { namespace python { -#define LOG_IF_EXCEPTION(X) \ - do { \ - try { \ - (X); \ - } \ - catch (const PythonBackendException& pb_exception) { \ - LOG_INFO << pb_exception.what(); \ - } \ - } while (false) - -#define LOG_EXCEPTION(E) \ - do { \ - LOG_INFO << E.what(); \ - } while (false) - -// Macros that use current filename and line number. -#define LOG_INFO LOG_INFO_FL(__FILE__, __LINE__) - -class Logger { - public: - // Log a message. - void Log(const std::string& msg) { std::cerr << msg << std::endl; } - - // Flush the log. - void Flush() { std::cerr << std::flush; } -}; - -Logger gLogger_; -class LogMessage { - public: - LogMessage(const char* file, int line) - { - std::string path(file); - size_t pos = path.rfind('/'); - if (pos != std::string::npos) { - path = path.substr(pos + 1, std::string::npos); - } - - struct timeval tv; - gettimeofday(&tv, NULL); - struct tm tm_time; - gmtime_r(((time_t*)&(tv.tv_sec)), &tm_time); - stream_ << std::setfill('0') << std::setw(2) << (tm_time.tm_mon + 1) - << std::setw(2) << tm_time.tm_mday << " " << std::setw(2) - << tm_time.tm_hour << ':' << std::setw(2) << tm_time.tm_min << ':' - << std::setw(2) << tm_time.tm_sec << "." << std::setw(6) - << tv.tv_usec << ' ' << static_cast(getpid()) << ' ' - << path << ':' << line << "] "; - } - - ~LogMessage() { gLogger_.Log(stream_.str()); } - - std::stringstream& stream() { return stream_; } - - private: - std::stringstream stream_; -}; - -#define LOG_INFO_FL(FN, LN) LogMessage((char*)(FN), LN).stream() std::atomic non_graceful_exit = {false}; @@ -458,52 +398,20 @@ Stub::LoadGPUBuffers(std::unique_ptr& ipc_message) return; } - // We need to hold the cpu_buffers until the main process makes a copy from - // them. - std::vector> cpu_buffers; std::vector> dst_buffers; - bool has_cpu_buffer = false; for (size_t i = 0; i < gpu_tensors_.size(); i++) { std::unique_ptr dst_buffer = PbMemory::LoadFromSharedMemory( shm_pool_, gpu_buffers_handle_shm[i], true /* open_cuda_handle */); - if (dst_buffer->MemoryType() == TRITONSERVER_MEMORY_CPU) { - has_cpu_buffer = true; - } dst_buffers.emplace_back(std::move(dst_buffer)); } - // Pop a dummy message from the stub message queue indicating that the parent - // has finished copying the tensors. - ScopedDefer _([this, has_cpu_buffer] { - if (has_cpu_buffer) { - stub_message_queue_->Pop(); - } - }); - ScopedDefer load_gpu_buffer_response( - [this, has_cpu_buffer] { parent_message_queue_->Push(DUMMY_MESSAGE); }); + [this] { parent_message_queue_->Push(DUMMY_MESSAGE); }); for (size_t i = 0; i < gpu_tensors_.size(); i++) { std::shared_ptr& src_buffer = gpu_tensors_[i]; - - // If the memory type is CPU, the buffer is empty and we need to create - // a buffer. - if (dst_buffers[i]->MemoryType() == TRITONSERVER_MEMORY_CPU) { - dst_buffers[i] = PbMemory::Create( - shm_pool_, dst_buffers[i]->MemoryType(), - dst_buffers[i]->MemoryTypeId(), src_buffer->ByteSize(), - nullptr /* buffer */); - - // Update the handle so that the main process can load it. 
- gpu_buffers_handle_shm[i] = dst_buffers[i]->ShmHandle(); - } - PbMemory::CopyBuffer(dst_buffers[i], src_buffer->Memory()); - - if (dst_buffers[i]->MemoryType() == TRITONSERVER_MEMORY_CPU) { - cpu_buffers.push_back(std::move(dst_buffers[i])); - } } gpu_tensors_.clear(); @@ -541,7 +449,7 @@ Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr) LoadRequestsFromSharedMemory(request_batch_shm_ptr); std::unique_ptr execute_response = IPCMessage::Create(shm_pool_, false /* Inline response */); - execute_response->Command() = PYTHONSTUB_ExecuteResposne; + execute_response->Command() = PYTHONSTUB_ExecuteResponse; AllocatedSharedMemory response_batch = shm_pool_->Construct(); @@ -615,7 +523,7 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) { std::unique_ptr execute_response = IPCMessage::Create(shm_pool_, false /* Inline response */); - execute_response->Command() = PYTHONSTUB_ExecuteResposne; + execute_response->Command() = PYTHONSTUB_ExecuteResponse; AllocatedSharedMemory response_batch = shm_pool_->Construct( request_batch_shm_ptr->batch_size * @@ -693,7 +601,7 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) size_t response_size = py::len(responses); // If the number of request objects do not match the number of - // resposne objects throw an error. + // response objects throw an error. if (response_size != batch_size) { std::string err = "Number of InferenceResponse objects do not match the number " diff --git a/src/pb_stub.h b/src/pb_stub.h index e503b7fc..312c7ca0 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -24,6 +24,8 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + #include #include #include @@ -31,6 +33,9 @@ #include #include #include +#include +#include +#include #include #include #include "infer_request.h" @@ -39,7 +44,6 @@ #include "message_queue.h" #include "pb_utils.h" -#pragma once namespace bi = boost::interprocess; namespace py = pybind11; @@ -47,6 +51,67 @@ using namespace pybind11::literals; namespace triton { namespace backend { namespace python { +#define LOG_IF_EXCEPTION(X) \ + do { \ + try { \ + (X); \ + } \ + catch (const PythonBackendException& pb_exception) { \ + LOG_INFO << pb_exception.what(); \ + } \ + } while (false) + +#define LOG_EXCEPTION(E) \ + do { \ + LOG_INFO << E.what(); \ + } while (false) + +// Macros that use current filename and line number. +#define LOG_INFO LOG_INFO_FL(__FILE__, __LINE__) + +class Logger { + public: + // Log a message. + void Log(const std::string& msg) { std::cerr << msg << std::endl; } + + // Flush the log. + void Flush() { std::cerr << std::flush; } +}; + +static Logger gLogger_; + +class LogMessage { + public: + LogMessage(const char* file, int line) + { + std::string path(file); + size_t pos = path.rfind('/'); + if (pos != std::string::npos) { + path = path.substr(pos + 1, std::string::npos); + } + + struct timeval tv; + gettimeofday(&tv, NULL); + struct tm tm_time; + gmtime_r(((time_t*)&(tv.tv_sec)), &tm_time); + stream_ << std::setfill('0') << std::setw(2) << (tm_time.tm_mon + 1) + << std::setw(2) << tm_time.tm_mday << " " << std::setw(2) + << tm_time.tm_hour << ':' << std::setw(2) << tm_time.tm_min << ':' + << std::setw(2) << tm_time.tm_sec << "." 
<< std::setw(6) + << tv.tv_usec << ' ' << static_cast(getpid()) << ' ' + << path << ':' << line << "] "; + } + + ~LogMessage() { gLogger_.Log(stream_.str()); } + + std::stringstream& stream() { return stream_; } + + private: + std::stringstream stream_; +}; + +#define LOG_INFO_FL(FN, LN) LogMessage((char*)(FN), LN).stream() + class Stub { public: Stub(){}; diff --git a/src/pb_utils.cc b/src/pb_utils.cc index 4de37d33..db6f83a4 100644 --- a/src/pb_utils.cc +++ b/src/pb_utils.cc @@ -211,4 +211,23 @@ CUDAHandler::~CUDAHandler() noexcept(false) } } #endif + +#ifndef TRITON_PB_STUB +std::shared_ptr +WrapTritonErrorInSharedPtr(TRITONSERVER_Error* error) +{ + std::shared_ptr response_error( + new TRITONSERVER_Error*, [](TRITONSERVER_Error** error) { + if (error != nullptr && *error != nullptr) { + TRITONSERVER_ErrorDelete(*error); + } + + if (error != nullptr) { + delete error; + } + }); + *response_error = error; + return response_error; +} +#endif }}} // namespace triton::backend::python diff --git a/src/pb_utils.h b/src/pb_utils.h index 3d5cbb3d..5af7a9dd 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -153,6 +153,13 @@ struct ResponseSenderBase { struct ResponseSendMessage : ResponseSenderBase { bi::managed_external_buffer::handle_t response; + + // GPU Buffers handle + bi::managed_external_buffer::handle_t gpu_buffers_handle; + + // GPU buffers count + uint32_t gpu_buffers_count; + uint32_t flags; }; @@ -198,4 +205,9 @@ class CUDAHandler { }; #endif // TRITON_ENABLE_GPU +#ifndef TRITON_PB_STUB +std::shared_ptr WrapTritonErrorInSharedPtr( + TRITONSERVER_Error* error); +#endif + }}} // namespace triton::backend::python diff --git a/src/python.cc b/src/python.cc index 891304d4..93fb4927 100644 --- a/src/python.cc +++ b/src/python.cc @@ -353,7 +353,8 @@ class ModelInstanceState : public BackendModelInstance { // Set error for response send message void SetErrorForResponseSendMessage( - ResponseSendMessage* response_send_message, TRITONSERVER_Error* error, + ResponseSendMessage* response_send_message, + std::shared_ptr error, std::unique_ptr& error_message); TRITONSERVER_Error* SaveRequestsToSharedMemory( @@ -475,14 +476,15 @@ ModelInstanceState::ExistsInClosedRequests(intptr_t closed_request) void ModelInstanceState::SetErrorForResponseSendMessage( - ResponseSendMessage* response_send_message, TRITONSERVER_Error* error, + ResponseSendMessage* response_send_message, + std::shared_ptr error, std::unique_ptr& error_message) { - if (error != nullptr) { + if (error && *error != nullptr) { response_send_message->has_error = true; LOG_IF_EXCEPTION( error_message = - PbString::Create(shm_pool_, TRITONSERVER_ErrorMessage(error))); + PbString::Create(shm_pool_, TRITONSERVER_ErrorMessage(*error))); response_send_message->error = error_message->ShmHandle(); response_send_message->is_error_set = true; } @@ -862,7 +864,7 @@ ModelInstanceState::StartStubProcess() return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, (std::string( - "Received unexpected resposne from Python backend stub: ") + + "Received unexpected response from Python backend stub: ") + name_) .c_str()); } @@ -1094,16 +1096,9 @@ ModelInstanceState::GetInputTensor( HostPolicyName().c_str()); } - ModelState* model_state = reinterpret_cast(Model()); bool cpu_only_tensors = model_state->ForceCPUOnlyInputTensors(); - if (!cpu_only_tensors && model_state->IsDecoupled()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "FORCE_CPU_ONLY_INPUT_TENSORS set to OFF is not yet supported in the " - "decoupled API."); - } if 
(input_dtype == TRITONSERVER_TYPE_BYTES) { cpu_only_tensors = true; } @@ -1147,12 +1142,9 @@ ModelInstanceState::GetInputTensor( input_name, input_buffer, input_byte_size, TRITONSERVER_MEMORY_CPU /* memory_type */, 0 /* memory_type_id */); } else { - bool cuda_used = false; - CopyBuffer( - "Failed to copy the output tensor to buffer.", src_memory_type, - src_memory_type_id, TRITONSERVER_MEMORY_CPU /* memory_type */, - 0 /* memory type id */, input_byte_size, src_ptr, input_buffer, - CudaStream(), &cuda_used); + size_t byte_size = input_byte_size; + RETURN_IF_ERROR(backend::ReadInputTensor( + request, input_name, input_buffer, &byte_size)); } } else { #ifdef TRITON_ENABLE_GPU @@ -1162,54 +1154,82 @@ ModelInstanceState::GetInputTensor( std::vector> alloc_perference; alloc_perference = {{TRITONSERVER_MEMORY_GPU, src_memory_type_id}}; + // collector is used in the non-decoupled mode. if (collector) { RETURN_IF_ERROR(collector->ProcessTensor( input_name, nullptr, 0, alloc_perference, reinterpret_cast(&buffer), &input_byte_size, &src_memory_type, &src_memory_type_id)); - } - - // If the tensor is using the cuda shared memory, we need to extract the - // handle that was used to create the device pointer. This is because of a - // limitation in the legacy CUDA IPC API that doesn't allow getting the - // handle of an exported pointer. If the cuda handle exists, it indicates - // that the cuda shared memory was used and the input is in a single buffer. - // [FIXME] for the case where the input is in cuda shared memory and uses - // multiple input buffers this needs to be changed. - TRITONSERVER_BufferAttributes* buffer_attributes; - // This value is not used. - const void* buffer_p; - RETURN_IF_ERROR(TRITONBACKEND_InputBufferAttributes( - in, 0, &buffer_p, &buffer_attributes)); + // If the tensor is using the cuda shared memory, we need to extract the + // handle that was used to create the device pointer. This is because of a + // limitation in the legacy CUDA IPC API that doesn't allow getting the + // handle of an exported pointer. If the cuda handle exists, it indicates + // that the cuda shared memory was used and the input is in a single + // buffer. + // [FIXME] For the case where the input is in cuda shared memory and uses + // multiple input buffers this needs to be changed. + TRITONSERVER_BufferAttributes* buffer_attributes; + + // This value is not used. 
+ const void* buffer_p; + RETURN_IF_ERROR(TRITONBACKEND_InputBufferAttributes( + in, 0, &buffer_p, &buffer_attributes)); - if (collector) { input_tensor = std::make_shared( std::string(input_name), std::vector(input_shape, input_shape + input_dims_count), input_dtype, src_memory_type, src_memory_type_id, const_cast(buffer), input_byte_size, nullptr /* DLManagedTensor */); + + cudaIpcMemHandle_t* cuda_ipc_handle; + RETURN_IF_ERROR(TRITONSERVER_BufferAttributesCudaIpcHandle( + buffer_attributes, reinterpret_cast(&cuda_ipc_handle))); + if (cuda_ipc_handle != nullptr) { + RETURN_IF_EXCEPTION( + input_tensor->SaveToSharedMemory(shm_pool_, false /* copy_gpu */)); + RETURN_IF_EXCEPTION( + input_tensor->Memory()->SetCudaIpcHandle(cuda_ipc_handle)); + } else { + RETURN_IF_EXCEPTION( + input_tensor->SaveToSharedMemory(shm_pool_, true /* copy_gpu */)); + } } else { + void* dev_ptr; + RETURN_IF_CUDA_ERROR( + cudaMalloc(&dev_ptr, input_byte_size), TRITONSERVER_ERROR_INTERNAL, + std::string("Failed to allocated CUDA memory")); + + size_t byte_size = input_byte_size; + + bool cuda_used = false; + RETURN_IF_ERROR(backend::ReadInputTensor( + request, input_name, reinterpret_cast(dev_ptr), &byte_size, + TRITONSERVER_MEMORY_GPU, src_memory_type_id, CudaStream(), + &cuda_used)); + + if (cuda_used) { +#ifdef TRITON_ENABLE_GPU + cudaStreamSynchronize(stream_); +#endif + } + input_tensor = std::make_shared( std::string(input_name), std::vector(input_shape, input_shape + input_dims_count), input_dtype, src_memory_type, src_memory_type_id, - const_cast(buffer_p), input_byte_size, + const_cast(dev_ptr), input_byte_size, nullptr /* DLManagedTensor */); - } - cudaIpcMemHandle_t* cuda_ipc_handle; - RETURN_IF_ERROR(TRITONSERVER_BufferAttributesCudaIpcHandle( - buffer_attributes, reinterpret_cast(&cuda_ipc_handle))); - if (cuda_ipc_handle != nullptr) { - RETURN_IF_EXCEPTION( - input_tensor->SaveToSharedMemory(shm_pool_, false /* copy_gpu */)); - RETURN_IF_EXCEPTION( - input_tensor->Memory()->SetCudaIpcHandle(cuda_ipc_handle)); - } else { RETURN_IF_EXCEPTION( input_tensor->SaveToSharedMemory(shm_pool_, true /* copy_gpu */)); + + std::unique_ptr gpu_memory_record = + std::make_unique(input_tensor->Memory()->DataPtr()); + uint64_t memory_release_id = + memory_manager_->AddRecord(std::move(gpu_memory_record)); + input_tensor->Memory()->SetMemoryReleaseId(memory_release_id); } #else return TRITONSERVER_ErrorNew( @@ -1410,7 +1430,7 @@ ModelInstanceState::DecoupledMessageQueueMonitor() // Need to notify the model instance thread that the execute response has // been received. 
- if (message->Command() == PYTHONSTUB_ExecuteResposne) { + if (message->Command() == PYTHONSTUB_ExecuteResponse) { std::lock_guard guard{mu_}; received_message_ = std::move(message); cv_.notify_one(); @@ -1468,13 +1488,76 @@ ModelInstanceState::ResponseSendDecoupled( InferResponse::LoadFromSharedMemory( shm_pool_, send_message_payload->response, false /* open cuda ipc handle */); - TRITONSERVER_Error* error = infer_response->Send( - response_factory, CudaStream(), send_message_payload->flags); + + bool requires_deferred_callback = false; + std::vector, void*>> gpu_output_buffers; + std::shared_ptr error = infer_response->Send( + response_factory, CudaStream(), requires_deferred_callback, + send_message_payload->flags, shm_pool_, gpu_output_buffers); SetErrorForResponseSendMessage(send_message_payload, error, error_message); + + if (requires_deferred_callback) { + AllocatedSharedMemory gpu_buffers_handle = + shm_pool_->Construct( + sizeof(uint64_t) + + gpu_output_buffers.size() * + sizeof(bi::managed_external_buffer::handle_t)); + uint64_t* gpu_buffer_count = + reinterpret_cast(gpu_buffers_handle.data_.get()); + *gpu_buffer_count = gpu_output_buffers.size(); + bi::managed_external_buffer::handle_t* gpu_buffers_handle_shm = + reinterpret_cast( + gpu_buffers_handle.data_.get() + sizeof(uint64_t)); + send_message_payload->gpu_buffers_handle = gpu_buffers_handle.handle_; + + size_t index = 0; + for (auto& output_buffer_pair : gpu_output_buffers) { + std::unique_ptr& pb_memory = output_buffer_pair.first; + gpu_buffers_handle_shm[index] = pb_memory->ShmHandle(); + ++index; + } + + // Additional round trip so that the stub can fill the GPU output buffers. + { + bi::scoped_lock guard{send_message_payload->mu}; + send_message_payload->is_stub_turn = true; + send_message_payload->cv.notify_all(); + + while (send_message_payload->is_stub_turn) { + send_message_payload->cv.wait(guard); + } + } + + index = 0; + bool cuda_copy = false; + for (auto& output_buffer_pair : gpu_output_buffers) { + auto& pb_memory = output_buffer_pair.first; + + if (pb_memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { + bool cuda_used; + void* pointer = output_buffer_pair.second; + + CopyBuffer( + "Failed to copy the output tensor to buffer.", + TRITONSERVER_MEMORY_CPU, 0, TRITONSERVER_MEMORY_CPU, 0, + pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, + CudaStream(), &cuda_used); + cuda_copy |= cuda_used; + } + gpu_buffers_handle_shm[index] = pb_memory->ShmHandle(); + ++index; +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(stream_); + } +#endif // TRITON_ENABLE_GPU + } + } } else { TRITONSERVER_Error* error = TRITONBACKEND_ResponseFactorySendFlags( response_factory, send_message_payload->flags); - SetErrorForResponseSendMessage(send_message_payload, error, error_message); + SetErrorForResponseSendMessage( + send_message_payload, WrapTritonErrorInSharedPtr(error), error_message); if (send_message_payload->flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { std::unique_ptr< @@ -1717,9 +1800,11 @@ ModelInstanceState::ProcessRequests( // If the output provided by the model is in GPU, we will pass the list of // buffers provided by Triton to the stub process. 
bool has_gpu_output = false; + std::vector requires_deferred_callback; - // GPU output buffers - std::vector, std::pair>> + std::vector> shm_responses; + std::unordered_map< + uint32_t, std::vector, void*>>> gpu_output_buffers; for (uint32_t r = 0; r < request_count; ++r) { @@ -1728,7 +1813,8 @@ ModelInstanceState::ProcessRequests( TRITONBACKEND_Request* request = requests[r]; uint32_t requested_output_count = 0; - std::unique_ptr infer_response; + shm_responses.emplace_back(nullptr); + std::unique_ptr& infer_response = shm_responses.back(); try { infer_response = InferResponse::LoadFromSharedMemory( shm_pool_, response_shm_handle[r], false /* open_cuda_handle */); @@ -1764,7 +1850,6 @@ ModelInstanceState::ProcessRequests( responses, r, TRITONBACKEND_RequestOutputCount(request, &requested_output_count)); - bool cuda_copy = false; std::set requested_output_names; for (size_t j = 0; j < requested_output_count; ++j) { const char* output_name; @@ -1774,106 +1859,24 @@ ModelInstanceState::ProcessRequests( requested_output_names.insert(output_name); } - for (auto& output_tensor : infer_response->OutputTensors()) { - if (requested_output_names.find(output_tensor->Name()) == - requested_output_names.end()) { - continue; - } - - TRITONSERVER_MemoryType src_memory_type = output_tensor->MemoryType(); - int64_t src_memory_type_id = output_tensor->MemoryTypeId(); - - TRITONSERVER_MemoryType actual_memory_type = src_memory_type; - int64_t actual_memory_type_id = src_memory_type_id; - - if (actual_memory_type == TRITONSERVER_MEMORY_GPU) - has_gpu_output = true; - - TRITONBACKEND_Output* response_output; - GUARDED_RESPOND_IF_ERROR( - responses, r, - TRITONBACKEND_ResponseOutput( - response, &response_output, output_tensor->Name().c_str(), - static_cast(output_tensor->TritonDtype()), - output_tensor->Dims().data(), output_tensor->Dims().size())); - - void* buffer; - bool cuda_used = false; - GUARDED_RESPOND_IF_ERROR( - responses, r, - TRITONBACKEND_OutputBuffer( - response_output, &buffer, output_tensor->ByteSize(), - &actual_memory_type, &actual_memory_type_id)); - - TRITONSERVER_BufferAttributes* output_buffer_attributes; - GUARDED_RESPOND_IF_ERROR( - responses, r, - TRITONBACKEND_OutputBufferAttributes( - response_output, &output_buffer_attributes)); + bool require_deferred_callback = false; - std::unique_ptr output_buffer; - if (src_memory_type == TRITONSERVER_MEMORY_GPU && - actual_memory_type == TRITONSERVER_MEMORY_GPU) { - if ((*responses)[r] != nullptr) { -#ifdef TRITON_ENABLE_GPU - cudaIpcMemHandle_t* cuda_ipc_mem_handle_p; - GUARDED_RESPOND_IF_ERROR( - responses, r, - TRITONSERVER_BufferAttributesCudaIpcHandle( - output_buffer_attributes, - reinterpret_cast(&cuda_ipc_mem_handle_p))); - - if (cuda_ipc_mem_handle_p != nullptr) { - GUARDED_RESPOND_IF_EXCEPTION( - responses, r, - output_buffer = PbMemory::Create( - shm_pool_, actual_memory_type, actual_memory_type_id, - output_tensor->ByteSize(), reinterpret_cast(buffer), - false /* copy_gpu */)); - output_buffer->SetCudaIpcHandle(cuda_ipc_mem_handle_p); - } else { - GUARDED_RESPOND_IF_EXCEPTION( - responses, r, - output_buffer = PbMemory::Create( - shm_pool_, actual_memory_type, actual_memory_type_id, - output_tensor->ByteSize(), reinterpret_cast(buffer), - true /* copy_gpu */)); - } - gpu_output_buffers.push_back({std::move(output_buffer), {buffer, r}}); -#endif - } - } + gpu_output_buffers[r] = + std::vector, void*>>{}; + std::shared_ptr error = infer_response->Send( + nullptr, CudaStream(), require_deferred_callback, + 
TRITONSERVER_RESPONSE_COMPLETE_FINAL, shm_pool_, gpu_output_buffers[r], + requested_output_names, response); + GUARDED_RESPOND_IF_ERROR(responses, r, *error); - // When we requested a GPU buffer but received a CPU buffer. - if (src_memory_type == TRITONSERVER_MEMORY_GPU && - (actual_memory_type == TRITONSERVER_MEMORY_CPU || - actual_memory_type == TRITONSERVER_MEMORY_CPU_PINNED)) { - GUARDED_RESPOND_IF_EXCEPTION( - responses, r, - output_buffer = PbMemory::Create( - shm_pool_, actual_memory_type, actual_memory_type_id, - 0 /* byte size */, nullptr /* data ptr */)); - - gpu_output_buffers.push_back({std::move(output_buffer), {buffer, r}}); - } + // Error object will be deleted by the GUARDED_RESPOND macro + *error = nullptr; + error.reset(); - if (src_memory_type != TRITONSERVER_MEMORY_GPU) { - GUARDED_RESPOND_IF_ERROR( - responses, r, - CopyBuffer( - "Failed to copy the output tensor to buffer.", src_memory_type, - src_memory_type_id, actual_memory_type, actual_memory_type_id, - output_tensor->ByteSize(), output_tensor->DataPtr(), buffer, - CudaStream(), &cuda_used)); - } - - cuda_copy |= cuda_used; - } -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); + if (require_deferred_callback) { + has_gpu_output = true; } -#endif // TRITON_ENABLE_GPU + requires_deferred_callback.push_back(require_deferred_callback); } // Finalize the execute. @@ -1882,18 +1885,26 @@ ModelInstanceState::ProcessRequests( // If the output tensor is in GPU, there will be a second round trip // required for filling the GPU buffers provided by the main process. if (has_gpu_output) { + size_t total_gpu_buffers_count = 0; + for (auto& gpu_output_buffer : gpu_output_buffers) { + total_gpu_buffers_count += gpu_output_buffer.second.size(); + } AllocatedSharedMemory gpu_buffers_handle = shm_pool_->Construct( - sizeof(uint64_t) + gpu_output_buffers.size() * + sizeof(uint64_t) + total_gpu_buffers_count * sizeof(bi::managed_external_buffer::handle_t)); uint64_t* gpu_buffer_count = reinterpret_cast(gpu_buffers_handle.data_.get()); - *gpu_buffer_count = gpu_output_buffers.size(); + *gpu_buffer_count = total_gpu_buffers_count; bi::managed_external_buffer::handle_t* gpu_buffers_handle_shm = reinterpret_cast( gpu_buffers_handle.data_.get() + sizeof(uint64_t)); - for (size_t i = 0; i < gpu_output_buffers.size(); i++) { - gpu_buffers_handle_shm[i] = gpu_output_buffers[i].first->ShmHandle(); + size_t index = 0; + for (auto& gpu_output_buffer : gpu_output_buffers) { + for (auto& buffer_memory_pair : gpu_output_buffer.second) { + gpu_buffers_handle_shm[index] = buffer_memory_pair.first->ShmHandle(); + ++index; + } } ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_LoadGPUBuffers; @@ -1904,50 +1915,40 @@ ModelInstanceState::ProcessRequests( bool cuda_copy = false; - // CPU tensors require an additional notification to the stub process. - // This is to ask the stub process to release the tensor. 
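The decoupled path receives the same treatment: `ResponseSendDecoupled` above and `ResponseSender::Send` in the response_sender.cc hunks further below now defer GPU copies in the same way. For orientation, a decoupled model drives that path roughly as in the sketch below; this is illustrative only (not part of this patch), with placeholder tensor names and plain CPU NumPy outputs for simplicity.

```python
# Illustrative sketch (not part of this patch): a decoupled model whose
# responses are delivered through the response sender path changed here.
# Tensor names are placeholders; outputs are CPU NumPy arrays for simplicity.
import numpy as np

import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        for request in requests:
            sender = request.get_response_sender()
            in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0").as_numpy()
            # Stream two partial responses for a single request.
            for chunk in np.array_split(in_0, 2):
                out = pb_utils.Tensor("OUTPUT0", chunk)
                sender.send(pb_utils.InferenceResponse(output_tensors=[out]))
            # Mark the end of the response stream for this request.
            sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
        # In decoupled mode execute() must return None.
        return None
```

The final flags-only `send` with `TRITONSERVER_RESPONSE_COMPLETE_FINAL` is what ends up in the `TRITONBACKEND_ResponseFactorySendFlags` branch shown above.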
- bool has_cpu_tensor = false; - for (size_t i = 0; i < gpu_output_buffers.size(); i++) { - std::unique_ptr& memory = gpu_output_buffers[i].first; - if (memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { - bool cuda_used; - has_cpu_tensor = true; - std::unique_ptr pb_cpu_memory = - PbMemory::LoadFromSharedMemory( - shm_pool_, gpu_buffers_handle_shm[i], - false /* open cuda handle */); - uint32_t response_index = gpu_output_buffers[i].second.second; - void* pointer = gpu_output_buffers[i].second.first; - - GUARDED_RESPOND_IF_ERROR( - responses, response_index, - CopyBuffer( - "Failed to copy the output tensor to buffer.", - TRITONSERVER_MEMORY_CPU, 0, TRITONSERVER_MEMORY_CPU, 0, - pb_cpu_memory->ByteSize(), pb_cpu_memory->DataPtr(), pointer, - CudaStream(), &cuda_used)); - cuda_copy |= cuda_used; - } - } - - if (has_cpu_tensor) { - stub_message_queue_->Push(DUMMY_MESSAGE); - } + index = 0; + for (auto& gpu_output_buffer : gpu_output_buffers) { + for (auto& buffer_memory_pair : gpu_output_buffer.second) { + auto& pb_memory = buffer_memory_pair.first; + if (pb_memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { + bool cuda_used; + uint32_t response_index = gpu_output_buffer.first; + void* pointer = buffer_memory_pair.second; + GUARDED_RESPOND_IF_ERROR( + responses, response_index, + CopyBuffer( + "Failed to copy the output tensor to buffer.", + TRITONSERVER_MEMORY_CPU, 0, TRITONSERVER_MEMORY_CPU, 0, + pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, + CudaStream(), &cuda_used)); + cuda_copy |= cuda_used; + } + gpu_buffers_handle_shm[index] = pb_memory->ShmHandle(); + ++index; + } #ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); - } + if (cuda_copy) { + cudaStreamSynchronize(stream_); + } #endif // TRITON_ENABLE_GPU + } } bls_defer.Complete(); for (uint32_t r = 0; r < request_count; ++r) { - // If error happens at this stage, we can only log it - GUARDED_RESPOND_IF_ERROR( - responses, r, - TRITONBACKEND_ResponseSend( - (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr)); + if (requires_deferred_callback[r]) { + shm_responses[r]->DeferredSendCallback(); + } } uint64_t exec_end_ns = 0; @@ -2457,13 +2458,13 @@ TRITONBACKEND_ModelInstanceExecute( TRITONBACKEND_Response* response = nullptr; LOG_IF_ERROR( TRITONBACKEND_ResponseNew(&response, request), - "Failed to create a new resposne."); + "Failed to create a new response."); if (response != nullptr) { LOG_IF_ERROR( TRITONBACKEND_ResponseSend( response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, error), - "Failed to send the error resposne."); + "Failed to send the error response."); } } } diff --git a/src/request_executor.cc b/src/request_executor.cc index c338defc..5118c8bf 100644 --- a/src/request_executor.cc +++ b/src/request_executor.cc @@ -306,7 +306,7 @@ RequestExecutor::Infer( if (response != nullptr) { LOG_IF_ERROR( TRITONSERVER_InferenceResponseDelete(response), - "Failed to delete inference resposne."); + "Failed to delete inference response."); *triton_response = nullptr; } diff --git a/src/response_sender.cc b/src/response_sender.cc index 44b5de7f..e8394df9 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -28,6 +28,7 @@ #include #include #include "pb_stub.h" +#include "pb_stub_utils.h" #include "scoped_defer.h" namespace triton { namespace backend { namespace python { @@ -67,17 +68,6 @@ ResponseSender::Send( "set to zero."); } - if (infer_response) { - for (auto& tensor : infer_response->OutputTensors()) { - if (!tensor->IsCPU()) { - throw PythonBackendException( - "Tensor '" 
+ tensor->Name() + - "' is stored in GPU. GPU tensors are not supported yet in the " - "decoupled response sender."); - } - } - } - std::unique_ptr& stub = Stub::GetOrCreateInstance(); AllocatedSharedMemory response_send_message = @@ -112,10 +102,9 @@ ResponseSender::Send( ipc_message->Command() = PYTHONSTUB_ResponseSend; ipc_message->Args() = response_send_message.handle_; - ScopedDefer _([&send_message_payload] { + ScopedDefer _([send_message_payload] { { bi::scoped_lock guard{send_message_payload->mu}; - send_message_payload->is_stub_turn = false; send_message_payload->cv.notify_all(); } @@ -129,6 +118,57 @@ ResponseSender::Send( } } + bool has_gpu_output = false; + std::vector> gpu_tensors; + if (infer_response) { + for (auto& tensor : infer_response->OutputTensors()) { + if (!tensor->IsCPU()) { + has_gpu_output = true; + gpu_tensors.push_back(tensor); + } + } + } + + if (has_gpu_output) { + AllocatedSharedMemory gpu_buffers_handle = + shm_pool_->Load(send_message_payload->gpu_buffers_handle); + + bi::managed_external_buffer::handle_t* gpu_buffers_handle_shm = + reinterpret_cast( + gpu_buffers_handle.data_.get() + sizeof(uint64_t)); + uint64_t* gpu_buffer_count = + reinterpret_cast(gpu_buffers_handle.data_.get()); + if (gpu_tensors.size() != *gpu_buffer_count) { + LOG_INFO + << (std::string( + "GPU buffers size does not match the provided buffers: ") + + std::to_string(gpu_tensors.size()) + + " != " + std::to_string(*gpu_buffer_count)); + return; + } + + std::vector> dst_buffers; + + for (size_t i = 0; i < gpu_tensors.size(); i++) { + std::unique_ptr dst_buffer = PbMemory::LoadFromSharedMemory( + shm_pool_, gpu_buffers_handle_shm[i], true /* open_cuda_handle */); + dst_buffers.emplace_back(std::move(dst_buffer)); + std::shared_ptr& src_buffer = gpu_tensors[i]; + PbMemory::CopyBuffer(dst_buffers[i], src_buffer->Memory()); + } + + { + bi::scoped_lock guard{send_message_payload->mu}; + send_message_payload->is_stub_turn = false; + send_message_payload->cv.notify_one(); + while (!send_message_payload->is_stub_turn) { + // Wait for the stub process to send the response and populate error + // message if any. + send_message_payload->cv.wait(guard); + } + } + } + if (send_message_payload->has_error) { if (send_message_payload->is_error_set) { std::unique_ptr error = PbString::LoadFromSharedMemory( @@ -140,5 +180,4 @@ ResponseSender::Send( } } } - }}} // namespace triton::backend::python From 94a3ed28347b4d92290dc9587c5d62af0efd6378 Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Tue, 10 May 2022 12:42:39 -0700 Subject: [PATCH 038/216] Improve lifetime of NumPy objects (#149) (#151) * Improve lifetime of NumPy objects * Fix the leak * Add error for using decoupled API in BLS Co-authored-by: Iman Tabrizian --- src/response_sender.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/response_sender.cc b/src/response_sender.cc index e8394df9..d4081c8c 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -68,6 +68,17 @@ ResponseSender::Send( "set to zero."); } + if (infer_response) { + for (auto& tensor : infer_response->OutputTensors()) { + if (!tensor->IsCPU()) { + throw PythonBackendException( + "Tensor '" + tensor->Name() + + "' is stored in GPU. 
GPU tensors are not supported yet in the " + "decoupled response sender."); + } + } + } + std::unique_ptr& stub = Stub::GetOrCreateInstance(); AllocatedSharedMemory response_send_message = From 85613cfcc5e97cc8d6e3630100f0314de4f64991 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Thu, 12 May 2022 18:23:31 -0400 Subject: [PATCH 039/216] Improve decoupled API known issue failure (#155) * Fix the hyperlink to model transaction policy (#153) * Improve decoupled API known issue failure Co-authored-by: Tanmay Verma --- src/python.cc | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/src/python.cc b/src/python.cc index 93fb4927..0e38917b 100644 --- a/src/python.cc +++ b/src/python.cc @@ -2078,6 +2078,18 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) TRITONSERVER_ErrorDelete(error); } + triton::common::TritonJson::Value model_transaction_policy; + if (model_config_.Find( + "model_transaction_policy", &model_transaction_policy)) { + triton::common::TritonJson::Value decoupled; + if (model_transaction_policy.Find("decoupled", &decoupled)) { + auto error = decoupled.AsBool(&decoupled_); + if (error != nullptr) { + throw BackendModelException(error); + } + } + } + // Skip the FORCE_CPU_ONLY_INPUT_TENSORS variable if it doesn't exits. std::string force_cpu_only_input_tensor; error = nullptr; @@ -2090,6 +2102,12 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) TRITONSERVER_LOG_INFO, (std::string("Forcing CPU only input tensors.")).c_str()); } else if (force_cpu_only_input_tensor == "no") { + if (decoupled_) { + throw BackendModelException(TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + "FORCE_CPU_ONLY_INPUT_TENSORS set to OFF is not yet supported in " + "the decoupled API.")); + } force_cpu_only_input_tensors_ = false; LOG_MESSAGE( TRITONSERVER_LOG_INFO, @@ -2109,18 +2127,6 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) } } - triton::common::TritonJson::Value model_transaction_policy; - if (model_config_.Find( - "model_transaction_policy", &model_transaction_policy)) { - triton::common::TritonJson::Value decoupled; - if (model_transaction_policy.Find("decoupled", &decoupled)) { - auto error = decoupled.AsBool(&decoupled_); - if (error != nullptr) { - throw BackendModelException(error); - } - } - } - if (artifact_type != TRITONBACKEND_ARTIFACT_FILESYSTEM) { throw BackendModelException(TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_UNSUPPORTED, From 6edcdd8c1af544bc91084850a0c1da5ba8820fa7 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Wed, 1 Jun 2022 12:18:04 -0400 Subject: [PATCH 040/216] Allow FORCE_CPU_ONLY_INPUT to be off for decoupled (#158) --- src/python.cc | 6 ------ src/response_sender.cc | 11 ----------- 2 files changed, 17 deletions(-) diff --git a/src/python.cc b/src/python.cc index 0e38917b..f489c510 100644 --- a/src/python.cc +++ b/src/python.cc @@ -2102,12 +2102,6 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) TRITONSERVER_LOG_INFO, (std::string("Forcing CPU only input tensors.")).c_str()); } else if (force_cpu_only_input_tensor == "no") { - if (decoupled_) { - throw BackendModelException(TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_UNSUPPORTED, - "FORCE_CPU_ONLY_INPUT_TENSORS set to OFF is not yet supported in " - "the decoupled API.")); - } force_cpu_only_input_tensors_ = false; LOG_MESSAGE( TRITONSERVER_LOG_INFO, diff --git a/src/response_sender.cc b/src/response_sender.cc index d4081c8c..e8394df9 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -68,17 
+68,6 @@ ResponseSender::Send( "set to zero."); } - if (infer_response) { - for (auto& tensor : infer_response->OutputTensors()) { - if (!tensor->IsCPU()) { - throw PythonBackendException( - "Tensor '" + tensor->Name() + - "' is stored in GPU. GPU tensors are not supported yet in the " - "decoupled response sender."); - } - } - } - std::unique_ptr& stub = Stub::GetOrCreateInstance(); AllocatedSharedMemory response_send_message = From 03914b23494076b1bbac4be33d248e2c7c1190df Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Wed, 1 Jun 2022 12:31:07 -0400 Subject: [PATCH 041/216] Disable async BLS support in decoupled models (#157) --- src/pb_stub.cc | 14 ++++++-------- src/python.cc | 1 - src/request_executor.h | 1 + 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/pb_stub.cc b/src/pb_stub.cc index a57503b7..6e8a766a 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -459,7 +459,6 @@ Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr) bool has_exception = false; std::string error_string; std::unique_ptr error_string_shm; - bool is_coroutine; ScopedDefer execute_finalize([this] { stub_message_queue_->Pop(); }); ScopedDefer _( @@ -480,13 +479,6 @@ Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr) py::object execute_return = model_instance_.attr("execute")(py_request_list); - py::module asyncio = py::module::import("asyncio"); - - is_coroutine = asyncio.attr("iscoroutine")(execute_return).cast(); - if (is_coroutine) { - execute_return = asyncio.attr("run")(execute_return); - } - if (!py::isinstance(execute_return)) { throw PythonBackendException( "Python model '" + model_instance_name_ + @@ -748,6 +740,12 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) .def( "async_exec", [](std::shared_ptr& infer_request) { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + if (stub->IsDecoupled()) { + throw PythonBackendException( + "Async BLS request execution is not support in the decoupled " + "API."); + } py::object loop = py::module_::import("asyncio").attr("get_running_loop")(); py::cpp_function callback = [infer_request]() { diff --git a/src/python.cc b/src/python.cc index f489c510..11b97fec 100644 --- a/src/python.cc +++ b/src/python.cc @@ -1714,7 +1714,6 @@ ModelInstanceState::ProcessRequests( requests, request_count); } - ScopedDefer execute_finalize([this, &restart] { // Push a dummy message to the message queue so that // the stub process is notified that it can release diff --git a/src/request_executor.h b/src/request_executor.h index bf2fd4a4..9dab7609 100644 --- a/src/request_executor.h +++ b/src/request_executor.h @@ -41,6 +41,7 @@ class RequestExecutor { std::unique_ptr Infer( const std::shared_ptr& infer_request, TRITONSERVER_InferenceResponse** response); + RequestExecutor( std::unique_ptr& shm_pool, TRITONSERVER_Server* server); From b521096cf2dd60ee88704d8bdff0f02d4d501ea2 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Wed, 1 Jun 2022 14:13:37 -0700 Subject: [PATCH 042/216] Python models should have API where they can provide max_batch/inputs/outputs for auto-complete (#138) * Add api for python backend * Update max_batch_size in model config * Add objects and functions in utils.py * Use the second interpreter to retrieve model config * Update comment * Add documentation * Add checking and auto-complete logic * Fix comment * Address review * Fix restart * Address review * Fix auto-complete example * Fix for ci test * Address review * Rename functions * Fix client name * Address review * Address review * Fix up * Add 
more information to max_batch_size error message * Address review * Resolve conflicts * Remove config file * Call auto_complete_config without instantiating object * Address review * Fix up * Address review --- CMakeLists.txt | 18 +- README.md | 112 ++- examples/auto_complete/batch_model.py | 222 +++++ examples/auto_complete/client.py | 130 +++ examples/auto_complete/nobatch_model.py | 222 +++++ src/ipc_message.h | 4 +- src/memory_manager.h | 2 + src/pb_map.h | 2 + src/pb_stub.cc | 170 +++- src/pb_stub.h | 10 +- src/pb_utils.h | 13 + src/{python.cc => python_be.cc} | 939 +++---------------- src/python_be.h | 356 +++++++ src/request_executor.h | 4 +- src/resources/triton_python_backend_utils.py | 179 +++- src/stub_launcher.cc | 522 +++++++++++ src/stub_launcher.h | 172 ++++ 17 files changed, 2259 insertions(+), 818 deletions(-) create mode 100644 examples/auto_complete/batch_model.py create mode 100644 examples/auto_complete/client.py create mode 100644 examples/auto_complete/nobatch_model.py rename src/{python.cc => python_be.cc} (65%) create mode 100644 src/python_be.h create mode 100644 src/stub_launcher.cc create mode 100644 src/stub_launcher.h diff --git a/CMakeLists.txt b/CMakeLists.txt index e1ea9963..19631e6a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -155,8 +155,9 @@ set( ) set( - PYTHNON_BACKEND_SRCS - src/python.cc + PYTHON_BACKEND_SRCS + src/python_be.cc + src/python_be.h src/pb_env.cc src/pb_env.h src/pb_metric_reporter.cc @@ -165,20 +166,22 @@ set( src/memory_manager.h src/request_executor.cc src/request_executor.h + src/stub_launcher.h + src/stub_launcher.cc ) list(APPEND - PYTHNON_BACKEND_SRCS + PYTHON_BACKEND_SRCS ${COMMON_SRCS} ) add_library( triton-python-backend SHARED - ${PYTHNON_BACKEND_SRCS} + ${PYTHON_BACKEND_SRCS} ) set( - PYTHNON_BACKEND_STUB_SRCS + PYTHON_BACKEND_STUB_SRCS src/pb_stub_utils.h src/pb_stub_utils.cc src/response_sender.cc @@ -188,13 +191,13 @@ set( ) list(APPEND - PYTHNON_BACKEND_STUB_SRCS + PYTHON_BACKEND_STUB_SRCS ${COMMON_SRCS} ) add_executable( triton-python-backend-stub - ${PYTHNON_BACKEND_STUB_SRCS} + ${PYTHON_BACKEND_STUB_SRCS} ) add_dependencies(triton-python-backend boostorg) @@ -241,7 +244,6 @@ target_link_libraries( Threads::Threads triton-backend-utils # from repo-backend pybind11::embed - dlpack -lrt # shared memory -larchive # libarchive ) diff --git a/README.md b/README.md index 77abf8b7..4eb87d6f 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ any C++ code. - [Quick Start](#quick-start) - [Building from Source](#building-from-source) - [Usage](#usage) + - [`auto_complete_config`](#auto_complete_config) - [`initialize`](#initialize) - [`execute`](#execute) - [Default Mode](#default-mode) @@ -192,6 +193,85 @@ class TritonPythonModel: that is created must have "TritonPythonModel" as the class name. """ + @staticmethod + def auto_complete_config(auto_complete_model_config): + """`auto_complete_config` is called only once when the server is started + with `--strict-model-config=false`. Implementing this function is optional. + A no implementation of `auto_complete_config` will do nothing. This function + can be used to set `max_batch_size`, `input` and `output` properties of the + model using `set_max_batch_size`, `add_input`, and `add_output`. + These properties will allow Triton to load the model with minimal model + configuration in absence of a configuration file. This function returns the + `pb_utils.ModelConfig` object with these properties. 
You can use `as_dict` + function to gain read-only access to the `pb_utils.ModelConfig` object. + The `pb_utils.ModelConfig` object being returned from here will be used as + the final configuration for the model. + + Note: The Python interpreter used to invoke this function will be destroyed + upon returning from this function and as a result none of the objects created + here will be available in the `initialize`, `execute`, or `finalize` functions. + + Parameters + ---------- + auto_complete_model_config : pb_utils.ModelConfig + An object containing the existing model configuration. You can build upon + the configuration given by this object when setting the properties for + this model. + + Returns + ------- + pb_utils.ModelConfig + An object containing the auto-completed model configuration + """ + inputs = [{ + 'name': 'INPUT0', + 'data_type': 'TYPE_FP32', + 'dims': [4] + }, { + 'name': 'INPUT1', + 'data_type': 'TYPE_FP32', + 'dims': [4] + }] + outputs = [{ + 'name': 'OUTPUT0', + 'data_type': 'TYPE_FP32', + 'dims': [4] + }, { + 'name': 'OUTPUT1', + 'data_type': 'TYPE_FP32', + 'dims': [4] + }] + + # Demonstrate the usage of `as_dict`, `add_input`, `add_output`, + # and `set_max_batch_size` functions. + # Store the model configuration as a dictionary. + config = auto_complete_model_config.as_dict() + input_names = [] + output_names = [] + for input in config['input']: + input_names.append(input['name']) + for output in config['output']: + output_names.append(output['name']) + + for input in inputs: + # The name checking here is only for demonstrating the usage of + # `as_dict` function. `add_input` will check for conflicts and + # raise errors if an input with the same name already exists in + # the configuration but has different data_type or dims property. + if input['name'] not in input_names: + auto_complete_model_config.add_input(input) + for output in outputs: + # The name checking here is only for demonstrating the usage of + # `as_dict` function. `add_output` will check for conflicts and + # raise errors if an output with the same name already exists in + # the configuration but has different data_type or dims property. + if output['name'] not in output_names: + auto_complete_model_config.add_output(output) + + auto_complete_model_config.set_max_batch_size(0) + + return auto_complete_model_config + def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows @@ -252,7 +332,35 @@ class TritonPythonModel: ``` -Every Python backend can implement three main functions: +Every Python backend can implement four main functions: + +### `auto_complete_config` + +`auto_complete_config` is called only once when the server is started +with [`--strict-model-config=false`]( + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#auto-generated-model-configuration). +Implementing this function is optional. A no implementation of +`auto_complete_config` will do nothing. This function can be used to set +[`max_batch_size`]( + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#maximum-batch-size), +[`input`]( + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#inputs-and-outputs) and +[`output`]( + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#inputs-and-outputs) +properties of the model using `set_max_batch_size`, `add_input`, and +`add_output`. 
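For quick reference, a condensed sketch of just these three calls is shown below; the tensor names, data types, and dims are placeholders, and the fuller example earlier in this README also demonstrates the `as_dict` based conflict checks.

```python
# Condensed sketch of the three calls named above. Tensor names, data types,
# and dims are placeholders; the rest of the model (initialize/execute) is
# unchanged.
class TritonPythonModel:

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        auto_complete_model_config.set_max_batch_size(8)
        auto_complete_model_config.add_input(
            {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]})
        auto_complete_model_config.add_output(
            {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]})
        return auto_complete_model_config
```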
These properties will allow Triton to load the model with +[minimal model configuration]( + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#minimal-model-configuration) +in absence of a configuration file. This function returns the +`pb_utils.ModelConfig` object with these properties. You can use `as_dict` +function to gain read-only access to the `pb_utils.ModelConfig` object. +The `pb_utils.ModelConfig` object being returned from here will be used as the +final configuration for the model. + +Note: The Python interpreter used to invoke this function will be destroyed +upon returning from this function and as a result none of the objects +created here will be available in the `initialize`, `execute`, or `finalize` +functions. ### `initialize` @@ -413,7 +521,7 @@ from below known issues: Implementing `finalize` is optional. This function allows you to do any clean ups necessary before the model is unloaded from Triton server. -You can look at the [add_sub example](examples/add_sub.py) which contains +You can look at the [add_sub example](examples/add_sub/model.py) which contains a complete example of implementing all these functions for a Python model that adds and subtracts the inputs given to it. After implementing all the necessary functions, you should save this file as `model.py`. diff --git a/examples/auto_complete/batch_model.py b/examples/auto_complete/batch_model.py new file mode 100644 index 00000000..20156420 --- /dev/null +++ b/examples/auto_complete/batch_model.py @@ -0,0 +1,222 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Your Python model must use the same class name. 
Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + @staticmethod + def auto_complete_config(auto_complete_model_config): + """`auto_complete_config` is called only once when the server is + started with `--strict-model-config=false`. Implementing this + function is optional. A no implementation of `auto_complete_config` + will do nothing. This function can be used to set `max_batch_size`, + `input` and `output` properties of the model using + `set_max_batch_size`, `add_input`, and `add_output`. These properties + will allow Triton to load the model with minimal model configuration + in absence of a configuration file. This function returns the + `pb_utils.ModelConfig` object with these properties. You can use + `as_dict` function to gain read-only access to the + `pb_utils.ModelConfig` object. The `pb_utils.ModelConfig` object being + returned from here will be used as the final configuration for the + model. + + Note: The Python interpreter used to invoke this function will be + destroyed upon returning from this function and as a result none of + the objects created here will be available in the `initialize`, + `execute`, or `finalize` functions. + + Parameters + ---------- + auto_complete_model_config : pb_utils.ModelConfig + An object containing the existing model configuration. You can build + upon the configuration given by this object when setting the + properties for this model. + + Returns + ------- + pb_utils.ModelConfig + An object containing the auto-completed model configuration + """ + inputs = [{ + 'name': 'INPUT0', + 'data_type': 'TYPE_FP32', + 'dims': [4] + }, { + 'name': 'INPUT1', + 'data_type': 'TYPE_FP32', + 'dims': [4] + }] + outputs = [{ + 'name': 'OUTPUT0', + 'data_type': 'TYPE_FP32', + 'dims': [4] + }, { + 'name': 'OUTPUT1', + 'data_type': 'TYPE_FP32', + 'dims': [4] + }] + + # Demonstrate the usage of `as_dict`, `add_input`, `add_output`, + # and `set_max_batch_size` functions. + # Store the model configuration as a dictionary. + config = auto_complete_model_config.as_dict() + input_names = [] + output_names = [] + for input in config['input']: + input_names.append(input['name']) + for output in config['output']: + output_names.append(output['name']) + + for input in inputs: + # The name checking here is only for demonstrating the usage of + # `as_dict` function. `add_input` will check for conflicts and + # raise errors if an input with the same name already exists in + # the configuration but has different data_type or dims property. + if input['name'] not in input_names: + auto_complete_model_config.add_input(input) + for output in outputs: + # The name checking here is only for demonstrating the usage of + # `as_dict` function. `add_output` will check for conflicts and + # raise errors if an output with the same name already exists in + # the configuration but has different data_type or dims property. + if output['name'] not in output_names: + auto_complete_model_config.add_output(output) + + auto_complete_model_config.set_max_batch_size(4) + + return auto_complete_model_config + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to intialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + # You must parse model_config. JSON string is not parsed here + self.model_config = model_config = json.loads(args['model_config']) + + # Get OUTPUT0 configuration + output0_config = pb_utils.get_output_config_by_name( + model_config, "OUTPUT0") + + # Get OUTPUT1 configuration + output1_config = pb_utils.get_output_config_by_name( + model_config, "OUTPUT1") + + # Convert Triton types to numpy types + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config['data_type']) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config['data_type']) + + def execute(self, requests): + """`execute` MUST be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for request in requests: + # Get INPUT0 + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + # Get INPUT1 + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy()) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. + out_tensor_0 = pb_utils.Tensor("OUTPUT0", + out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", + out_1.astype(output1_dtype)) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occured")) + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1]) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. 
+ """ + print('Cleaning up...') diff --git a/examples/auto_complete/client.py b/examples/auto_complete/client.py new file mode 100644 index 00000000..9ddf5d32 --- /dev/null +++ b/examples/auto_complete/client.py @@ -0,0 +1,130 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from tritonclient.utils import * +import tritonclient.http as httpclient +import sys + +import numpy as np + +nobatch_model_name = "nobatch_auto_complete" +batch_model_name = "batch_auto_complete" +nobatch_shape = [4] +batch_shape = [1, 4] + + +def validate_ios(config, expected_ios, model_name): + for io in config: + for expected_io in expected_ios: + if io["name"] == expected_io["name"]: + if io["data_type"] != expected_io["data_type"]: + print("model '" + model_name + "' has unexpected data_type") + sys.exit(1) + elif io["dims"] != expected_io["dims"]: + print("model '" + model_name + "' has unexpected dims") + sys.exit(1) + + +if __name__ == '__main__': + with httpclient.InferenceServerClient("localhost:8000") as client: + expected_max_batch_size = { + "nobatch_auto_complete": 0, + "batch_auto_complete": 4 + } + expected_inputs = [{ + 'name': 'INPUT0', + 'data_type': 'TYPE_FP32', + 'dims': [4] + }, { + 'name': 'INPUT1', + 'data_type': 'TYPE_FP32', + 'dims': [4] + }] + expected_outputs = [{ + 'name': 'OUTPUT0', + 'data_type': 'TYPE_FP32', + 'dims': [4] + }, { + 'name': 'OUTPUT1', + 'data_type': 'TYPE_FP32', + 'dims': [4] + }] + models = [nobatch_model_name, batch_model_name] + shapes = [nobatch_shape, batch_shape] + + for model_name, shape in zip(models, shapes): + # Validate the auto-complete model configuration + model_config = client.get_model_config(model_name) + if model_config["max_batch_size"] != expected_max_batch_size[ + model_name]: + print("model '" + model_name + + "' has unexpected max_batch_size") + sys.exit(1) + validate_ios(model_config["input"], expected_inputs, model_name) + validate_ios(model_config["output"], expected_outputs, model_name) + + input0_data = np.random.rand(*shape).astype(np.float32) + input1_data = 
np.random.rand(*shape).astype(np.float32) + inputs = [ + httpclient.InferInput("INPUT0", input0_data.shape, + np_to_triton_dtype(input0_data.dtype)), + httpclient.InferInput("INPUT1", input1_data.shape, + np_to_triton_dtype(input1_data.dtype)), + ] + + inputs[0].set_data_from_numpy(input0_data) + inputs[1].set_data_from_numpy(input1_data) + + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0"), + httpclient.InferRequestedOutput("OUTPUT1"), + ] + + response = client.infer(model_name, + inputs, + request_id=str(1), + outputs=outputs) + + result = response.get_response() + output0_data = response.as_numpy("OUTPUT0") + output1_data = response.as_numpy("OUTPUT1") + + print("INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output0_data)) + print("INPUT0 ({}) - INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output1_data)) + + if not np.allclose(input0_data + input1_data, output0_data): + print("auto_complete example error: incorrect sum") + sys.exit(1) + + if not np.allclose(input0_data - input1_data, output1_data): + print("auto_complete example error: incorrect difference") + sys.exit(1) + + print('PASS: auto_complete') + + sys.exit(0) diff --git a/examples/auto_complete/nobatch_model.py b/examples/auto_complete/nobatch_model.py new file mode 100644 index 00000000..757561d1 --- /dev/null +++ b/examples/auto_complete/nobatch_model.py @@ -0,0 +1,222 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. 
+ """ + + @staticmethod + def auto_complete_config(auto_complete_model_config): + """`auto_complete_config` is called only once when the server is + started with `--strict-model-config=false`. Implementing this + function is optional. A no implementation of `auto_complete_config` + will do nothing. This function can be used to set `max_batch_size`, + `input` and `output` properties of the model using + `set_max_batch_size`, `add_input`, and `add_output`. These properties + will allow Triton to load the model with minimal model configuration + in absence of a configuration file. This function returns the + `pb_utils.ModelConfig` object with these properties. You can use + `as_dict` function to gain read-only access to the + `pb_utils.ModelConfig` object. The `pb_utils.ModelConfig` object being + returned from here will be used as the final configuration for the + model. + + Note: The Python interpreter used to invoke this function will be + destroyed upon returning from this function and as a result none of + the objects created here will be available in the `initialize`, + `execute`, or `finalize` functions. + + Parameters + ---------- + auto_complete_model_config : pb_utils.ModelConfig + An object containing the existing model configuration. You can build + upon the configuration given by this object when setting the + properties for this model. + + Returns + ------- + pb_utils.ModelConfig + An object containing the auto-completed model configuration + """ + inputs = [{ + 'name': 'INPUT0', + 'data_type': 'TYPE_FP32', + 'dims': [4] + }, { + 'name': 'INPUT1', + 'data_type': 'TYPE_FP32', + 'dims': [4] + }] + outputs = [{ + 'name': 'OUTPUT0', + 'data_type': 'TYPE_FP32', + 'dims': [4] + }, { + 'name': 'OUTPUT1', + 'data_type': 'TYPE_FP32', + 'dims': [4] + }] + + # Demonstrate the usage of `as_dict`, `add_input`, `add_output`, + # and `set_max_batch_size` functions. + # Store the model configuration as a dictionary. + config = auto_complete_model_config.as_dict() + input_names = [] + output_names = [] + for input in config['input']: + input_names.append(input['name']) + for output in config['output']: + output_names.append(output['name']) + + for input in inputs: + # The name checking here is only for demonstrating the usage of + # `as_dict` function. `add_input` will check for conflicts and + # raise errors if an input with the same name already exists in + # the configuration but has different data_type or dims property. + if input['name'] not in input_names: + auto_complete_model_config.add_input(input) + for output in outputs: + # The name checking here is only for demonstrating the usage of + # `as_dict` function. `add_output` will check for conflicts and + # raise errors if an output with the same name already exists in + # the configuration but has different data_type or dims property. + if output['name'] not in output_names: + auto_complete_model_config.add_output(output) + + auto_complete_model_config.set_max_batch_size(0) + + return auto_complete_model_config + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to intialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + # You must parse model_config. JSON string is not parsed here + self.model_config = model_config = json.loads(args['model_config']) + + # Get OUTPUT0 configuration + output0_config = pb_utils.get_output_config_by_name( + model_config, "OUTPUT0") + + # Get OUTPUT1 configuration + output1_config = pb_utils.get_output_config_by_name( + model_config, "OUTPUT1") + + # Convert Triton types to numpy types + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config['data_type']) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config['data_type']) + + def execute(self, requests): + """`execute` MUST be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for request in requests: + # Get INPUT0 + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + # Get INPUT1 + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy()) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. + out_tensor_0 = pb_utils.Tensor("OUTPUT0", + out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", + out_1.astype(output1_dtype)) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occured")) + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1]) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. 
+ """ + print('Cleaning up...') diff --git a/src/ipc_message.h b/src/ipc_message.h index 7cf9ede0..3aad4904 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -46,7 +46,9 @@ typedef enum PYTHONSTUB_commandtype_enum { PYTHONSTUB_InferExecRequest, PYTHONSTUB_InferExecResponse, PYTHONSTUB_ResponseSend, - PYTHONSTUB_ResponseClose + PYTHONSTUB_ResponseClose, + PYTHONSTUB_AutoCompleteRequest, + PYTHONSTUB_AutoCompleteResponse } PYTHONSTUB_CommandType; /// diff --git a/src/memory_manager.h b/src/memory_manager.h index e463d191..7930d0e8 100644 --- a/src/memory_manager.h +++ b/src/memory_manager.h @@ -24,6 +24,8 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + #include #include #include diff --git a/src/pb_map.h b/src/pb_map.h index 0172e5f4..c4827b7c 100644 --- a/src/pb_map.h +++ b/src/pb_map.h @@ -24,6 +24,8 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + #include #include "pb_string.h" #include "shm_manager.h" diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 6e8a766a..a2c7cee0 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -37,12 +37,14 @@ #include #include #include +#include #include #include #include "infer_response.h" #include "pb_error.h" #include "pb_map.h" #include "pb_string.h" +#include "pb_utils.h" #include "response_sender.h" #include "scoped_defer.h" #include "shm_manager.h" @@ -60,7 +62,6 @@ namespace triton { namespace backend { namespace python { std::atomic non_graceful_exit = {false}; - void SignalHandler(int signum) { @@ -73,12 +74,12 @@ Stub::Instantiate( const std::string& shm_region_name, const std::string& model_path, const std::string& model_version, const std::string& triton_install_path, bi::managed_external_buffer::handle_t ipc_control_handle, - const std::string& model_instance_name) + const std::string& name) { model_path_ = model_path; model_version_ = model_version; triton_install_path_ = triton_install_path; - model_instance_name_ = model_instance_name; + name_ = name; health_mutex_ = nullptr; initialized_ = false; @@ -185,7 +186,7 @@ Stub::IsDecoupled() bool Stub::RunCommand() { - NVTX_RANGE(nvtx_, "RunCommand " + model_instance_name_); + NVTX_RANGE(nvtx_, "RunCommand " + name_); std::unique_ptr ipc_message; { // Release the GIL lock when waiting for new message. 
Without this line, the @@ -195,6 +196,73 @@ Stub::RunCommand() ipc_message = this->PopMessage(); } switch (ipc_message->Command()) { + case PYTHONSTUB_CommandType::PYTHONSTUB_AutoCompleteRequest: { + // Only run this case when Triton Server is started with + // '--strict-model-config=false' + bool has_exception = false; + std::string error_string; + std::string auto_complete_config; + + std::unique_ptr auto_complete_response_msg = + IPCMessage::Create(shm_pool_, false /* inline_response */); + auto_complete_response_msg->Command() = PYTHONSTUB_AutoCompleteResponse; + std::unique_ptr error_string_shm; + std::unique_ptr auto_complete_config_shm; + AllocatedSharedMemory auto_complete_response = + shm_pool_->Construct(); + + ScopedDefer receive_autocomplete_finalize( + [this] { stub_message_queue_->Pop(); }); + ScopedDefer _([this, &auto_complete_response_msg] { + SendIPCMessage(auto_complete_response_msg); + }); + + auto_complete_response.data_->response_has_error = false; + auto_complete_response.data_->response_is_error_set = false; + auto_complete_response.data_->response_has_model_config = false; + auto_complete_response_msg->Args() = auto_complete_response.handle_; + + try { + AutoCompleteModelConfig(ipc_message->Args(), &auto_complete_config); + } + catch (const PythonBackendException& pb_exception) { + has_exception = true; + error_string = pb_exception.what(); + } + catch (const py::error_already_set& error) { + has_exception = true; + error_string = error.what(); + } + + if (has_exception) { + // Do not delete the region. The region will be deleted by the parent + // process. + shm_pool_->SetDeleteRegion(false); + LOG_INFO << "Failed to initialize Python stub for auto-complete: " + << error_string; + auto_complete_response.data_->response_has_error = true; + auto_complete_response.data_->response_is_error_set = false; + + LOG_IF_EXCEPTION( + error_string_shm = PbString::Create(shm_pool_, error_string)); + if (error_string_shm != nullptr) { + auto_complete_response.data_->response_is_error_set = true; + auto_complete_response.data_->response_error = + error_string_shm->ShmHandle(); + } + + return true; // Terminate the stub process. + } else { + LOG_IF_EXCEPTION( + auto_complete_config_shm = + PbString::Create(shm_pool_, auto_complete_config)); + if (auto_complete_config_shm != nullptr) { + auto_complete_response.data_->response_has_model_config = true; + auto_complete_response.data_->response_model_config = + auto_complete_config_shm->ShmHandle(); + } + } + } break; case PYTHONSTUB_CommandType::PYTHONSTUB_InitializeRequest: { bool has_exception = false; std::string error_string; @@ -252,7 +320,6 @@ Stub::RunCommand() return true; // Terminate the stub process. } - } break; case PYTHONSTUB_CommandType::PYTHONSTUB_ExecuteRequest: { AllocatedSharedMemory request_batch = @@ -288,8 +355,8 @@ Stub::RunCommand() return false; } -void -Stub::Initialize(bi::managed_external_buffer::handle_t map_handle) +py::module +Stub::StubSetup() { py::module sys = py::module_::import("sys"); @@ -315,6 +382,69 @@ Stub::Initialize(bi::managed_external_buffer::handle_t map_handle) sys.attr("path").attr("append")(model_path_parent); sys.attr("path").attr("append")(model_path_parent_parent); sys.attr("path").attr("append")(python_backend_folder); + sys = py::module_::import( + (std::string(model_version_) + "." 
+ model_name_trimmed).c_str()); + + py::module python_backend_utils = + py::module_::import("triton_python_backend_utils"); + py::module c_python_backend_utils = + py::module_::import("c_python_backend_utils"); + py::setattr( + python_backend_utils, "TritonError", + c_python_backend_utils.attr("TritonError")); + py::setattr( + python_backend_utils, "TritonModelException", + c_python_backend_utils.attr("TritonModelException")); + py::setattr( + python_backend_utils, "Tensor", c_python_backend_utils.attr("Tensor")); + py::setattr( + python_backend_utils, "InferenceRequest", + c_python_backend_utils.attr("InferenceRequest")); + py::setattr( + python_backend_utils, "InferenceResponse", + c_python_backend_utils.attr("InferenceResponse")); + c_python_backend_utils.attr("shared_memory") = py::cast(shm_pool_.get()); + + deserialize_bytes_ = python_backend_utils.attr("deserialize_bytes_tensor"); + serialize_bytes_ = python_backend_utils.attr("serialize_byte_tensor"); + + return sys; +} + +void +Stub::AutoCompleteModelConfig( + bi::managed_external_buffer::handle_t string_handle, + std::string* auto_complete_config) +{ + py::module sys = StubSetup(); + + std::unique_ptr pb_string_shm = + PbString::LoadFromSharedMemory(shm_pool_, string_handle); + + py::module python_backend_utils = + py::module_::import("triton_python_backend_utils"); + py::object model_config = + python_backend_utils.attr("ModelConfig")(pb_string_shm->String()); + + if (py::hasattr(sys.attr("TritonPythonModel"), "auto_complete_config")) { + model_config = sys.attr("TritonPythonModel") + .attr("auto_complete_config")(model_config); + } + + if (!py::isinstance(model_config, python_backend_utils.attr("ModelConfig"))) { + throw PythonBackendException( + "auto_complete_config function in model '" + name_ + + "' must return a valid pb.ModelConfig object."); + } + py::module json = py::module_::import("json"); + (*auto_complete_config) = std::string( + py::str(json.attr("dumps")(model_config.attr("_model_config")))); +} + +void +Stub::Initialize(bi::managed_external_buffer::handle_t map_handle) +{ + py::module sys = StubSetup(); py::module python_backend_utils = py::module_::import("triton_python_backend_utils"); @@ -336,10 +466,7 @@ Stub::Initialize(bi::managed_external_buffer::handle_t map_handle) c_python_backend_utils.attr("InferenceResponse")); c_python_backend_utils.attr("shared_memory") = py::cast(shm_pool_.get()); - py::object TritonPythonModel = - py::module_::import( - (std::string(model_version_) + "." 
+ model_name_trimmed).c_str()) - .attr("TritonPythonModel"); + py::object TritonPythonModel = sys.attr("TritonPythonModel"); deserialize_bytes_ = python_backend_utils.attr("deserialize_bytes_tensor"); serialize_bytes_ = python_backend_utils.attr("serialize_byte_tensor"); model_instance_ = TritonPythonModel(); @@ -475,13 +602,13 @@ Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr) } { - NVTX_RANGE(nvtx_, "PyExecute " + model_instance_name_); + NVTX_RANGE(nvtx_, "PyExecute " + name_); py::object execute_return = model_instance_.attr("execute")(py_request_list); if (!py::isinstance(execute_return)) { throw PythonBackendException( - "Python model '" + model_instance_name_ + + "Python model '" + name_ + "' is using the decoupled mode and the execute function must " "return None."); } @@ -499,8 +626,8 @@ Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr) if (has_exception) { std::string err_message = std::string( - "Failed to process the request(s) for model '" + - model_instance_name_ + "', message: ") + + "Failed to process the request(s) for model '" + name_ + + "', message: ") + error_string; LOG_INFO << err_message.c_str(); response_batch_shm_ptr->has_error = true; @@ -570,7 +697,7 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) bool is_coroutine; { - NVTX_RANGE(nvtx_, "PyExecute " + model_instance_name_); + NVTX_RANGE(nvtx_, "PyExecute " + name_); execute_return = model_instance_.attr("execute")(request_list); is_coroutine = asyncio.attr("iscoroutine")(execute_return).cast(); } @@ -643,8 +770,8 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) if (has_exception) { std::string err_message = std::string( - "Failed to process the request(s) for model '" + - model_instance_name_ + "', message: ") + + "Failed to process the request(s) for model '" + name_ + + "', message: ") + error_string; LOG_INFO << err_message.c_str(); error_string_shm = PbString::Create(shm_pool_, error_string); @@ -691,6 +818,7 @@ Stub::~Stub() model_instance_ = py::none(); } + stub_instance_.reset(); stub_message_queue_.reset(); parent_message_queue_.reset(); memory_manager_message_queue_.reset(); @@ -843,14 +971,14 @@ main(int argc, char** argv) std::string model_version = model_path_tokens[model_path_tokens.size() - 2]; int64_t shm_growth_size = std::stoi(argv[4]); std::string triton_install_path = argv[6]; - std::string model_instance_name = argv[8]; + std::string name = argv[8]; std::unique_ptr& stub = Stub::GetOrCreateInstance(); try { stub->Instantiate( shm_growth_size, shm_default_size, shm_region_name, model_path, model_version, argv[6] /* triton install path */, - std::stoi(argv[7]) /* IPCControl handle */, model_instance_name); + std::stoi(argv[7]) /* IPCControl handle */, name); } catch (const PythonBackendException& pb_exception) { LOG_INFO << "Failed to preinitialize Python stub: " << pb_exception.what(); @@ -888,7 +1016,7 @@ main(int argc, char** argv) // The stub process will always keep listening for new notifications from the // parent process. After the notification is received the stub process will - // run the appropriate comamnd and wait for new notifications. + // run the appropriate command and wait for new notifications. bool finalize = false; while (true) { if (finalize) { diff --git a/src/pb_stub.h b/src/pb_stub.h index 312c7ca0..d0baf7cc 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -134,6 +134,14 @@ class Stub { /// Run a single command from the shared memory. 
bool RunCommand(); + /// Setup for the stub process + py::module StubSetup(); + + /// Set the model configuration for auto-complete + void AutoCompleteModelConfig( + bi::managed_external_buffer::handle_t string_handle, + std::string* auto_complete_config); + /// Initialize the user's Python code. void Initialize(bi::managed_external_buffer::handle_t map_handle); @@ -173,7 +181,7 @@ class Stub { bi::interprocess_mutex* health_mutex_; std::string model_path_; std::string model_version_; - std::string model_instance_name_; + std::string name_; std::string triton_install_path_; IPCControlShm* ipc_control_; std::unique_ptr shm_pool_; diff --git a/src/pb_utils.h b/src/pb_utils.h index 5af7a9dd..0c51615d 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -113,6 +113,19 @@ struct InitializeResponseShm { bi::managed_external_buffer::handle_t response_error; }; +struct AutoCompleteResponseShm { + // Indicates whether the response has an error or not. + bool response_has_error; + // Indicates whether the response error is set or not. + bool response_is_error_set; + // Contains the error message. + bi::managed_external_buffer::handle_t response_error; + // Indicates whether the response has model config or not. + bool response_has_model_config; + // Contains the model config + bi::managed_external_buffer::handle_t response_model_config; +}; + // Control data structure for the communication between the Python stub and the // main stub. struct IPCControlShm { diff --git a/src/python.cc b/src/python_be.cc similarity index 65% rename from src/python.cc rename to src/python_be.cc index 11b97fec..916be898 100644 --- a/src/python.cc +++ b/src/python_be.cc @@ -24,350 +24,15 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
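The stub-side plumbing added above (`Stub::AutoCompleteModelConfig` in pb_stub.cc plus the new `AutoCompleteResponseShm` message) only takes effect when the model's `TritonPythonModel` class defines an `auto_complete_config` function and returns a `ModelConfig` object from it. Below is a minimal sketch of that hook as it might appear in a model's `model.py`; the tensor names, datatypes, and dims are illustrative assumptions, not taken from this patch.

```
# model.py -- sketch of the auto_complete_config hook that
# Stub::AutoCompleteModelConfig invokes on the TritonPythonModel class.
# Tensor names, datatypes and dims here are illustrative assumptions.


class TritonPythonModel:

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        # Only max_batch_size, inputs and outputs may be adjusted here; the
        # stub rejects anything other than a ModelConfig return value.
        auto_complete_model_config.set_max_batch_size(8)
        auto_complete_model_config.add_input({
            "name": "INPUT0",
            "data_type": "TYPE_FP32",
            "dims": [4]
        })
        auto_complete_model_config.add_output({
            "name": "OUTPUT0",
            "data_type": "TYPE_FP32",
            "dims": [4]
        })
        return auto_complete_model_config
```

The returned object is serialized back to the parent process as JSON (`json.dumps(model_config._model_config)` in the stub) and later applied with `TRITONBACKEND_ModelSetConfig` in `ModelState::Create`.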
-#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "infer_request.h" -#include "infer_response.h" -#include "ipc_message.h" -#include "memory_manager.h" -#include "message_queue.h" -#include "pb_env.h" -#include "pb_map.h" -#include "pb_metric_reporter.h" -#include "pb_utils.h" -#include "request_executor.h" -#include "scoped_defer.h" -#include "shm_manager.h" -#include "triton/backend/backend_common.h" -#include "triton/backend/backend_input_collector.h" -#include "triton/backend/backend_memory.h" -#include "triton/backend/backend_model.h" -#include "triton/backend/backend_model_instance.h" -#include "triton/common/nvtx.h" -#include "triton/common/triton_json.h" -#include "triton/core/tritonbackend.h" -#include "triton/core/tritonserver.h" - -#define LOG_IF_EXCEPTION(X) \ - do { \ - try { \ - (X); \ - } \ - catch (const PythonBackendException& pb_exception) { \ - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, pb_exception.what()); \ - } \ - } while (false) - -#define RESPOND_ALL_AND_RETURN_IF_ERROR(RESPONSES, RESPONSES_COUNT, X) \ - do { \ - TRITONSERVER_Error* raasnie_err__ = (X); \ - if (raasnie_err__ != nullptr) { \ - for (size_t ridx = 0; ridx < RESPONSES_COUNT; ++ridx) { \ - if ((*RESPONSES)[ridx] != nullptr) { \ - LOG_IF_ERROR( \ - TRITONBACKEND_ResponseSend( \ - (*RESPONSES)[ridx], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ - raasnie_err__), \ - "failed to send error response"); \ - (*RESPONSES)[ridx] = nullptr; \ - } \ - } \ - TRITONSERVER_ErrorDelete(raasnie_err__); \ - return; \ - } \ - } while (false) - - -#define RESPOND_ALL_AND_RETURN_IF_EXCEPTION(RESPONSES, RESPONSES_COUNT, X) \ - do { \ - try { \ - (X); \ - } \ - catch (const PythonBackendException& exception) { \ - TRITONSERVER_Error* raarie_err__ = TRITONSERVER_ErrorNew( \ - TRITONSERVER_ERROR_INTERNAL, exception.what()); \ - for (size_t ridx = 0; ridx < RESPONSES_COUNT; ++ridx) { \ - if ((*RESPONSES)[ridx] != nullptr) { \ - LOG_IF_ERROR( \ - TRITONBACKEND_ResponseSend( \ - (*RESPONSES)[ridx], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ - raarie_err__), \ - "failed to send error response"); \ - (*RESPONSES)[ridx] = nullptr; \ - } \ - } \ - TRITONSERVER_ErrorDelete(raarie_err__); \ - return; \ - } \ - } while (false) - -#define RESPOND_AND_RETURN_IF_ERROR(REQUEST, X) \ - do { \ - TRITONSERVER_Error* rarie_err__ = (X); \ - if (rarie_err__ != nullptr) { \ - TRITONBACKEND_Response* rarie_response__ = nullptr; \ - LOG_IF_ERROR( \ - TRITONBACKEND_ResponseNew(&rarie_response__, REQUEST), \ - "failed to create response"); \ - if (rarie_response__ != nullptr) { \ - LOG_IF_ERROR( \ - TRITONBACKEND_ResponseSend( \ - rarie_response__, TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ - rarie_err__), \ - "failed to send error response"); \ - } \ - return rarie_err__; \ - } \ - } while (false) - -#define GUARDED_RESPOND_IF_ERROR(RESPONSES, IDX, X) \ - do { \ - if ((*RESPONSES)[IDX] != nullptr) { \ - TRITONSERVER_Error* err__ = (X); \ - if (err__ != nullptr) { \ - LOG_IF_ERROR( \ - TRITONBACKEND_ResponseSend( \ - (*RESPONSES)[IDX], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ - err__), \ - "failed to send error response"); \ - (*RESPONSES)[IDX] = nullptr; \ - TRITONSERVER_ErrorDelete(err__); \ - } \ - } \ - } while (false) - -#define GUARDED_RESPOND_IF_EXCEPTION(RESPONSES, IDX, X) \ - do { \ - if ((*RESPONSES)[IDX] != 
nullptr) { \ - try { \ - (X); \ - } \ - catch (const PythonBackendException& pb_exception) { \ - TRITONSERVER_Error* err__ = TRITONSERVER_ErrorNew( \ - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); \ - LOG_IF_ERROR( \ - TRITONBACKEND_ResponseSend( \ - (*RESPONSES)[IDX], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ - err__), \ - "failed to send error response"); \ - (*RESPONSES)[IDX] = nullptr; \ - TRITONSERVER_ErrorDelete(err__); \ - } \ - } \ - } while (false) - -#define RETURN_IF_EXCEPTION(X) \ - do { \ - try { \ - (X); \ - } \ - catch (const PythonBackendException& pb_exception) { \ - TRITONSERVER_Error* rarie_err__ = TRITONSERVER_ErrorNew( \ - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); \ - return rarie_err__; \ - } \ - } while (false) +#include "python_be.h" namespace triton { namespace backend { namespace python { namespace bi = boost::interprocess; -struct BackendState { - std::string python_lib; - int64_t shm_default_byte_size; - int64_t shm_growth_byte_size; - int64_t stub_timeout_seconds; - int64_t shm_message_queue_size; - std::atomic number_of_instance_inits; - std::string shared_memory_region_prefix; - int64_t thread_pool_size; - std::unique_ptr env_manager; -}; - -class ModelState : public BackendModel { - public: - static TRITONSERVER_Error* Create( - TRITONBACKEND_Model* triton_model, ModelState** state); - - // Get backend state - BackendState* StateForBackend() { return backend_state_; } - - // Get the Python execution environment - std::string PythonExecutionEnv() { return python_execution_env_; } - - // Force CPU only tensors - bool ForceCPUOnlyInputTensors() { return force_cpu_only_input_tensors_; } - - // Is decoupled API being used. - bool IsDecoupled() { return decoupled_; } - - private: - ModelState(TRITONBACKEND_Model* triton_model); - BackendState* backend_state_; - std::string python_execution_env_; - bool force_cpu_only_input_tensors_; - bool decoupled_; -}; - -class ModelInstanceState : public BackendModelInstance { - ModelInstanceState( - ModelState* model_state, TRITONBACKEND_ModelInstance* model_instance); - - TRITONBACKEND_Model* triton_model_; - bi::interprocess_mutex* health_mutex_; - std::unique_ptr> - stub_message_queue_; - std::unique_ptr> - parent_message_queue_; - std::unique_ptr memory_manager_; - std::string model_path_; - std::unique_ptr> - ipc_control_; - bi::managed_external_buffer::handle_t ipc_control_handle_; - std::vector> futures_; - std::vector bls_inference_responses_; - std::mutex bls_responses_mutex_; - std::unique_ptr shm_pool_; - std::string shm_region_name_; - std::vector closed_requests_; - std::mutex closed_requests_mutex_; - std::unique_ptr thread_pool_; - - // Stub process pid - pid_t stub_pid_; - - // Parent process pid - pid_t parent_pid_; - bool initialized_; - - // Path to python execution environment - std::string path_to_libpython_; - std::string path_to_activate_; - - // Decoupled monitor thread - std::thread decoupled_monitor_; - bool decoupled_thread_; - std::mutex mu_; - std::condition_variable cv_; - std::unique_ptr received_message_; - - public: - static TRITONSERVER_Error* Create( - ModelState* model_state, TRITONBACKEND_ModelInstance* model_instance, - ModelInstanceState** model_instance_state); - - ~ModelInstanceState(); - - // Create the stub process. 
- TRITONSERVER_Error* SetupStubProcess(); - TRITONSERVER_Error* SendMessageToStub(off_t message); - void ResponseSendDecoupled(std::shared_ptr response_send_message); - - // Checks whether the stub process is live - bool IsStubProcessAlive(); - - // Get a message from the stub process - TRITONSERVER_Error* ReceiveMessageFromStub(off_t& message); - - // Get a message from the stub process - void SendMessageAndReceiveResponse( - off_t message, off_t& response, bool& restart, - std::shared_ptr>& responses, - TRITONBACKEND_Request** requests, const uint32_t request_count); - - // Responds to all the requests with an error message. - void RespondErrorToAllRequests( - const char* message, - std::shared_ptr>& responses, - TRITONBACKEND_Request** requests, const uint32_t request_count); - - // In the decoupled mode, the parent message queue is monitored only by this - // function during the execute phase. No other thread should pop any message - // from the message queue in the decoupled mode. - void DecoupledMessageQueueMonitor(); - - // Kill stub process - void KillStubProcess(); - - // Start stub process - TRITONSERVER_Error* StartStubProcess(); - - // Convert TRITONBACKEND_Input to Python backend tensors. - TRITONSERVER_Error* GetInputTensor( - const uint32_t input_idx, std::shared_ptr& input_tensor, - TRITONBACKEND_Request* request, - std::shared_ptr>& responses); - - // Process all the requests obtained from Triton. - void ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count, - bool& restart); - - // Process all the requests in the decoupled mode. - TRITONSERVER_Error* ProcessRequestsDecoupled( - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector>& pb_infer_requests); - - bool ExistsInClosedRequests(intptr_t closed_request); - - // Execute a BLS Request - void ExecuteBLSRequest(std::shared_ptr ipc_message); - - // Cleanup BLS responses - void CleanupBLSResponses(); - - // Wait for BLS requests to complete - void WaitForBLSRequestsToFinish(); - - // Check the incoming requests for errors - TRITONSERVER_Error* CheckIncomingRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count, - size_t& total_batch_size); - - // Set error for response send message - void SetErrorForResponseSendMessage( - ResponseSendMessage* response_send_message, - std::shared_ptr error, - std::unique_ptr& error_message); - - TRITONSERVER_Error* SaveRequestsToSharedMemory( - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector>& pb_inference_requests, - AllocatedSharedMemory& request_batch, - std::shared_ptr>& responses); -}; - ModelInstanceState::ModelInstanceState( ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) - : BackendModelInstance(model_state, triton_model_instance), stub_pid_(0), - initialized_(false) + : BackendModelInstance(model_state, triton_model_instance) { } @@ -388,15 +53,6 @@ ModelInstanceState::Create( return nullptr; } -void -ModelInstanceState::KillStubProcess() -{ - kill(stub_pid_, SIGKILL); - int status; - waitpid(stub_pid_, &status, 0); - stub_pid_ = 0; -} - TRITONSERVER_Error* ModelInstanceState::CheckIncomingRequests( TRITONBACKEND_Request** requests, const uint32_t request_count, @@ -408,7 +64,6 @@ ModelInstanceState::CheckIncomingRequests( // For each request collect the total batch size for this inference // execution. The batch-size, number of inputs, and size of each // input has already been checked so don't need to do that here. 
- total_batch_size = 0; for (size_t i = 0; i < request_count; i++) { // If we get a nullptr request then something is badly wrong. Fail @@ -483,8 +138,8 @@ ModelInstanceState::SetErrorForResponseSendMessage( if (error && *error != nullptr) { response_send_message->has_error = true; LOG_IF_EXCEPTION( - error_message = - PbString::Create(shm_pool_, TRITONSERVER_ErrorMessage(*error))); + error_message = PbString::Create( + Stub()->ShmPool(), TRITONSERVER_ErrorMessage(*error))); response_send_message->error = error_message->ShmHandle(); response_send_message->is_error_set = true; } @@ -531,11 +186,12 @@ ModelInstanceState::SendMessageToStub( boost::get_system_time() + boost::posix_time::milliseconds(timeout_miliseconds); - bi::scoped_lock lock(*health_mutex_, timeout); + bi::scoped_lock lock( + *(Stub()->HealthMutex()), timeout); // Check if lock has been acquired. if (lock) { - ipc_control_->stub_health = false; + Stub()->IpcControl()->stub_health = false; } else { // If it failed to obtain the lock, it means that the stub has been // stuck or exited while holding the health mutex lock. @@ -544,7 +200,7 @@ ModelInstanceState::SendMessageToStub( } } - stub_message_queue_->Push( + Stub()->StubMessageQueue()->Push( message, timeout_miliseconds /* duration ms */, success); if (!success && !IsStubProcessAlive()) { @@ -568,11 +224,12 @@ ModelInstanceState::ReceiveMessageFromStub( boost::get_system_time() + boost::posix_time::milliseconds(timeout_miliseconds); - bi::scoped_lock lock(*health_mutex_, timeout); + bi::scoped_lock lock( + *Stub()->HealthMutex(), timeout); // Check if lock has been acquired. if (lock) { - ipc_control_->stub_health = false; + Stub()->IpcControl()->stub_health = false; } else { // If it failed to obtain the lock, it means that the stub has been // stuck or exited while holding the health mutex lock. @@ -581,7 +238,7 @@ ModelInstanceState::ReceiveMessageFromStub( } } - message = parent_message_queue_->Pop( + message = Stub()->ParentMessageQueue()->Pop( timeout_miliseconds /* duration ms */, success); if (!success && !IsStubProcessAlive()) { @@ -632,11 +289,11 @@ ModelInstanceState::IsStubProcessAlive() { boost::posix_time::ptime timeout = boost::get_system_time() + boost::posix_time::seconds(1); - bi::scoped_lock lock(*health_mutex_, timeout); + bi::scoped_lock lock(*Stub()->HealthMutex(), timeout); // Check if lock has been acquired. if (lock) { - return ipc_control_->stub_health; + return Stub()->IpcControl()->stub_health; } else { // If It failed to obtain the lock, it means that the stub has been // stuck or exited while holding the health mutex lock. @@ -644,257 +301,6 @@ ModelInstanceState::IsStubProcessAlive() } } -TRITONSERVER_Error* -ModelInstanceState::StartStubProcess() -{ - // Destruct any in-use shared memory object before starting the stub process. - ipc_control_ = nullptr; - stub_message_queue_ = nullptr; - parent_message_queue_ = nullptr; - memory_manager_ = nullptr; - ModelState* model_state = reinterpret_cast(Model()); - thread_pool_ = std::make_unique( - model_state->StateForBackend()->thread_pool_size); - int64_t shm_default_size = - model_state->StateForBackend()->shm_default_byte_size; - int64_t shm_growth_byte_size = - model_state->StateForBackend()->shm_growth_byte_size; - - try { - // It is necessary for restart to make sure that the previous shared memory - // pool is destructed before the new pool is created. 
- shm_pool_ = nullptr; - shm_pool_ = std::make_unique( - shm_region_name_, shm_default_size, shm_growth_byte_size, - true /* create */); - } - catch (const PythonBackendException& pb_exception) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); - } - - AllocatedSharedMemory ipc_control = - shm_pool_->Construct(); - ipc_control_ = std::move(ipc_control.data_); - ipc_control_handle_ = ipc_control.handle_; - - auto message_queue_size = - model_state->StateForBackend()->shm_message_queue_size; - - RETURN_IF_EXCEPTION( - stub_message_queue_ = - MessageQueue::Create( - shm_pool_, message_queue_size)); - RETURN_IF_EXCEPTION( - parent_message_queue_ = - MessageQueue::Create( - shm_pool_, message_queue_size)); - - std::unique_ptr> memory_manager_message_queue; - RETURN_IF_EXCEPTION( - memory_manager_message_queue = - MessageQueue::Create(shm_pool_, message_queue_size)); - - memory_manager_message_queue->ResetSemaphores(); - ipc_control_->memory_manager_message_queue = - memory_manager_message_queue->ShmHandle(); - ipc_control_->decoupled = model_state->IsDecoupled(); - - memory_manager_ = - std::make_unique(std::move(memory_manager_message_queue)); - ipc_control_->parent_message_queue = parent_message_queue_->ShmHandle(); - ipc_control_->stub_message_queue = stub_message_queue_->ShmHandle(); - - new (&(ipc_control_->stub_health_mutex)) bi::interprocess_mutex; - health_mutex_ = &(ipc_control_->stub_health_mutex); - - stub_message_queue_->ResetSemaphores(); - parent_message_queue_->ResetSemaphores(); - - std::string kind = TRITONSERVER_InstanceGroupKindString(kind_); - const char* model_path = model_state->RepositoryPath().c_str(); - - initialized_ = false; - - pid_t pid = fork(); - if (pid < 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, "Failed to fork the stub process."); - } - - // Stub process - if (pid == 0) { - const char* stub_args[4]; - stub_args[0] = "bash"; - stub_args[1] = "-c"; - stub_args[3] = nullptr; // Last argument must be nullptr - - // Default Python backend stub - std::string python_backend_stub = - model_state->StateForBackend()->python_lib + - "/triton_python_backend_stub"; - - // Path to alternative Python backend stub - std::string model_python_backend_stub = - std::string(model_path) + "/triton_python_backend_stub"; - - if (FileExists(model_python_backend_stub)) { - python_backend_stub = model_python_backend_stub; - } - - std::string bash_argument; - - // This shared memory variable indicates whether the stub process should - // revert the LD_LIBRARY_PATH changes to avoid shared library issues in - // executables and libraries. - ipc_control_->uses_env = false; - if (model_state->PythonExecutionEnv() != "") { - std::stringstream ss; - - // Need to properly set the LD_LIBRARY_PATH so that Python environments - // using different python versions load properly. 
- ss << "source " << path_to_activate_ - << " && exec env LD_LIBRARY_PATH=" << path_to_libpython_ - << ":$LD_LIBRARY_PATH " << python_backend_stub << " " << model_path_ - << " " << shm_region_name_ << " " << shm_default_size << " " - << shm_growth_byte_size << " " << parent_pid_ << " " - << model_state->StateForBackend()->python_lib << " " - << ipc_control_handle_ << " " << Name(); - ipc_control_->uses_env = true; - bash_argument = ss.str(); - } else { - std::stringstream ss; - ss << " exec " << python_backend_stub << " " << model_path_ << " " - << shm_region_name_ << " " << shm_default_size << " " - << shm_growth_byte_size << " " << parent_pid_ << " " - << model_state->StateForBackend()->python_lib << " " - << ipc_control_handle_ << " " << Name(); - bash_argument = ss.str(); - } - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("Starting Python backend stub: ") + bash_argument) - .c_str()); - - stub_args[2] = bash_argument.c_str(); - - int stub_status_code = - system((python_backend_stub + "> /dev/null 2>&1").c_str()); - - // If running stub process without any arguments returns any status code, - // other than 1, it can indicate a permission issue as a result of - // downloading the stub process from a cloud object storage service. - if (WEXITSTATUS(stub_status_code) != 1) { - // Give the execute permission for the triton_python_backend_stub to the - // owner. - int error = chmod(python_backend_stub.c_str(), S_IXUSR); - if (error != 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("Failed to give execute permission to " - "triton_python_backend_stub in ") + - python_backend_stub + " " + Name() + - " Error No.: " + std::to_string(error)) - .c_str()); - } - } - - if (execvp("bash", (char**)stub_args) != 0) { - std::stringstream ss; - ss << "Failed to run python backend stub. Errno = " << errno << '\n' - << "Python backend stub path: " << python_backend_stub << '\n' - << "Shared Memory Region Name: " << shm_region_name_ << '\n' - << "Shared Memory Default Byte Size: " << shm_default_size << '\n' - << "Shared Memory Growth Byte Size: " << shm_growth_byte_size << '\n'; - std::string log_message = ss.str(); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, log_message.c_str()); - - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("Failed to initialize model instance ") + Name()) - .c_str()); - } - } else { - ScopedDefer _([this] { - // Push a dummy message to the message queue so that the stub - // process is notified that it can release the object stored in - // shared memory. - stub_message_queue_->Push(DUMMY_MESSAGE); - - // If the model is not initialized, wait for the stub process to exit. 
- if (!initialized_) { - int status; - stub_message_queue_.reset(); - parent_message_queue_.reset(); - memory_manager_.reset(); - waitpid(stub_pid_, &status, 0); - } - }); - - stub_pid_ = pid; - triton::common::TritonJson::WriteBuffer buffer; - Model()->ModelConfig().Write(&buffer); - - std::unordered_map initialize_map = { - {"model_config", buffer.MutableContents()}, - {"model_instance_kind", TRITONSERVER_InstanceGroupKindString(kind_)}, - {"model_instance_name", name_}, - {"model_instance_device_id", std::to_string(device_id_)}, - {"model_repository", model_state->RepositoryPath()}, - {"model_version", std::to_string(model_state->Version())}, - {"model_name", model_state->Name()}}; - - std::unique_ptr initialize_message = - IPCMessage::Create(shm_pool_, false /* inline_response */); - initialize_message->Command() = PYTHONSTUB_InitializeRequest; - - std::unique_ptr pb_map = PbMap::Create(shm_pool_, initialize_map); - bi::managed_external_buffer::handle_t initialize_map_handle = - pb_map->ShmHandle(); - - initialize_message->Args() = initialize_map_handle; - stub_message_queue_->Push(initialize_message->ShmHandle()); - - std::unique_ptr initialize_response_message = - IPCMessage::LoadFromSharedMemory( - shm_pool_, parent_message_queue_->Pop()); - - if (initialize_response_message->Command() != - PYTHONSTUB_InitializeResponse) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string( - "Received unexpected response from Python backend stub: ") + - name_) - .c_str()); - } - - auto initialize_response = - std::move((shm_pool_->Load( - initialize_response_message->Args()))) - .data_; - - if (initialize_response->response_has_error) { - if (initialize_response->response_is_error_set) { - std::unique_ptr error_message = - PbString::LoadFromSharedMemory( - shm_pool_, initialize_response->response_error); - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, error_message->String().c_str()); - } else { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("Initialize() failed for ") + model_state->Name()) - .c_str()); - } - } - - initialized_ = true; - } - - return nullptr; // success -} - TRITONSERVER_Error* ModelInstanceState::SaveRequestsToSharedMemory( TRITONBACKEND_Request** requests, const uint32_t request_count, @@ -907,7 +313,7 @@ ModelInstanceState::SaveRequestsToSharedMemory( ModelState* model_state = reinterpret_cast(Model()); RETURN_IF_EXCEPTION( - request_batch = shm_pool_->Construct( + request_batch = Stub()->ShmPool()->Construct( sizeof(RequestBatch) + request_count * sizeof(bi::managed_external_buffer::handle_t))); @@ -974,7 +380,7 @@ ModelInstanceState::SaveRequestsToSharedMemory( reinterpret_cast(request)); } - RETURN_IF_EXCEPTION(infer_request->SaveToSharedMemory(shm_pool_)); + RETURN_IF_EXCEPTION(infer_request->SaveToSharedMemory(Stub()->ShmPool())); requests_shm[r] = infer_request->ShmHandle(); pb_inference_requests.emplace_back(std::move(infer_request)); } @@ -983,72 +389,17 @@ ModelInstanceState::SaveRequestsToSharedMemory( } TRITONSERVER_Error* -ModelInstanceState::SetupStubProcess() +ModelInstanceState::LaunchStubProcess() { - std::string kind = TRITONSERVER_InstanceGroupKindString(kind_); ModelState* model_state = reinterpret_cast(Model()); + Stub() = std::make_unique( + "MODEL_INSTANCE_STUB", Name(), DeviceId(), + TRITONSERVER_InstanceGroupKindString(Kind())); + RETURN_IF_ERROR(Stub()->Initialize(model_state)); + RETURN_IF_ERROR(Stub()->Launch()); - // Increase the stub process count to avoid shared memory region 
name - // collision - model_state->StateForBackend()->number_of_instance_inits++; - shm_region_name_ = - model_state->StateForBackend()->shared_memory_region_prefix + - std::to_string(model_state->StateForBackend()->number_of_instance_inits); - - uint64_t model_version = model_state->Version(); - const char* model_path = model_state->RepositoryPath().c_str(); - - std::stringstream ss; - std::string artifact_name; - RETURN_IF_ERROR(model_state->ModelConfig().MemberAsString( - "default_model_filename", &artifact_name)); - ss << model_path << "/" << model_version << "/"; - - if (artifact_name.size() > 0) { - ss << artifact_name; - } else { - // Default artifact name. - ss << "model.py"; - } - - model_path_ = ss.str(); - struct stat buffer; - - // Check if model.py exists - if (stat(model_path_.c_str(), &buffer) != 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("model.py does not exist in the model repository path: " + model_path_) - .c_str()); - } - - // Path to the extracted Python env - std::string python_execution_env = ""; - if (model_state->PythonExecutionEnv() != "") { - try { - python_execution_env = - model_state->StateForBackend()->env_manager->ExtractIfNotExtracted( - model_state->PythonExecutionEnv()); - } - catch (PythonBackendException& pb_exception) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); - } - - path_to_activate_ = python_execution_env + "/bin/activate"; - path_to_libpython_ = python_execution_env + "/lib"; - if (python_execution_env.length() > 0 && !FileExists(path_to_activate_)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("Path ") + path_to_activate_ + - " does not exist. The Python environment should contain an " - "'activate' script.") - .c_str()); - } - } - - parent_pid_ = getpid(); - RETURN_IF_ERROR(StartStubProcess()); + thread_pool_ = std::make_unique( + model_state->StateForBackend()->thread_pool_size); if (model_state->IsDecoupled()) { decoupled_thread_ = true; @@ -1059,7 +410,6 @@ ModelInstanceState::SetupStubProcess() return nullptr; } - TRITONSERVER_Error* ModelInstanceState::GetInputTensor( const uint32_t input_idx, std::shared_ptr& input_tensor, @@ -1133,8 +483,8 @@ ModelInstanceState::GetInputTensor( input_dtype, TRITONSERVER_MEMORY_CPU /* memory_type */, 0 /* memory_type_id */, nullptr /* buffer ptr*/, input_byte_size, nullptr /* DLManagedTensor */); - RETURN_IF_EXCEPTION( - input_tensor->SaveToSharedMemory(shm_pool_, false /* copy_gpu */)); + RETURN_IF_EXCEPTION(input_tensor->SaveToSharedMemory( + Stub()->ShmPool(), false /* copy_gpu */)); char* input_buffer = reinterpret_cast(input_tensor->DataPtr()); if (collector) { @@ -1160,7 +510,6 @@ ModelInstanceState::GetInputTensor( input_name, nullptr, 0, alloc_perference, reinterpret_cast(&buffer), &input_byte_size, &src_memory_type, &src_memory_type_id)); - // If the tensor is using the cuda shared memory, we need to extract the // handle that was used to create the device pointer. 
This is because of a // limitation in the legacy CUDA IPC API that doesn't allow getting the @@ -1187,13 +536,13 @@ ModelInstanceState::GetInputTensor( RETURN_IF_ERROR(TRITONSERVER_BufferAttributesCudaIpcHandle( buffer_attributes, reinterpret_cast(&cuda_ipc_handle))); if (cuda_ipc_handle != nullptr) { - RETURN_IF_EXCEPTION( - input_tensor->SaveToSharedMemory(shm_pool_, false /* copy_gpu */)); + RETURN_IF_EXCEPTION(input_tensor->SaveToSharedMemory( + Stub()->ShmPool(), false /* copy_gpu */)); RETURN_IF_EXCEPTION( input_tensor->Memory()->SetCudaIpcHandle(cuda_ipc_handle)); } else { - RETURN_IF_EXCEPTION( - input_tensor->SaveToSharedMemory(shm_pool_, true /* copy_gpu */)); + RETURN_IF_EXCEPTION(input_tensor->SaveToSharedMemory( + Stub()->ShmPool(), true /* copy_gpu */)); } } else { void* dev_ptr; @@ -1222,13 +571,13 @@ ModelInstanceState::GetInputTensor( const_cast(dev_ptr), input_byte_size, nullptr /* DLManagedTensor */); - RETURN_IF_EXCEPTION( - input_tensor->SaveToSharedMemory(shm_pool_, true /* copy_gpu */)); + RETURN_IF_EXCEPTION(input_tensor->SaveToSharedMemory( + Stub()->ShmPool(), true /* copy_gpu */)); std::unique_ptr gpu_memory_record = std::make_unique(input_tensor->Memory()->DataPtr()); uint64_t memory_release_id = - memory_manager_->AddRecord(std::move(gpu_memory_record)); + Stub()->GetMemoryManager()->AddRecord(std::move(gpu_memory_record)); input_tensor->Memory()->SetMemoryReleaseId(memory_release_id); } #else @@ -1245,8 +594,8 @@ void ModelInstanceState::ExecuteBLSRequest(std::shared_ptr ipc_message) { ModelState* model_state = reinterpret_cast(Model()); - auto request_executor = - std::make_unique(shm_pool_, model_state->TritonServer()); + auto request_executor = std::make_unique( + Stub()->ShmPool(), model_state->TritonServer()); bool is_response_batch_set = false; std::unique_ptr infer_response; ResponseBatch* response_batch; @@ -1255,10 +604,11 @@ ModelInstanceState::ExecuteBLSRequest(std::shared_ptr ipc_message) std::unique_ptr bls_response; AllocatedSharedMemory response_batch_shm; try { - bls_response = IPCMessage::Create(shm_pool_, false /* inline_response */); + bls_response = + IPCMessage::Create(Stub()->ShmPool(), false /* inline_response */); AllocatedSharedMemory request_batch = - shm_pool_->Load(ipc_message->Args()); + Stub()->ShmPool()->Load(ipc_message->Args()); RequestBatch* request_batch_shm_ptr = reinterpret_cast(request_batch.data_.get()); @@ -1266,7 +616,7 @@ ModelInstanceState::ExecuteBLSRequest(std::shared_ptr ipc_message) ipc_message->ResponseHandle() = bls_response->ShmHandle(); // The response batch of the handle will contain a ResponseBatch - response_batch_shm = shm_pool_->Construct( + response_batch_shm = Stub()->ShmPool()->Construct( sizeof(ResponseBatch) + sizeof(bi::managed_external_buffer::handle_t)); response_batch = reinterpret_cast(response_batch_shm.data_.get()); @@ -1291,7 +641,7 @@ ModelInstanceState::ExecuteBLSRequest(std::shared_ptr ipc_message) reinterpret_cast( request_batch.data_.get() + sizeof(RequestBatch)); infer_request = InferRequest::LoadFromSharedMemory( - shm_pool_, *request_handle, false /* open_cuda_handle */); + Stub()->ShmPool(), *request_handle, false /* open_cuda_handle */); // If the BLS inputs are in GPU an additional round trip between the // stub process and the main process is required. 
The reason is that we @@ -1317,8 +667,8 @@ ModelInstanceState::ExecuteBLSRequest(std::shared_ptr ipc_message) break; } lbackend_memory.reset(backend_memory); - input_tensor->SetMemory(std::move( - PbMemory::Create(shm_pool_, std::move(lbackend_memory)))); + input_tensor->SetMemory(std::move(PbMemory::Create( + Stub()->ShmPool(), std::move(lbackend_memory)))); #endif // TRITON_ENABLE_GPU } } @@ -1334,9 +684,10 @@ ModelInstanceState::ExecuteBLSRequest(std::shared_ptr ipc_message) // waiting for a message from the parent process. if (has_gpu_tensor) { try { - gpu_handles = - shm_pool_->Construct( - gpu_buffers_count); + gpu_handles = Stub() + ->ShmPool() + ->Construct( + gpu_buffers_count); request_batch_shm_ptr->gpu_buffers_count = gpu_buffers_count; request_batch_shm_ptr->gpu_buffers_handle = gpu_handles.handle_; size_t i = 0; @@ -1362,7 +713,7 @@ ModelInstanceState::ExecuteBLSRequest(std::shared_ptr ipc_message) request_executor->Infer(infer_request, &inference_response); if (infer_response) { - infer_response->SaveToSharedMemory(shm_pool_); + infer_response->SaveToSharedMemory(Stub()->ShmPool()); for (auto& output_tensor : infer_response->OutputTensors()) { // For GPU tensors we need to store the memory release id in memory @@ -1373,7 +724,8 @@ ModelInstanceState::ExecuteBLSRequest(std::shared_ptr ipc_message) std::make_unique( output_tensor->Memory()->DataPtr()); uint64_t memory_release_id = - memory_manager_->AddRecord(std::move(gpu_memory_record)); + Stub()->GetMemoryManager()->AddRecord( + std::move(gpu_memory_record)); output_tensor->Memory()->SetMemoryReleaseId(memory_release_id); #endif } @@ -1390,7 +742,8 @@ ModelInstanceState::ExecuteBLSRequest(std::shared_ptr ipc_message) if (is_response_batch_set) { response_batch->has_error = true; LOG_IF_EXCEPTION( - pb_error_message = PbString::Create(shm_pool_, pb_exception.what())); + pb_error_message = + PbString::Create(Stub()->ShmPool(), pb_exception.what())); if (pb_error_message != nullptr) { response_batch->is_error_set = true; @@ -1421,12 +774,13 @@ void ModelInstanceState::DecoupledMessageQueueMonitor() { while (decoupled_thread_) { - bi::managed_external_buffer::handle_t handle = parent_message_queue_->Pop(); + bi::managed_external_buffer::handle_t handle = + Stub()->ParentMessageQueue()->Pop(); if (handle == DUMMY_MESSAGE) { break; } std::unique_ptr message = - IPCMessage::LoadFromSharedMemory(shm_pool_, handle); + IPCMessage::LoadFromSharedMemory(Stub()->ShmPool(), handle); // Need to notify the model instance thread that the execute response has // been received. 
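`ExecuteBLSRequest` above is the parent-process half of a BLS call: it reconstructs the request the stub placed in shared memory, moves any GPU input buffers, runs it through `RequestExecutor::Infer`, and writes the response back. From the model's side the same round trip is typically started with `pb_utils.InferenceRequest(...).exec()`; the sketch below assumes that BLS API, and the downstream model and tensor names are illustrative assumptions.

```
# Sketch of the model-side half of the BLS round trip handled by
# ExecuteBLSRequest, assuming the Python backend's BLS API
# (InferenceRequest.exec()). Model and tensor names are illustrative.
import triton_python_backend_utils as pb_utils


def call_downstream(input_numpy):
    bls_request = pb_utils.InferenceRequest(
        model_name="downstream_model",
        requested_output_names=["OUTPUT0"],
        inputs=[pb_utils.Tensor("INPUT0", input_numpy)])

    # exec() blocks until the parent process has run the request through
    # RequestExecutor::Infer and written the response to shared memory.
    bls_response = bls_request.exec()
    if bls_response.has_error():
        raise pb_utils.TritonModelException(bls_response.error().message())

    return pb_utils.get_output_tensor_by_name(
        bls_response, "OUTPUT0").as_numpy()
```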
@@ -1458,7 +812,8 @@ ModelInstanceState::ResponseSendDecoupled( std::shared_ptr response_send_message) { AllocatedSharedMemory send_message = - shm_pool_->Load(response_send_message->Args()); + Stub()->ShmPool()->Load( + response_send_message->Args()); ResponseSendMessage* send_message_payload = reinterpret_cast(send_message.data_.get()); @@ -1486,19 +841,19 @@ ModelInstanceState::ResponseSendDecoupled( if (send_message_payload->response != 0) { std::unique_ptr infer_response = InferResponse::LoadFromSharedMemory( - shm_pool_, send_message_payload->response, + Stub()->ShmPool(), send_message_payload->response, false /* open cuda ipc handle */); bool requires_deferred_callback = false; std::vector, void*>> gpu_output_buffers; std::shared_ptr error = infer_response->Send( response_factory, CudaStream(), requires_deferred_callback, - send_message_payload->flags, shm_pool_, gpu_output_buffers); + send_message_payload->flags, Stub()->ShmPool(), gpu_output_buffers); SetErrorForResponseSendMessage(send_message_payload, error, error_message); if (requires_deferred_callback) { AllocatedSharedMemory gpu_buffers_handle = - shm_pool_->Construct( + Stub()->ShmPool()->Construct( sizeof(uint64_t) + gpu_output_buffers.size() * sizeof(bi::managed_external_buffer::handle_t)); @@ -1601,24 +956,25 @@ ModelInstanceState::ProcessRequestsDecoupled( std::unique_ptr ipc_message; RETURN_IF_EXCEPTION( - ipc_message = IPCMessage::Create(shm_pool_, false /*inline_response*/)); + ipc_message = + IPCMessage::Create(Stub()->ShmPool(), false /*inline_response*/)); ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_ExecuteRequest; ipc_message->Args() = request_batch.handle_; received_message_ = nullptr; - ScopedDefer _([this] { stub_message_queue_->Push(DUMMY_MESSAGE); }); + ScopedDefer _([this] { Stub()->StubMessageQueue()->Push(DUMMY_MESSAGE); }); { std::unique_lock guard{mu_}; - stub_message_queue_->Push(ipc_message->ShmHandle()); + Stub()->StubMessageQueue()->Push(ipc_message->ShmHandle()); cv_.wait(guard, [this] { return received_message_ != nullptr; }); } AllocatedSharedMemory response_batch = - shm_pool_->Load(received_message_->Args()); + Stub()->ShmPool()->Load(received_message_->Args()); if (response_batch.data_->has_error) { if (response_batch.data_->is_error_set) { auto error = PbString::LoadFromSharedMemory( - shm_pool_, response_batch.data_->error); + Stub()->ShmPool(), response_batch.data_->error); return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, error->String().c_str()); } @@ -1689,7 +1045,7 @@ ModelInstanceState::ProcessRequests( responses)); std::shared_ptr ipc_message = - IPCMessage::Create(shm_pool_, false /*inline_response*/); + IPCMessage::Create(Stub()->ShmPool(), false /*inline_response*/); ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_ExecuteRequest; ipc_message->Args() = request_batch.handle_; @@ -1699,7 +1055,7 @@ ModelInstanceState::ProcessRequests( // This means that the stub process has exited and Python // backend failed to restart the stub process. - if (stub_pid_ == 0) { + if (Stub()->StubPid() == 0) { const char* error_message = "The stub process has exited unexpectedly."; RespondErrorToAllRequests( error_message, responses, requests, request_count); @@ -1720,7 +1076,7 @@ ModelInstanceState::ProcessRequests( // the object stored in shared memory. 
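The decoupled path (`ProcessRequestsDecoupled` together with `ResponseSendDecoupled` above, plus the matching stub-side check that `execute` returns `None`) delivers responses through response senders rather than through the return value. The following is a hedged sketch of a decoupled `execute`, assuming the Python backend's `request.get_response_sender()` API; tensor names are illustrative.

```
# model.py -- sketch of a decoupled-mode execute(), assuming the Python
# backend's request.get_response_sender() API. Responses are pushed through
# the sender; execute() itself must return None.
import triton_python_backend_utils as pb_utils


class TritonPythonModel:

    def execute(self, requests):
        for request in requests:
            sender = request.get_response_sender()
            in0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")

            # One or more responses may be sent per request...
            out0 = pb_utils.Tensor("OUTPUT0", in0.as_numpy())
            sender.send(pb_utils.InferenceResponse(output_tensors=[out0]))

            # ...followed by the final flag that closes the stream.
            sender.send(
                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

        # Returning anything other than None is rejected by the stub in
        # decoupled mode.
        return None
```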
NVTX_RANGE(nvtx_, "RequestExecuteFinalize " + Name()); if (!restart) - stub_message_queue_->Push(DUMMY_MESSAGE); + Stub()->StubMessageQueue()->Push(DUMMY_MESSAGE); }); if (restart) { return; @@ -1728,8 +1084,8 @@ ModelInstanceState::ProcessRequests( RESPOND_ALL_AND_RETURN_IF_EXCEPTION( responses, request_count, - ipc_message = - IPCMessage::LoadFromSharedMemory(shm_pool_, response_message)); + ipc_message = IPCMessage::LoadFromSharedMemory( + Stub()->ShmPool(), response_message)); // If the stub command is no longer PYTHONSTUB_InferExecRequest, it indicates // that inference request exeuction has finished and there are no more BLS @@ -1753,8 +1109,8 @@ ModelInstanceState::ProcessRequests( RESPOND_ALL_AND_RETURN_IF_EXCEPTION( responses, request_count, - ipc_message = - IPCMessage::LoadFromSharedMemory(shm_pool_, response_message)); + ipc_message = IPCMessage::LoadFromSharedMemory( + Stub()->ShmPool(), response_message)); } uint64_t compute_end_ns = 0; @@ -1765,7 +1121,7 @@ ModelInstanceState::ProcessRequests( AllocatedSharedMemory response_batch; RESPOND_ALL_AND_RETURN_IF_EXCEPTION( responses, request_count, - response_batch = shm_pool_->Load(ipc_message->Args())); + response_batch = Stub()->ShmPool()->Load(ipc_message->Args())); ResponseBatch* response_batch_shm_ptr = reinterpret_cast(response_batch.data_.get()); @@ -1779,7 +1135,7 @@ ModelInstanceState::ProcessRequests( RESPOND_ALL_AND_RETURN_IF_EXCEPTION( responses, request_count, error_message_shm = PbString::LoadFromSharedMemory( - shm_pool_, response_batch_shm_ptr->error)); + Stub()->ShmPool(), response_batch_shm_ptr->error)); RespondErrorToAllRequests( error_message_shm->String().c_str(), responses, requests, request_count); @@ -1816,7 +1172,8 @@ ModelInstanceState::ProcessRequests( std::unique_ptr& infer_response = shm_responses.back(); try { infer_response = InferResponse::LoadFromSharedMemory( - shm_pool_, response_shm_handle[r], false /* open_cuda_handle */); + Stub()->ShmPool(), response_shm_handle[r], + false /* open_cuda_handle */); if (infer_response->HasError()) { TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, @@ -1864,14 +1221,13 @@ ModelInstanceState::ProcessRequests( std::vector, void*>>{}; std::shared_ptr error = infer_response->Send( nullptr, CudaStream(), require_deferred_callback, - TRITONSERVER_RESPONSE_COMPLETE_FINAL, shm_pool_, gpu_output_buffers[r], - requested_output_names, response); + TRITONSERVER_RESPONSE_COMPLETE_FINAL, Stub()->ShmPool(), + gpu_output_buffers[r], requested_output_names, response); GUARDED_RESPOND_IF_ERROR(responses, r, *error); // Error object will be deleted by the GUARDED_RESPOND macro *error = nullptr; error.reset(); - if (require_deferred_callback) { has_gpu_output = true; } @@ -1888,9 +1244,11 @@ ModelInstanceState::ProcessRequests( for (auto& gpu_output_buffer : gpu_output_buffers) { total_gpu_buffers_count += gpu_output_buffer.second.size(); } - AllocatedSharedMemory gpu_buffers_handle = shm_pool_->Construct( - sizeof(uint64_t) + total_gpu_buffers_count * - sizeof(bi::managed_external_buffer::handle_t)); + AllocatedSharedMemory gpu_buffers_handle = + Stub()->ShmPool()->Construct( + sizeof(uint64_t) + + total_gpu_buffers_count * + sizeof(bi::managed_external_buffer::handle_t)); uint64_t* gpu_buffer_count = reinterpret_cast(gpu_buffers_handle.data_.get()); *gpu_buffer_count = total_gpu_buffers_count; @@ -1960,64 +1318,21 @@ ModelInstanceState::ProcessRequests( ModelInstanceState::~ModelInstanceState() { - if (initialized_) { - { - bi::scoped_lock 
lock(*health_mutex_); - ipc_control_->stub_health = false; - } - - // Sleep 1 second so that the child process has a chance to change the - // health variable - sleep(1); - - bool healthy = false; - bool force_kill = false; - { - bi::scoped_lock lock(*health_mutex_); - healthy = ipc_control_->stub_health; - } - - if (healthy) { - // Finalize command does not have any arguments. - std::unique_ptr ipc_message = - IPCMessage::Create(shm_pool_, false /* inline_response */); - - ModelState* model_state = reinterpret_cast(Model()); - if (model_state->IsDecoupled()) { - futures_.clear(); - parent_message_queue_->Push(DUMMY_MESSAGE); - decoupled_monitor_.join(); - } - - // Wait for all the futures to be finished. - thread_pool_->wait(); - - ipc_message->Command() = PYTHONSTUB_FinalizeRequest; - stub_message_queue_->Push(ipc_message->ShmHandle()); - parent_message_queue_->Pop(); - - - stub_message_queue_.reset(); - parent_message_queue_.reset(); - memory_manager_.reset(); - - } else { - force_kill = true; - } - - int status; - if (force_kill) { - kill(stub_pid_, SIGKILL); + ModelState* model_state = reinterpret_cast(Model()); + Stub()->UpdateHealth(); + if (Stub()->IsHealthy()) { + if (model_state->IsDecoupled()) { + futures_.clear(); + Stub()->ParentMessageQueue()->Push(DUMMY_MESSAGE); + decoupled_monitor_.join(); } - waitpid(stub_pid_, &status, 0); + // Wait for all the futures to be finished. + thread_pool_->wait(); } - // First destroy the IPCControl. This makes sure that IPCControl is - // destroyed before the shared memory manager goes out of scope. - ipc_control_.reset(); - stub_message_queue_.reset(); - parent_message_queue_.reset(); - memory_manager_.reset(); + Stub()->TerminateStub(); + received_message_.reset(); + Stub().reset(); } TRITONSERVER_Error* @@ -2033,6 +1348,29 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) RETURN_IF_ERROR(ex.err_); } + // Auto-complete the configuration if requested... + bool auto_complete_config = false; + RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig( + triton_model, &auto_complete_config)); + if (auto_complete_config) { + RETURN_IF_ERROR((*state)->LaunchAutoCompleteStubProcess()); + triton::common::TritonJson::WriteBuffer buf; + (*state)->Stub()->AutoCompleteConfig().Write(&buf); + + TRITONSERVER_Message* message; + RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson( + &message, buf.Base(), buf.Size())); + RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig( + triton_model, 1 /* config_version */, message)); + (*state)->ModelConfig() = std::move((*state)->Stub()->AutoCompleteConfig()); + + (*state)->Stub()->UpdateHealth(); + (*state)->Stub()->TerminateStub(); + (*state)->Stub().reset(); + } + + RETURN_IF_ERROR((*state)->ValidateModelConfig()); + return nullptr; // success } @@ -2055,6 +1393,7 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) THROW_IF_BACKEND_MODEL_ERROR(TRITONBACKEND_BackendState(backend, &bstate)); backend_state_ = reinterpret_cast(bstate); triton::common::TritonJson::Value params; + common::TritonJson::Value model_config; if (model_config_.Find("parameters", ¶ms)) { // Skip the EXECUTION_ENV_PATH variable if it doesn't exist. 
TRITONSERVER_Error* error = @@ -2128,6 +1467,40 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) } } +TRITONSERVER_Error* +ModelState::LaunchAutoCompleteStubProcess() +{ + Stub() = std::make_unique("AUTOCOMPLETE_STUB"); + RETURN_IF_ERROR(Stub()->Initialize(this)); + try { + RETURN_IF_ERROR(Stub()->Launch()); + } + catch (const BackendModelException& ex) { + Stub()->UpdateHealth(); + Stub()->TerminateStub(); + Stub().reset(); + RETURN_ERROR_IF_TRUE( + ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, + std::string("unexpected nullptr in BackendModelException")); + RETURN_IF_ERROR(ex.err_); + } + + return nullptr; +} + +TRITONSERVER_Error* +ModelState::ValidateModelConfig() +{ + // We have the json DOM for the model configuration... + triton::common::TritonJson::WriteBuffer buffer; + RETURN_IF_ERROR(ModelConfig().PrettyWrite(&buffer)); + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("model configuration:\n") + buffer.Contents()).c_str()); + + return nullptr; +} + extern "C" { TRITONSERVER_Error* @@ -2405,7 +1778,7 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState( instance, reinterpret_cast(instance_state))); - RETURN_IF_ERROR(instance_state->SetupStubProcess()); + RETURN_IF_ERROR(instance_state->LaunchStubProcess()); LOG_MESSAGE( TRITONSERVER_LOG_VERBOSE, (std::string("TRITONBACKEND_ModelInstanceInitialize: instance " @@ -2439,9 +1812,9 @@ TRITONBACKEND_ModelInstanceExecute( LOG_MESSAGE( TRITONSERVER_LOG_ERROR, "Stub process is unhealthy and it will be restarted."); - instance_state->KillStubProcess(); + instance_state->Stub()->KillStubProcess(); LOG_IF_ERROR( - instance_state->StartStubProcess(), + instance_state->Stub()->Launch(), "Failed to restart the stub process."); } } else { diff --git a/src/python_be.h b/src/python_be.h new file mode 100644 index 00000000..d66e169a --- /dev/null +++ b/src/python_be.h @@ -0,0 +1,356 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
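Once `ModelState::Create` has applied the auto-completed configuration with `TRITONBACKEND_ModelSetConfig` and `ValidateModelConfig` has logged it, the per-instance stub receives the final JSON through the `"model_config"` entry of the initialize map and hands it to the model as `args["model_config"]`. A small sketch of consuming it in `initialize` follows; the dtype bookkeeping is only an illustration.

```
# model.py -- sketch of initialize() reading the (possibly auto-completed)
# configuration that the backend serializes into the initialize map under
# "model_config". The dtype bookkeeping is only an illustration.
import json

import triton_python_backend_utils as pb_utils


class TritonPythonModel:

    def initialize(self, args):
        model_config = json.loads(args["model_config"])

        # Map the Triton datatypes of whatever outputs the (auto-completed)
        # config declares to numpy dtypes once up front.
        self.output_dtypes = {
            output["name"]:
                pb_utils.triton_string_to_numpy(output["data_type"])
            for output in model_config["output"]
        }
```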
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "infer_request.h" +#include "infer_response.h" +#include "ipc_message.h" +#include "memory_manager.h" +#include "message_queue.h" +#include "pb_env.h" +#include "pb_map.h" +#include "pb_metric_reporter.h" +#include "pb_utils.h" +#include "request_executor.h" +#include "scoped_defer.h" +#include "shm_manager.h" +#include "stub_launcher.h" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_input_collector.h" +#include "triton/backend/backend_memory.h" +#include "triton/backend/backend_model.h" +#include "triton/backend/backend_model_instance.h" +#include "triton/common/nvtx.h" +#include "triton/common/triton_json.h" +#include "triton/core/tritonbackend.h" +#include "triton/core/tritonserver.h" + +#define LOG_IF_EXCEPTION(X) \ + do { \ + try { \ + (X); \ + } \ + catch (const PythonBackendException& pb_exception) { \ + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, pb_exception.what()); \ + } \ + } while (false) + +#define RESPOND_ALL_AND_RETURN_IF_ERROR(RESPONSES, RESPONSES_COUNT, X) \ + do { \ + TRITONSERVER_Error* raasnie_err__ = (X); \ + if (raasnie_err__ != nullptr) { \ + for (size_t ridx = 0; ridx < RESPONSES_COUNT; ++ridx) { \ + if ((*RESPONSES)[ridx] != nullptr) { \ + LOG_IF_ERROR( \ + TRITONBACKEND_ResponseSend( \ + (*RESPONSES)[ridx], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ + raasnie_err__), \ + "failed to send error response"); \ + (*RESPONSES)[ridx] = nullptr; \ + } \ + } \ + TRITONSERVER_ErrorDelete(raasnie_err__); \ + return; \ + } \ + } while (false) + + +#define RESPOND_ALL_AND_RETURN_IF_EXCEPTION(RESPONSES, RESPONSES_COUNT, X) \ + do { \ + try { \ + (X); \ + } \ + catch (const PythonBackendException& exception) { \ + TRITONSERVER_Error* raarie_err__ = TRITONSERVER_ErrorNew( \ + TRITONSERVER_ERROR_INTERNAL, exception.what()); \ + for (size_t ridx = 0; ridx < RESPONSES_COUNT; ++ridx) { \ + if ((*RESPONSES)[ridx] != nullptr) { \ + LOG_IF_ERROR( \ + TRITONBACKEND_ResponseSend( \ + (*RESPONSES)[ridx], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ + raarie_err__), \ + "failed to send error response"); \ + (*RESPONSES)[ridx] = nullptr; \ + } \ + } \ + TRITONSERVER_ErrorDelete(raarie_err__); \ + return; \ + } \ + } while (false) + +#define RESPOND_AND_RETURN_IF_ERROR(REQUEST, X) \ + do { \ + TRITONSERVER_Error* rarie_err__ = (X); \ + if (rarie_err__ != nullptr) { \ + TRITONBACKEND_Response* rarie_response__ = nullptr; \ + LOG_IF_ERROR( \ + TRITONBACKEND_ResponseNew(&rarie_response__, REQUEST), \ + "failed to create response"); \ + if (rarie_response__ != nullptr) { \ + LOG_IF_ERROR( \ + TRITONBACKEND_ResponseSend( \ + rarie_response__, TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ + rarie_err__), \ + "failed to send error response"); \ + } \ + return rarie_err__; \ + } \ + } while (false) + +#define GUARDED_RESPOND_IF_ERROR(RESPONSES, IDX, X) \ + do { \ + if ((*RESPONSES)[IDX] != nullptr) { \ + TRITONSERVER_Error* err__ = (X); \ + if (err__ != nullptr) { \ + LOG_IF_ERROR( \ + TRITONBACKEND_ResponseSend( \ + (*RESPONSES)[IDX], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ + err__), \ + "failed to send error response"); \ + (*RESPONSES)[IDX] = nullptr; \ + TRITONSERVER_ErrorDelete(err__); \ + } \ + } \ + } while (false) + +#define 
GUARDED_RESPOND_IF_EXCEPTION(RESPONSES, IDX, X) \ + do { \ + if ((*RESPONSES)[IDX] != nullptr) { \ + try { \ + (X); \ + } \ + catch (const PythonBackendException& pb_exception) { \ + TRITONSERVER_Error* err__ = TRITONSERVER_ErrorNew( \ + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); \ + LOG_IF_ERROR( \ + TRITONBACKEND_ResponseSend( \ + (*RESPONSES)[IDX], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \ + err__), \ + "failed to send error response"); \ + (*RESPONSES)[IDX] = nullptr; \ + TRITONSERVER_ErrorDelete(err__); \ + } \ + } \ + } while (false) + +#define RETURN_IF_EXCEPTION(X) \ + do { \ + try { \ + (X); \ + } \ + catch (const PythonBackendException& pb_exception) { \ + TRITONSERVER_Error* rarie_err__ = TRITONSERVER_ErrorNew( \ + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); \ + return rarie_err__; \ + } \ + } while (false) + +namespace triton { namespace backend { namespace python { + +namespace bi = boost::interprocess; + +struct BackendState { + std::string python_lib; + int64_t shm_default_byte_size; + int64_t shm_growth_byte_size; + int64_t stub_timeout_seconds; + int64_t shm_message_queue_size; + std::atomic number_of_instance_inits; + std::string shared_memory_region_prefix; + int64_t thread_pool_size; + std::unique_ptr env_manager; +}; + +class ModelState : public BackendModel { + public: + static TRITONSERVER_Error* Create( + TRITONBACKEND_Model* triton_model, ModelState** state); + + // Get backend state + BackendState* StateForBackend() { return backend_state_; } + + // Get the Python execution environment + std::string PythonExecutionEnv() { return python_execution_env_; } + + // Force CPU only tensors + bool ForceCPUOnlyInputTensors() { return force_cpu_only_input_tensors_; } + + // Is decoupled API being used. + bool IsDecoupled() { return decoupled_; } + + // Launch auto-complete stub process. + TRITONSERVER_Error* LaunchAutoCompleteStubProcess(); + + // Validate Model Configuration + TRITONSERVER_Error* ValidateModelConfig(); + + // Auto-complete stub + std::unique_ptr& Stub() { return auto_complete_stub_; } + + private: + ModelState(TRITONBACKEND_Model* triton_model); + BackendState* backend_state_; + std::string python_execution_env_; + bool force_cpu_only_input_tensors_; + bool decoupled_; + std::unique_ptr auto_complete_stub_; +}; + +class ModelInstanceState : public BackendModelInstance { + ModelInstanceState( + ModelState* model_state, TRITONBACKEND_ModelInstance* model_instance); + + TRITONBACKEND_Model* triton_model_; + std::unique_ptr model_instance_stub_; + std::vector bls_inference_responses_; + std::mutex bls_responses_mutex_; + std::vector closed_requests_; + std::mutex closed_requests_mutex_; + + // Decoupled monitor thread + std::thread decoupled_monitor_; + bool decoupled_thread_; + std::mutex mu_; + std::condition_variable cv_; + std::unique_ptr received_message_; + std::vector> futures_; + std::unique_ptr thread_pool_; + + public: + static TRITONSERVER_Error* Create( + ModelState* model_state, TRITONBACKEND_ModelInstance* model_instance, + ModelInstanceState** model_instance_state); + + ~ModelInstanceState(); + + // Launch stub process. 
+ TRITONSERVER_Error* LaunchStubProcess(); + + TRITONSERVER_Error* SendMessageToStub(off_t message); + void ResponseSendDecoupled(std::shared_ptr response_send_message); + + // Checks whether the stub process is live + bool IsStubProcessAlive(); + + // Get a message from the stub process + TRITONSERVER_Error* ReceiveMessageFromStub(off_t& message); + + // Get a message from the stub process + void SendMessageAndReceiveResponse( + off_t message, off_t& response, bool& restart, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count); + + // Responds to all the requests with an error message. + void RespondErrorToAllRequests( + const char* message, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count); + + // In the decoupled mode, the parent message queue is monitored only by this + // function during the execute phase. No other thread should pop any message + // from the message queue in the decoupled mode. + void DecoupledMessageQueueMonitor(); + + // Convert TRITONBACKEND_Input to Python backend tensors. + TRITONSERVER_Error* GetInputTensor( + const uint32_t input_idx, std::shared_ptr& input_tensor, + TRITONBACKEND_Request* request, + std::shared_ptr>& responses); + + // Process all the requests obtained from Triton. + void ProcessRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count, + bool& restart); + + // Process all the requests in the decoupled mode. + TRITONSERVER_Error* ProcessRequestsDecoupled( + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector>& pb_infer_requests); + + bool ExistsInClosedRequests(intptr_t closed_request); + + // Execute a BLS Request + void ExecuteBLSRequest(std::shared_ptr ipc_message); + + // Cleanup BLS responses + void CleanupBLSResponses(); + + // Wait for BLS requests to complete + void WaitForBLSRequestsToFinish(); + + // Check the incoming requests for errors + TRITONSERVER_Error* CheckIncomingRequests( + TRITONBACKEND_Request** requests, const uint32_t request_count, + size_t& total_batch_size); + + // Set error for response send message + void SetErrorForResponseSendMessage( + ResponseSendMessage* response_send_message, + std::shared_ptr error, + std::unique_ptr& error_message); + + TRITONSERVER_Error* SaveRequestsToSharedMemory( + TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector>& pb_inference_requests, + AllocatedSharedMemory& request_batch, + std::shared_ptr>& responses); + + // Model instance stub + std::unique_ptr& Stub() { return model_instance_stub_; } +}; +}}} // namespace triton::backend::python diff --git a/src/request_executor.h b/src/request_executor.h index 9dab7609..bb3e1e60 100644 --- a/src/request_executor.h +++ b/src/request_executor.h @@ -24,11 +24,14 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
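`ProcessRequests`, declared above, drives the default (non-decoupled) path: the stub forwards the batch to the model's `execute` and expects exactly one `InferenceResponse` per request in return, which the backend then loads from shared memory and sends. A sketch of that contract from the model's side is shown below; tensor names and the doubling operation are illustrative assumptions.

```
# model.py -- sketch of the default (non-decoupled) execute() contract used
# by ProcessRequests: one InferenceResponse per request, in request order.
# Tensor names and the doubling operation are illustrative assumptions.
import triton_python_backend_utils as pb_utils


class TritonPythonModel:

    def execute(self, requests):
        responses = []
        for request in requests:
            try:
                in0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
                out0 = pb_utils.Tensor("OUTPUT0", in0.as_numpy() * 2)
                responses.append(
                    pb_utils.InferenceResponse(output_tensors=[out0]))
            except Exception as ex:
                # Per-request errors travel back as InferenceResponse errors
                # and are converted to TRITONSERVER errors in the backend.
                responses.append(
                    pb_utils.InferenceResponse(
                        output_tensors=[],
                        error=pb_utils.TritonError(str(ex))))
        return responses
```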
+#pragma once + #include #include "infer_request.h" #include "infer_response.h" namespace triton { namespace backend { namespace python { + TRITONSERVER_Error* CreateTritonErrorFromException( const PythonBackendException& pb_exception); @@ -48,5 +51,4 @@ class RequestExecutor { ~RequestExecutor(); }; - }}} // namespace triton::backend::python diff --git a/src/resources/triton_python_backend_utils.py b/src/resources/triton_python_backend_utils.py index 17c7b150..18ace28d 100644 --- a/src/resources/triton_python_backend_utils.py +++ b/src/resources/triton_python_backend_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,6 +26,7 @@ import numpy as np import struct +import json TRITON_STRING_TO_NUMPY = { 'TYPE_BOOL': bool, @@ -295,6 +296,182 @@ def triton_string_to_numpy(triton_type_string): return TRITON_STRING_TO_NUMPY[triton_type_string] +class ModelConfig: + """An object of ModelConfig class is used to describe + the model configuration for autocomplete. + Parameters + ---------- + model_config : ModelConfig Object + Object containing the model configuration. Only the max_batch_size, inputs + and outputs properties can be modified for auto-complete model configuration. + """ + + def __init__(self, model_config): + self._model_config = json.loads(model_config) + + def as_dict(self): + """Provide the read-only access to the model configuration + Returns + ------- + dict + dictionary type of the model configuration contained in + the ModelConfig object + """ + return self._model_config + + def set_max_batch_size(self, max_batch_size): + """Set the max batch size for the model. + Parameters + ---------- + max_batch_size : int + The max_batch_size to be set. + Raises + ------ + ValueError + If configuration has specified max_batch_size non-zero value which + is larger than the max_batch_size to be set for the model. + """ + if self._model_config["max_batch_size"] > max_batch_size: + raise ValueError( + "configuration specified max_batch_size " + + str(self._model_config["max_batch_size"]) + + ", but in auto-complete-config function for model '" + + self._model_config["name"] + "' specified max_batch_size " + + str(max_batch_size)) + else: + self._model_config["max_batch_size"] = max_batch_size + + def add_input(self, input): + """Add the input for the model. + Parameters + ---------- + input : dict + The input to be added. + Raises + ------ + ValueError + If input contains property other than 'name', 'data_type' + and 'dims' or any of the properties are not set, or if an + input with the same name already exists in the configuration + but has different data_type or dims property + """ + valid_properties = ['name', 'data_type', 'dims'] + for current_property in input: + if current_property not in valid_properties: + raise ValueError( + "input '" + input['name'] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' contains property other than 'name', 'data_type' and 'dims'." 
+ ) + + if 'name' not in input: + raise ValueError( + "input in auto-complete-config function for model '" + + self._model_config["name"] + "' is missing 'name' property.") + elif 'data_type' not in input: + raise ValueError("input '" + input['name'] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'data_type' property.") + elif 'dims' not in input: + raise ValueError("input '" + input['name'] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'dims' property.") + + for current_input in self._model_config["input"]: + if input['name'] == current_input['name']: + if current_input[ + 'data_type'] != "TYPE_INVALID" and current_input[ + 'data_type'] != input['data_type']: + raise ValueError("unable to load model '" + + self._model_config["name"] + + "', configuration expects datatype " + + current_input['data_type'] + + " for input '" + input['name'] + + "', model provides " + input['data_type']) + elif current_input[ + 'dims'] and current_input['dims'] != input['dims']: + raise ValueError( + "model '" + self._model_config["name"] + "', tensor '" + + input['name'] + "': the model expects dims " + + str(input['dims']) + + " but the model configuration specifies dims " + + str(current_input['dims'])) + else: + current_input['data_type'] = input['data_type'] + current_input['dims'] = input['dims'] + return + + self._model_config["input"].append(input) + + def add_output(self, output): + """Add the output for the model. + Parameters + ---------- + output : dict + The output to be added. + Raises + ------ + ValueError + If output contains property other than 'name', 'data_type' + and 'dims' or any of the properties are not set, or if an + output with the same name already exists in the configuration + but has different data_type or dims property + """ + valid_properties = ['name', 'data_type', 'dims'] + for current_property in output: + if current_property not in valid_properties: + raise ValueError( + "output '" + output['name'] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' contains property other than 'name', 'data_type' and 'dims'." 
+ ) + + if 'name' not in output: + raise ValueError( + "output in auto-complete-config function for model '" + + self._model_config["name"] + "' is missing 'name' property.") + elif 'data_type' not in output: + raise ValueError("output '" + output['name'] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'data_type' property.") + elif 'dims' not in output: + raise ValueError("output '" + output['name'] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'dims' property.") + + for current_output in self._model_config["output"]: + if output['name'] == current_output['name']: + if current_output[ + 'data_type'] != "TYPE_INVALID" and current_output[ + 'data_type'] != output['data_type']: + raise ValueError("unable to load model '" + + self._model_config["name"] + + "', configuration expects datatype " + + current_output['data_type'] + + " for output '" + output['name'] + + "', model provides " + output['data_type']) + elif current_output[ + 'dims'] and current_output['dims'] != output['dims']: + raise ValueError( + "model '" + self._model_config["name"] + "', tensor '" + + output['name'] + "': the model expects dims " + + str(output['dims']) + + " but the model configuration specifies dims " + + str(current_output['dims'])) + else: + current_output['data_type'] = output['data_type'] + current_output['dims'] = output['dims'] + return + + self._model_config["output"].append(output) + + TRITONSERVER_REQUEST_FLAG_SEQUENCE_START = 1 TRITONSERVER_REQUEST_FLAG_SEQUENCE_END = 2 TRITONSERVER_RESPONSE_COMPLETE_FINAL = 1 diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc new file mode 100644 index 00000000..68ffd125 --- /dev/null +++ b/src/stub_launcher.cc @@ -0,0 +1,522 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
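The `ModelConfig` helper added to `triton_python_backend_utils.py` above is the object handed to a model's `auto_complete_config` function. Below is a minimal sketch of how a hypothetical model could drive it; the tensor names, data types, and dims are illustrative only, not part of this patch:

```python
import triton_python_backend_utils as pb_utils  # provided by Triton at runtime


class TritonPythonModel:

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        # Register the tensors and the max batch size so Triton can load the
        # model even when no config.pbtxt is provided.
        auto_complete_model_config.add_input(
            {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]})
        auto_complete_model_config.add_output(
            {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]})
        auto_complete_model_config.set_max_batch_size(0)
        return auto_complete_model_config
```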
+ +#include "stub_launcher.h" +#include "python_be.h" + +namespace triton { namespace backend { namespace python { + +StubLauncher::StubLauncher(const std::string stub_process_kind) + : parent_pid_(0), stub_pid_(0), is_initialized_(false), + stub_process_kind_(stub_process_kind), model_instance_name_(""), + device_id_(0), kind_("") + +{ +} + +StubLauncher::StubLauncher( + const std::string stub_process_kind, const std::string model_instance_name, + const int32_t device_id, const std::string kind) + : parent_pid_(0), stub_pid_(0), is_initialized_(false), + stub_process_kind_(stub_process_kind), + model_instance_name_(model_instance_name), device_id_(device_id), + kind_(kind) +{ +} + +TRITONSERVER_Error* +StubLauncher::Initialize(ModelState* model_state) +{ + model_name_ = model_state->Name(); + shm_default_byte_size_ = + model_state->StateForBackend()->shm_default_byte_size; + shm_growth_byte_size_ = model_state->StateForBackend()->shm_growth_byte_size; + shm_message_queue_size_ = + model_state->StateForBackend()->shm_message_queue_size; + python_execution_env_ = model_state->PythonExecutionEnv(); + python_lib_ = model_state->StateForBackend()->python_lib; + model_state->ModelConfig().Write(&model_config_buffer_); + is_decoupled_ = model_state->IsDecoupled(); + model_repository_path_ = model_state->RepositoryPath(); + + // Increase the stub process count to avoid shared memory region name + // collision + model_state->StateForBackend()->number_of_instance_inits++; + shm_region_name_ = + model_state->StateForBackend()->shared_memory_region_prefix + + std::to_string(model_state->StateForBackend()->number_of_instance_inits); + + model_version_ = model_state->Version(); + + std::stringstream ss; + std::string artifact_name; + RETURN_IF_ERROR(model_state->ModelConfig().MemberAsString( + "default_model_filename", &artifact_name)); + ss << model_repository_path_ << "/" << model_version_ << "/"; + + if (artifact_name.size() > 0) { + ss << artifact_name; + } else { + // Default artifact name. + ss << "model.py"; + } + + model_path_ = ss.str(); + struct stat buffer; + + // Check if model.py exists + if (stat(model_path_.c_str(), &buffer) != 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("model.py does not exist in the model repository path: " + model_path_) + .c_str()); + } + + // Path to the extracted Python env + std::string python_execution_env = ""; + if (python_execution_env_ != "") { + try { + python_execution_env = + model_state->StateForBackend()->env_manager->ExtractIfNotExtracted( + python_execution_env_); + } + catch (PythonBackendException& pb_exception) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); + } + + path_to_activate_ = python_execution_env + "/bin/activate"; + path_to_libpython_ = python_execution_env + "/lib"; + if (python_execution_env.length() > 0 && !FileExists(path_to_activate_)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Path " + path_to_activate_ + + " does not exist. The Python environment should contain an " + "'activate' script.") + .c_str()); + } + } + + parent_pid_ = getpid(); + + return nullptr; +} + +TRITONSERVER_Error* +StubLauncher::Setup() +{ + // Destruct any in-use shared memory object before starting the stub process. 
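+  // Roughly: reset any IPC objects left over from a previous stub, create a
+  // fresh shared memory pool, then construct IPCControl plus the stub,
+  // parent, and memory-manager message queues and publish their handles
+  // through IPCControl so the stub process can attach to them.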
+ ipc_control_ = nullptr; + stub_message_queue_ = nullptr; + parent_message_queue_ = nullptr; + memory_manager_ = nullptr; + + try { + // It is necessary for restart to make sure that the previous shared memory + // pool is destructed before the new pool is created. + shm_pool_ = nullptr; + shm_pool_ = std::make_unique( + shm_region_name_, shm_default_byte_size_, shm_growth_byte_size_, + true /* create */); + } + catch (const PythonBackendException& pb_exception) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); + } + + AllocatedSharedMemory current_ipc_control = + shm_pool_->Construct(); + ipc_control_ = std::move(current_ipc_control.data_); + ipc_control_handle_ = current_ipc_control.handle_; + + RETURN_IF_EXCEPTION( + stub_message_queue_ = + MessageQueue::Create( + shm_pool_, shm_message_queue_size_)); + RETURN_IF_EXCEPTION( + parent_message_queue_ = + MessageQueue::Create( + shm_pool_, shm_message_queue_size_)); + + std::unique_ptr> memory_manager_message_queue; + RETURN_IF_EXCEPTION( + memory_manager_message_queue = + MessageQueue::Create(shm_pool_, shm_message_queue_size_)); + + memory_manager_message_queue->ResetSemaphores(); + ipc_control_->memory_manager_message_queue = + memory_manager_message_queue->ShmHandle(); + ipc_control_->decoupled = is_decoupled_; + + memory_manager_ = + std::make_unique(std::move(memory_manager_message_queue)); + ipc_control_->parent_message_queue = parent_message_queue_->ShmHandle(); + ipc_control_->stub_message_queue = stub_message_queue_->ShmHandle(); + + new (&(ipc_control_->stub_health_mutex)) bi::interprocess_mutex; + health_mutex_ = &(ipc_control_->stub_health_mutex); + + stub_message_queue_->ResetSemaphores(); + parent_message_queue_->ResetSemaphores(); + + is_initialized_ = false; + + return nullptr; +} + +TRITONSERVER_Error* +StubLauncher::Launch() +{ + RETURN_IF_ERROR(Setup()); + + std::string stub_name; + if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { + stub_name = model_name_; + } else { + stub_name = model_instance_name_; + } + + pid_t pid = fork(); + if (pid < 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "Failed to fork the stub process for auto-complete."); + } + if (pid == 0) { + const char* stub_args[4]; + stub_args[0] = "bash"; + stub_args[1] = "-c"; + stub_args[3] = nullptr; // Last argument must be nullptr + + // Default Python backend stub + std::string python_backend_stub = + python_lib_ + "/triton_python_backend_stub"; + + // Path to alternative Python backend stub + std::string model_python_backend_stub = + std::string(model_repository_path_) + "/triton_python_backend_stub"; + + if (FileExists(model_python_backend_stub)) { + python_backend_stub = model_python_backend_stub; + } + + std::string bash_argument; + + // This shared memory variable indicates whether the stub process should + // revert the LD_LIBRARY_PATH changes to avoid shared library issues in + // executables and libraries. + ipc_control_->uses_env = false; + if (python_execution_env_ != "") { + std::stringstream ss; + + // Need to properly set the LD_LIBRARY_PATH so that Python environments + // using different python versions load properly. 
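+      // The command assembled below looks roughly like:
+      //   source <env>/bin/activate && exec env
+      //     LD_LIBRARY_PATH=<env>/lib:$LD_LIBRARY_PATH
+      //     /path/to/triton_python_backend_stub <model_path>
+      //     <shm_region_name> <shm_default_byte_size> <shm_growth_byte_size>
+      //     <parent_pid> <python_lib> <ipc_control_handle> <stub_name>
+      // and is run through `bash -c` by the execvp() call further down.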
+ ss << "source " << path_to_activate_ + << " && exec env LD_LIBRARY_PATH=" << path_to_libpython_ + << ":$LD_LIBRARY_PATH " << python_backend_stub << " " << model_path_ + << " " << shm_region_name_ << " " << shm_default_byte_size_ << " " + << shm_growth_byte_size_ << " " << parent_pid_ << " " << python_lib_ + << " " << ipc_control_handle_ << " " << stub_name; + ipc_control_->uses_env = true; + bash_argument = ss.str(); + } else { + std::stringstream ss; + ss << " exec " << python_backend_stub << " " << model_path_ << " " + << shm_region_name_ << " " << shm_default_byte_size_ << " " + << shm_growth_byte_size_ << " " << parent_pid_ << " " << python_lib_ + << " " << ipc_control_handle_ << " " << stub_name; + bash_argument = ss.str(); + } + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("Starting Python backend stub: ") + bash_argument) + .c_str()); + + stub_args[2] = bash_argument.c_str(); + + int stub_status_code = + system((python_backend_stub + "> /dev/null 2>&1").c_str()); + + // If running stub process without any arguments returns any status code, + // other than 1, it can indicate a permission issue as a result of + // downloading the stub process from a cloud object storage service. + if (WEXITSTATUS(stub_status_code) != 1) { + // Give the execute permission for the triton_python_backend_stub to the + // owner. + int error = chmod(python_backend_stub.c_str(), S_IXUSR); + if (error != 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("Failed to give execute permission to " + "triton_python_backend_stub in ") + + python_backend_stub + " " + stub_name + + " Error No.: " + std::to_string(error)) + .c_str()); + } + } + + if (execvp("bash", (char**)stub_args) != 0) { + std::stringstream ss; + ss << "Failed to run python backend stub. Errno = " << errno << '\n' + << "Python backend stub path: " << python_backend_stub << '\n' + << "Shared Memory Region Name: " << shm_region_name_ << '\n' + << "Shared Memory Default Byte Size: " << shm_default_byte_size_ + << '\n' + << "Shared Memory Growth Byte Size: " << shm_growth_byte_size_ << '\n'; + std::string log_message = ss.str(); + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, log_message.c_str()); + + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("Failed to initialize ") + stub_name).c_str()); + } + } else { + ScopedDefer _([&] { + // Push a dummy message to the message queue so that the stub + // process is notified that it can release the object stored in + // shared memory. + stub_message_queue_->Push(DUMMY_MESSAGE); + + // If the model is not initialized, wait for the stub process to exit. 
+ if (!is_initialized_) { + int status; + stub_message_queue_.reset(); + parent_message_queue_.reset(); + memory_manager_.reset(); + waitpid(stub_pid_, &status, 0); + } + }); + + stub_pid_ = pid; + + if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { + try { + AutocompleteStubProcess(); + } + catch (const PythonBackendException& ex) { + // Need to kill the stub process first + kill(stub_pid_, SIGKILL); + int status; + waitpid(stub_pid_, &status, 0); + stub_pid_ = 0; + throw BackendModelException( + TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, ex.what())); + } + } else if (stub_process_kind_ == "MODEL_INSTANCE_STUB") { + RETURN_IF_ERROR(ModelInstanceStubProcess()); + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("Unknown stub_process_kind: ") + stub_process_kind_) + .c_str()); + } + + is_initialized_ = true; + } + + return nullptr; +} + +void +StubLauncher::AutocompleteStubProcess() +{ + std::string model_config = model_config_buffer_.MutableContents(); + + std::unique_ptr auto_complete_message = + IPCMessage::Create(shm_pool_, false /* inline_response */); + auto_complete_message->Command() = PYTHONSTUB_AutoCompleteRequest; + + std::unique_ptr pb_string = + PbString::Create(shm_pool_, model_config); + bi::managed_external_buffer::handle_t string_handle = pb_string->ShmHandle(); + + auto_complete_message->Args() = string_handle; + stub_message_queue_->Push(auto_complete_message->ShmHandle()); + + std::unique_ptr auto_complete_response_message = + IPCMessage::LoadFromSharedMemory(shm_pool_, parent_message_queue_->Pop()); + + if (auto_complete_response_message->Command() != + PYTHONSTUB_AutoCompleteResponse) { + throw PythonBackendException( + "Received unexpected response from Python backend stub: " + + model_name_); + } + + auto auto_complete_response = + std::move((shm_pool_->Load( + auto_complete_response_message->Args()))) + .data_; + + if (auto_complete_response->response_has_error) { + if (auto_complete_response->response_is_error_set) { + std::unique_ptr error_message = PbString::LoadFromSharedMemory( + shm_pool_, auto_complete_response->response_error); + throw PythonBackendException(error_message->String()); + } else { + throw PythonBackendException("Auto-complete failed for " + model_name_); + } + } + + if (auto_complete_response->response_has_model_config) { + std::unique_ptr auto_complete_config = + PbString::LoadFromSharedMemory( + shm_pool_, auto_complete_response->response_model_config); + std::string auto_complete_config_string = auto_complete_config->String(); + if (!auto_complete_config_string.empty()) { + TRITONSERVER_Error* err = + auto_complete_config_.Parse(auto_complete_config_string); + if (err != nullptr) { + throw PythonBackendException("Failed to parse auto-complete JSON."); + } + } + } +} + +TRITONSERVER_Error* +StubLauncher::ModelInstanceStubProcess() +{ + std::unordered_map initialize_map = { + {"model_config", model_config_buffer_.MutableContents()}, + {"model_instance_kind", kind_}, + {"model_instance_name", model_instance_name_}, + {"model_instance_device_id", std::to_string(device_id_)}, + {"model_repository", model_path_}, + {"model_version", std::to_string(model_version_)}, + {"model_name", model_name_}}; + + std::unique_ptr initialize_message = + IPCMessage::Create(shm_pool_, false /* inline_response */); + initialize_message->Command() = PYTHONSTUB_InitializeRequest; + + std::unique_ptr pb_map = PbMap::Create(shm_pool_, initialize_map); + bi::managed_external_buffer::handle_t initialize_map_handle = + 
pb_map->ShmHandle(); + + initialize_message->Args() = initialize_map_handle; + stub_message_queue_->Push(initialize_message->ShmHandle()); + + std::unique_ptr initialize_response_message = + IPCMessage::LoadFromSharedMemory(shm_pool_, parent_message_queue_->Pop()); + + if (initialize_response_message->Command() != PYTHONSTUB_InitializeResponse) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string( + "Received unexpected response from Python backend stub: ") + + model_instance_name_) + .c_str()); + } + + auto initialize_response = + std::move((shm_pool_->Load( + initialize_response_message->Args()))) + .data_; + + if (initialize_response->response_has_error) { + if (initialize_response->response_is_error_set) { + std::unique_ptr error_message = PbString::LoadFromSharedMemory( + shm_pool_, initialize_response->response_error); + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, error_message->String().c_str()); + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("Launch stub process failed for ") + model_name_) + .c_str()); + } + } + + return nullptr; +} + +void +StubLauncher::UpdateHealth() +{ + is_healthy_ = false; + if (is_initialized_) { + { + bi::scoped_lock lock(*health_mutex_); + ipc_control_->stub_health = false; + } + + // Sleep 1 second so that the child process has a chance to change the + // health variable + sleep(1); + + { + bi::scoped_lock lock(*health_mutex_); + is_healthy_ = ipc_control_->stub_health; + } + } +} + +void +StubLauncher::TerminateStub() +{ + if (is_initialized_) { + bool force_kill = false; + if (is_healthy_) { + // Finalize command does not have any arguments. + std::unique_ptr ipc_message = + IPCMessage::Create(shm_pool_, false /* inline_response */); + + ipc_message->Command() = PYTHONSTUB_FinalizeRequest; + stub_message_queue_->Push(ipc_message->ShmHandle()); + parent_message_queue_->Pop(); + + stub_message_queue_.reset(); + parent_message_queue_.reset(); + memory_manager_.reset(); + } else { + force_kill = true; + } + + int status; + if (force_kill) { + kill(stub_pid_, SIGKILL); + } + waitpid(stub_pid_, &status, 0); + } + + // First destroy the IPCControl. This makes sure that IPCControl is + // destroyed before the shared memory manager goes out of scope. + ipc_control_.reset(); + stub_message_queue_.reset(); + parent_message_queue_.reset(); + memory_manager_.reset(); +} + +void +StubLauncher::KillStubProcess() +{ + kill(stub_pid_, SIGKILL); + int status; + waitpid(stub_pid_, &status, 0); + stub_pid_ = 0; +} + +}}}; // namespace triton::backend::python diff --git a/src/stub_launcher.h b/src/stub_launcher.h new file mode 100644 index 00000000..4c3070be --- /dev/null +++ b/src/stub_launcher.h @@ -0,0 +1,172 @@ +// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ipc_message.h" +#include "memory_manager.h" +#include "message_queue.h" +#include "pb_utils.h" +#include "triton/backend/backend_common.h" +#include "triton/backend/backend_model.h" +#include "triton/backend/backend_model_instance.h" +#include "triton/core/tritonbackend.h" +#include "triton/core/tritonserver.h" + +namespace triton { namespace backend { namespace python { + +class ModelState; + +class StubLauncher { + public: + StubLauncher(const std::string stub_process_kind); + StubLauncher( + const std::string stub_process_kind, + const std::string model_instance_name, const int32_t device_id, + const std::string kind); + + // Initialize stub process + TRITONSERVER_Error* Initialize(ModelState* model_state); + + // Stub process setup + TRITONSERVER_Error* Setup(); + + // Launch stub process + TRITONSERVER_Error* Launch(); + + // Auto-complete stub process + void AutocompleteStubProcess(); + + // Model instance stub process + TRITONSERVER_Error* ModelInstanceStubProcess(); + + // Stub PID + pid_t StubPid() { return stub_pid_; } + + // Health mutex + bi::interprocess_mutex* HealthMutex() { return health_mutex_; } + + // Stub message queue + std::unique_ptr>& + StubMessageQueue() + { + return stub_message_queue_; + } + + // Parent message queue + std::unique_ptr>& + ParentMessageQueue() + { + return parent_message_queue_; + } + + // Memory Manager + std::unique_ptr& GetMemoryManager() { return memory_manager_; } + + // IPC control + std::unique_ptr>& + IpcControl() + { + return ipc_control_; + } + + // Shared memory pool + std::unique_ptr& ShmPool() { return shm_pool_; } + + // Get auto-complete model configuration + common::TritonJson::Value& AutoCompleteConfig() + { + return auto_complete_config_; + } + + // Update health variable + void UpdateHealth(); + + // Is Healthy + bool IsHealthy() { return is_healthy_; } + + // Destruct Stub process + void TerminateStub(); + + // Kill stub process + void KillStubProcess(); + + private: + pid_t parent_pid_; + pid_t stub_pid_; + + bool is_initialized_; + bool is_decoupled_; + bool is_healthy_; + std::string shm_region_name_; + std::string model_repository_path_; + std::string model_path_; + const std::string stub_process_kind_; + std::string model_name_; + const std::string model_instance_name_; + const int32_t device_id_; + const std::string kind_; + uint64_t model_version_; + + std::string python_lib_; + int64_t shm_default_byte_size_; + int64_t shm_growth_byte_size_; + int64_t shm_message_queue_size_; + + // Path to python execution environment + std::string path_to_libpython_; + std::string 
path_to_activate_; + std::string python_execution_env_; + + common::TritonJson::WriteBuffer model_config_buffer_; + common::TritonJson::Value auto_complete_config_; + + bi::interprocess_mutex* health_mutex_; + std::unique_ptr> + stub_message_queue_; + std::unique_ptr> + parent_message_queue_; + std::unique_ptr memory_manager_; + std::unique_ptr> + ipc_control_; + bi::managed_external_buffer::handle_t ipc_control_handle_; + std::unique_ptr shm_pool_; +}; +}}} // namespace triton::backend::python From 242770bdd96de302b6d63e95e3865e0b73af51b8 Mon Sep 17 00:00:00 2001 From: Tanmay Verma Date: Fri, 3 Jun 2022 11:51:38 -0700 Subject: [PATCH 043/216] Remove beta quality from decoupled API in python backend (#162) --- README.md | 16 ++++------------ examples/decoupled/README.md | 2 +- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 4eb87d6f..775ff73a 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ any C++ code. - [`initialize`](#initialize) - [`execute`](#execute) - [Default Mode](#default-mode) - - [Decoupled Mode](#decoupled-mode-beta) + - [Decoupled Mode](#decoupled-mode) - [`finalize`](#finalize) - [Model Config File](#model-config-file) - [Using Custom Python Execution Environments](#using-custom-python-execution-environments) @@ -444,7 +444,7 @@ class TritonPythonModel: ``` -#### Decoupled mode \[Beta\] +#### Decoupled mode This mode allows user to send multiple responses for a request or not send any responses for a request. A model may also send @@ -506,15 +506,7 @@ for more details on how to host a decoupled model. ##### Known Issues -The support for decoupled models is still in beta and suffers -from below known issues: - -* The decoupled mode doesn't support [FORCE_CPU_ONLY_INPUT_TENSORS](#input-tensor-device-placement) - parameter to be turned off. This means that the input tensors - will always be in CPU. -* Currently, the InferenceResponseSender.send method only supports - inference_response objects that contain only CPU tensors. -* The metrics collection may be incomplete. +* Currently, async BLS requests are not supported in decoupled mode. ### `finalize` @@ -1043,7 +1035,7 @@ You can find the complete example instructions in [examples/preprocessing](examp ## Decoupled Models The examples of decoupled models shows how to develop and serve -[decoupled models](../../README.md#decoupled-mode-beta) in Triton using Python backend. +[decoupled models](../../README.md#decoupled-mode) in Triton using Python backend. You can find the complete example instructions in [examples/decoupled](examples/decoupled/README.md). # Running with Inferentia diff --git a/examples/decoupled/README.md b/examples/decoupled/README.md index c000749c..5e231c78 100644 --- a/examples/decoupled/README.md +++ b/examples/decoupled/README.md @@ -29,7 +29,7 @@ # Decoupled Model Examples In this section we demonstrate an end-to-end examples for developing and -serving [decoupled models](../../README.md#decoupled-mode-beta) in Python backend. +serving [decoupled models](../../README.md#decoupled-mode) in Python backend. [repeat_model.py](repeat_model.py) and [square_model.py](square_model.py) demonstrate how to write a decoupled model where each request can generate 0 to many responses. 
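Because a decoupled model can return zero, one, or many responses per request, the examples above are exercised with Triton's streaming gRPC client rather than a single HTTP request/response pair. The sketch below shows the general pattern only; the model and tensor names (`square_int32`, `IN`, `OUT`) are assumptions for illustration and should be adjusted to the model actually being tested:

```python
from functools import partial
import queue

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype

results = queue.Queue()


def callback(result_queue, result, error):
    # Each stream callback delivers one response (or an error).
    result_queue.put((result, error))


with grpcclient.InferenceServerClient("localhost:8001") as client:
    client.start_stream(callback=partial(callback, results))

    data = np.array([4], dtype=np.int32)
    infer_input = grpcclient.InferInput("IN", list(data.shape),
                                        np_to_triton_dtype(data.dtype))
    infer_input.set_data_from_numpy(data)

    # A square-style decoupled model replies with `data[0]` responses
    # for this single request.
    client.async_stream_infer(model_name="square_int32", inputs=[infer_input])

    for _ in range(int(data[0])):
        result, error = results.get()
        if error is not None:
            raise error
        print(result.as_numpy("OUT"))
```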
From 8ae1782df2edc805d224ce5ebc3dda16e274a3e1 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Fri, 3 Jun 2022 15:36:51 -0700 Subject: [PATCH 044/216] Add README for auto-complete example (#161) * Add README and modify the client output * Address review * Remove parenthesis * Remove inference part in client.py * Fix error message --- examples/auto_complete/README.md | 108 +++++++++++++++++++++++++++++++ examples/auto_complete/client.py | 47 ++------------ examples/bls/sync_client.py | 10 +-- 3 files changed, 117 insertions(+), 48 deletions(-) create mode 100644 examples/auto_complete/README.md diff --git a/examples/auto_complete/README.md b/examples/auto_complete/README.md new file mode 100644 index 00000000..662bee9a --- /dev/null +++ b/examples/auto_complete/README.md @@ -0,0 +1,108 @@ + + +# Auto-Complete Example + +This example shows how to implement +[`auto_complete_config`](https://github.com/triton-inference-server/python_backend/#auto_complete_config) +function in Python backend to provide +[`max_batch_size`](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#maximum-batch-size), +[`input`](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#inputs-and-outputs) +and [`output`](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#inputs-and-outputs) +properties. These properties will allow Triton to load the Python model with +[Minimal Model Configuration](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#minimal-model-configuration) +in absence of a configuration file. + +The +[model repository](https://github.com/triton-inference-server/server/blob/main/docs/model_repository.md) +should contain [nobatch_auto_complete](./nobatch_model.py), and +[batch_auto_complete](./batch_model.py) models. +The max_batch_size of [nobatch_auto_complete](./nobatch_model.py) model is set +to zero, whereas the max_batch_size of [batch_auto_complete](./batch_model.py) +model is set to 4. For models with a non-zero value of max_batch_size, the +configuration can specify a different value of max_batch_size as long as it +does not exceed the value set in the model file. + +The +[nobatch_auto_complete](./nobatch_model.py) and +[batch_auto_complete](./batch_model.py) models calculate the sum and difference +of the `INPUT0` and `INPUT1` and put the results in `OUTPUT0` and `OUTPUT1` +respectively. + +## Deploying the Auto-Complete Models + +1. Create the model repository: + +```console +$ mkdir -p models/nobatch_auto_complete/1/ +$ mkdir -p models/batch_auto_complete/1/ + +# Copy the Python models +$ cp examples/auto_complete/nobatch_model.py models/nobatch_auto_complete/1/model.py +$ cp examples/auto_complete/batch_model.py models/batch_auto_complete/1/model.py +``` +**Note that we don't need a model configuration file since Triton will use the +auto-complete model configuration provided in the Python model.** + +2. Start the tritonserver with +[`--strict-model-config=false`](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#auto-generated-model-configuration): + +``` +tritonserver --model-repository `pwd`/models --strict-model-config=false +``` + +## Running inferences on Nobatch and Batch models: + +Send inference requests using [client.py](./client.py). 
+ +``` +python3 examples/auto_complete/client.py +``` + +You should see an output similar to the output below: + +``` +'nobatch_auto_complete' configuration matches the expected auto complete configuration + +'batch_auto_complete' configuration matches the expected auto complete configuration + +PASS: auto_complete + +``` + +The [nobatch_model.py](./nobatch_model.py) and [batch_model.py](./batch_model.py) +model files are heavily commented with explanations about how to utilize +`set_max_batch_size`, `add_input`, and `add_output`functions to set +`max_batch_size`, `input` and `output` properties of the model. + +### Explanation of the Client Output + +For each model, the [client.py](./client.py) first requests the model +configuration from Triton to validate if the model configuration has been +registered as expected. The client then sends an inference request to verify +whether the inference has run properly and the result is correct. diff --git a/examples/auto_complete/client.py b/examples/auto_complete/client.py index 9ddf5d32..d2ef893b 100644 --- a/examples/auto_complete/client.py +++ b/examples/auto_complete/client.py @@ -32,8 +32,6 @@ nobatch_model_name = "nobatch_auto_complete" batch_model_name = "batch_auto_complete" -nobatch_shape = [4] -batch_shape = [1, 4] def validate_ios(config, expected_ios, model_name): @@ -72,10 +70,10 @@ def validate_ios(config, expected_ios, model_name): 'data_type': 'TYPE_FP32', 'dims': [4] }] + models = [nobatch_model_name, batch_model_name] - shapes = [nobatch_shape, batch_shape] - for model_name, shape in zip(models, shapes): + for model_name in models: # Validate the auto-complete model configuration model_config = client.get_model_config(model_name) if model_config["max_batch_size"] != expected_max_batch_size[ @@ -85,45 +83,8 @@ def validate_ios(config, expected_ios, model_name): sys.exit(1) validate_ios(model_config["input"], expected_inputs, model_name) validate_ios(model_config["output"], expected_outputs, model_name) - - input0_data = np.random.rand(*shape).astype(np.float32) - input1_data = np.random.rand(*shape).astype(np.float32) - inputs = [ - httpclient.InferInput("INPUT0", input0_data.shape, - np_to_triton_dtype(input0_data.dtype)), - httpclient.InferInput("INPUT1", input1_data.shape, - np_to_triton_dtype(input1_data.dtype)), - ] - - inputs[0].set_data_from_numpy(input0_data) - inputs[1].set_data_from_numpy(input1_data) - - outputs = [ - httpclient.InferRequestedOutput("OUTPUT0"), - httpclient.InferRequestedOutput("OUTPUT1"), - ] - - response = client.infer(model_name, - inputs, - request_id=str(1), - outputs=outputs) - - result = response.get_response() - output0_data = response.as_numpy("OUTPUT0") - output1_data = response.as_numpy("OUTPUT1") - - print("INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( - input0_data, input1_data, output0_data)) - print("INPUT0 ({}) - INPUT1 ({}) = OUTPUT0 ({})".format( - input0_data, input1_data, output1_data)) - - if not np.allclose(input0_data + input1_data, output0_data): - print("auto_complete example error: incorrect sum") - sys.exit(1) - - if not np.allclose(input0_data - input1_data, output1_data): - print("auto_complete example error: incorrect difference") - sys.exit(1) + print("'" + model_name + "' configuration matches the expected " + + "auto complete configuration\n") print('PASS: auto_complete') diff --git a/examples/bls/sync_client.py b/examples/bls/sync_client.py index 83cacb93..5d36e8a9 100644 --- a/examples/bls/sync_client.py +++ b/examples/bls/sync_client.py @@ -1,4 +1,4 @@ -# 
Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -68,11 +68,11 @@ print("INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 ({})".format( input0_data, input1_data, output1_data)) if not np.allclose(input0_data + input1_data, output0_data): - print("BLS async example error: incorrect sum") + print("BLS sync example error: incorrect sum") sys.exit(1) if not np.allclose(input0_data - input1_data, output1_data): - print("BLS async example error: incorrect difference") + print("BLS sync example error: incorrect difference") sys.exit(1) # Will perform the inference request on the pytorch model: @@ -92,11 +92,11 @@ print("INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 ({})".format( input0_data, input1_data, output1_data)) if not np.allclose(input0_data + input1_data, output0_data): - print("BLS async example error: incorrect sum") + print("BLS sync example error: incorrect sum") sys.exit(1) if not np.allclose(input0_data - input1_data, output1_data): - print("BLS async example error: incorrect difference") + print("BLS sync example error: incorrect difference") sys.exit(1) # Will perform the same inference request on an undefined model. This leads From 73fdcdad66f79064e5fb04b66fa30af375b6d0fa Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 6 Jun 2022 12:41:55 -0400 Subject: [PATCH 045/216] Add request statistic reporting for decoupled mode (#163) --- src/pb_metric_reporter.cc | 32 +++++++++++++++++++++++--------- src/pb_metric_reporter.h | 2 ++ src/python_be.cc | 29 +++++++++++++++++++++++++++-- src/python_be.h | 3 ++- 4 files changed, 54 insertions(+), 12 deletions(-) diff --git a/src/pb_metric_reporter.cc b/src/pb_metric_reporter.cc index 20786872..19362905 100644 --- a/src/pb_metric_reporter.cc +++ b/src/pb_metric_reporter.cc @@ -36,7 +36,8 @@ PbMetricReporter::PbMetricReporter( std::shared_ptr> responses) : instance_(instance), requests_(requests), request_count_(request_count), responses_(responses), total_batch_size_(0), exec_start_ns_(0), - compute_start_ns_(0), compute_end_ns_(0), exec_end_ns_(0) + compute_start_ns_(0), compute_end_ns_(0), exec_end_ns_(0), + success_status_(true) { } @@ -51,12 +52,19 @@ PbMetricReporter::~PbMetricReporter() // request object. We use the execution start/end time for // compute also so that the entire execution time is associated // with the inference computation. - LOG_IF_ERROR( - TRITONBACKEND_ModelInstanceReportStatistics( - instance_, request, - ((*responses_)[r] != nullptr) /* success */, exec_start_ns_, - compute_start_ns_, compute_end_ns_, exec_end_ns_), - "failed reporting request statistics"); + if (responses_) { + LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportStatistics( + instance_, request, ((*responses_)[r] != nullptr) /* success */, + exec_start_ns_, compute_start_ns_, compute_end_ns_, exec_end_ns_), + "failed reporting request statistics"); + } else { + LOG_IF_ERROR( + TRITONBACKEND_ModelInstanceReportStatistics( + instance_, request, success_status_, exec_start_ns_, + compute_start_ns_, compute_end_ns_, exec_end_ns_), + "failed reporting request statistics"); + } } // Report the entire batch statistics. 
This backend does not support @@ -64,8 +72,8 @@ PbMetricReporter::~PbMetricReporter() if (total_batch_size_ != 0) { LOG_IF_ERROR( TRITONBACKEND_ModelInstanceReportBatchStatistics( - instance_, total_batch_size_, exec_start_ns_, - compute_start_ns_, compute_end_ns_, exec_end_ns_), + instance_, total_batch_size_, exec_start_ns_, compute_start_ns_, + compute_end_ns_, exec_end_ns_), "failed reporting batch request statistics"); } } @@ -100,4 +108,10 @@ PbMetricReporter::SetExecEndNs(const uint64_t exec_end_ns) exec_end_ns_ = exec_end_ns; } +void +PbMetricReporter::SetSuccessStatus(const bool success_status) +{ + success_status_ = success_status; +} + }}} // namespace triton::backend::python diff --git a/src/pb_metric_reporter.h b/src/pb_metric_reporter.h index 978a949d..88062f86 100644 --- a/src/pb_metric_reporter.h +++ b/src/pb_metric_reporter.h @@ -42,6 +42,7 @@ class PbMetricReporter { uint64_t compute_start_ns_; uint64_t compute_end_ns_; uint64_t exec_end_ns_; + bool success_status_; public: PbMetricReporter( @@ -54,5 +55,6 @@ class PbMetricReporter { void SetComputeStartNs(const uint64_t compute_start_ns); void SetComputeEndNs(const uint64_t compute_end_ns); void SetExecEndNs(const uint64_t exec_end_ns); + void SetSuccessStatus(const bool success_status); }; }}}; // namespace triton::backend::python diff --git a/src/python_be.cc b/src/python_be.cc index 916be898..5d9f8800 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -926,7 +926,8 @@ ModelInstanceState::ResponseSendDecoupled( TRITONSERVER_Error* ModelInstanceState::ProcessRequestsDecoupled( TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector>& pb_inference_requests) + std::vector>& pb_inference_requests, + PbMetricReporter& reporter) { NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); closed_requests_ = {}; @@ -954,6 +955,10 @@ ModelInstanceState::ProcessRequestsDecoupled( requests, request_count, pb_inference_requests, request_batch, responses)); + uint64_t compute_start_ns = 0; + SET_TIMESTAMP(compute_start_ns); + reporter.SetComputeStartNs(compute_start_ns); + std::unique_ptr ipc_message; RETURN_IF_EXCEPTION( ipc_message = @@ -971,6 +976,12 @@ ModelInstanceState::ProcessRequestsDecoupled( AllocatedSharedMemory response_batch = Stub()->ShmPool()->Load(received_message_->Args()); + + uint64_t compute_end_ns = 0; + SET_TIMESTAMP(compute_end_ns); + reporter.SetComputeEndNs(compute_end_ns); + reporter.SetBatchStatistics(request_count); + if (response_batch.data_->has_error) { if (response_batch.data_->is_error_set) { auto error = PbString::LoadFromSharedMemory( @@ -1819,10 +1830,24 @@ TRITONBACKEND_ModelInstanceExecute( } } else { std::vector> infer_requests; + + uint64_t exec_start_ns = 0; + SET_TIMESTAMP(exec_start_ns); + + PbMetricReporter reporter( + instance_state->TritonModelInstance(), requests, request_count, + nullptr); + reporter.SetExecStartNs(exec_start_ns); + error = instance_state->ProcessRequestsDecoupled( - requests, request_count, infer_requests); + requests, request_count, infer_requests, reporter); + + uint64_t exec_end_ns = 0; + SET_TIMESTAMP(exec_end_ns); + reporter.SetExecEndNs(exec_end_ns); if (error != nullptr) { + reporter.SetSuccessStatus(false); for (uint32_t r = 0; r < request_count; ++r) { TRITONBACKEND_Request* request = requests[r]; if (!instance_state->ExistsInClosedRequests( diff --git a/src/python_be.h b/src/python_be.h index d66e169a..00ad0dd4 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -320,7 +320,8 @@ class ModelInstanceState : public BackendModelInstance { // 
Process all the requests in the decoupled mode. TRITONSERVER_Error* ProcessRequestsDecoupled( TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector>& pb_infer_requests); + std::vector>& pb_infer_requests, + PbMetricReporter& pb_metric_reporter); bool ExistsInClosedRequests(intptr_t closed_request); From b1063019519681ff68770a44b5fb181ed1896c54 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 6 Jun 2022 18:12:33 -0400 Subject: [PATCH 046/216] Fix deferred callback when there is an error (#164) --- src/python_be.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/python_be.cc b/src/python_be.cc index 5d9f8800..a84d71ed 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -1178,6 +1178,7 @@ ModelInstanceState::ProcessRequests( TRITONBACKEND_Response* response = (*responses)[r]; TRITONBACKEND_Request* request = requests[r]; uint32_t requested_output_count = 0; + requires_deferred_callback.push_back(false); shm_responses.emplace_back(nullptr); std::unique_ptr& infer_response = shm_responses.back(); @@ -1236,13 +1237,14 @@ ModelInstanceState::ProcessRequests( gpu_output_buffers[r], requested_output_names, response); GUARDED_RESPOND_IF_ERROR(responses, r, *error); + requires_deferred_callback[r] = require_deferred_callback; + // Error object will be deleted by the GUARDED_RESPOND macro *error = nullptr; error.reset(); - if (require_deferred_callback) { + if (requires_deferred_callback[r]) { has_gpu_output = true; } - requires_deferred_callback.push_back(require_deferred_callback); } // Finalize the execute. From 13ee861810b8808f8b635ccfab79fd0d40f78561 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Mon, 6 Jun 2022 20:07:52 -0700 Subject: [PATCH 047/216] Fix document (#166) --- examples/bls/README.md | 6 +++--- examples/bls/async_client.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/bls/README.md b/examples/bls/README.md index d27c0770..3876b027 100644 --- a/examples/bls/README.md +++ b/examples/bls/README.md @@ -1,5 +1,5 @@ + +# JAX Example + +In this section, we demonstrate an end-to-end example for using +[JAX](https://jax.readthedocs.io/en/latest/) in Python Backend. + +## Create a JAX AddSub model repository + +We will use the files that come with this example to create the model +repository. + +First, download the [client.py](client.py), [config.pbtxt](config.pbtxt) and +[model.py](model.py) to your local machine. + +Next, at the directory where the three files located, create the model +repository with the following commands: +``` +$ mkdir -p models/jax/1 +$ mv model.py models/jax/1 +$ mv config.pbtxt models/jax +``` + +## Pull the Triton Docker images + +We need to install Docker and NVIDIA Container Toolkit before proceeding, refer +to the +[installation steps](https://github.com/triton-inference-server/server/tree/main/docs#installation). + +To pull the latest containers, run the following commands: +``` +$ docker pull nvcr.io/nvidia/tritonserver:-py3 +$ docker pull nvcr.io/nvidia/tritonserver:-py3-sdk +``` +See the installation steps above for the `` version. + +At the time of writing, the latest version is `22.08`, which translates to the +following commands: +``` +$ docker pull nvcr.io/nvidia/tritonserver:22.08-py3 +$ docker pull nvcr.io/nvidia/tritonserver:22.08-py3-sdk +``` + +Be sure to replace the `` with the version pulled for all the remaining +parts of this example. 
+ +## Start the Triton Server + +At the directory where we created the JAX models (at where the "models" folder +is located), run the following command: +``` +$ docker run --gpus all -it --rm -p 8000:8000 -v `pwd`:/jax nvcr.io/nvidia/tritonserver:-py3 /bin/bash +``` + +Inside the container, we need to install JAX to run this example. + +We recommend using the `pip` method mentioned in the +[JAX documentation](https://github.com/google/jax#pip-installation-gpu-cuda). +Make sure that JAX is available in the same Python environment as other +dependencies. + +To install for this example, run the following command: +``` +$ pip3 install --upgrade "jax[cuda]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +``` + +Finally, we need to start the Triton Server, run the following command: +``` +$ tritonserver --model-repository=/jax/models +``` + +To leave the container for the next step, press: `CTRL + P + Q`. + +## Test inference + +At the directory where the client.py is located, run the following command: +``` +$ docker run --rm --net=host -v `pwd`:/jax nvcr.io/nvidia/tritonserver:-py3-sdk python3 /jax/client.py +``` + +A successful inference will print the following at the end: +``` +INPUT0 ([0.89262384 0.645457 0.18913145 0.17099917]) + INPUT1 ([0.5703733 0.21917151 0.22854741 0.97336507]) = OUTPUT0 ([1.4629972 0.86462855 0.41767886 1.1443642 ]) +INPUT0 ([0.89262384 0.645457 0.18913145 0.17099917]) - INPUT1 ([0.5703733 0.21917151 0.22854741 0.97336507]) = OUTPUT0 ([ 0.32225055 0.4262855 -0.03941596 -0.8023659 ]) +PASS: jax +``` +Note: You inputs can be different from the above, but the outputs always +correspond to its inputs. diff --git a/examples/jax/client.py b/examples/jax/client.py new file mode 100644 index 00000000..d3c19a8f --- /dev/null +++ b/examples/jax/client.py @@ -0,0 +1,77 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from tritonclient.utils import * +import tritonclient.http as httpclient +import sys +import numpy as np + +model_name = "jax" +shape = [4] + +with httpclient.InferenceServerClient("localhost:8000") as client: + + input0_data = np.random.rand(*shape).astype(np.float32) + input1_data = np.random.rand(*shape).astype(np.float32) + inputs = [ + httpclient.InferInput("INPUT0", input0_data.shape, + np_to_triton_dtype(input0_data.dtype)), + httpclient.InferInput("INPUT1", input1_data.shape, + np_to_triton_dtype(input1_data.dtype)), + ] + + inputs[0].set_data_from_numpy(input0_data) + inputs[1].set_data_from_numpy(input1_data) + + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0"), + httpclient.InferRequestedOutput("OUTPUT1"), + ] + + response = client.infer(model_name, + inputs, + request_id=str(1), + outputs=outputs) + + result = response.get_response() + output0_data = response.as_numpy("OUTPUT0") + output1_data = response.as_numpy("OUTPUT1") + + print("INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output0_data)) + print("INPUT0 ({}) - INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output1_data)) + + if not np.allclose(input0_data + input1_data, output0_data): + print("jax example error: incorrect sum") + sys.exit(1) + + if not np.allclose(input0_data - input1_data, output1_data): + print("jax example error: incorrect difference") + sys.exit(1) + + print('PASS: jax') + sys.exit(0) diff --git a/examples/jax/config.pbtxt b/examples/jax/config.pbtxt new file mode 100644 index 00000000..a7e5e5e2 --- /dev/null +++ b/examples/jax/config.pbtxt @@ -0,0 +1,59 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "jax" +backend: "python" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/examples/jax/model.py b/examples/jax/model.py new file mode 100644 index 00000000..b6ea2d35 --- /dev/null +++ b/examples/jax/model.py @@ -0,0 +1,154 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import numpy as np +import jax.numpy as jnp + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils + + +def AddSub(input_0, input_1): + """ + Simple AddSub operations in JAX. This outputs the sum and subtraction of + the inputs. + JAX API: https://jax.readthedocs.io/en/latest/jax.html + """ + output_0 = jnp.add(input_0, input_1) + output_1 = jnp.subtract(input_0, input_1) + return [output_0, output_1] + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to intialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Absolute model repository path + * model_version: Model version + * model_name: Model name + """ + + # You must parse model_config. JSON string is not parsed here + self.model_config = model_config = json.loads(args['model_config']) + + # Get OUTPUT0 configuration + output0_config = pb_utils.get_output_config_by_name( + model_config, "OUTPUT0") + + # Get OUTPUT1 configuration + output1_config = pb_utils.get_output_config_by_name( + model_config, "OUTPUT1") + + # Convert Triton types to numpy types + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config['data_type']) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config['data_type']) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + + responses = [] + + # Every Python backend must iterate over every one of the requests and + # create a pb_utils.InferenceResponse for each of them. + for request in requests: + # Get INPUT0 + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + # Get INPUT1 + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + out_0, out_1 = AddSub(in_0.as_numpy(), in_1.as_numpy()) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", + np.array(out_0).astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor( + "OUTPUT1", + np.array(out_1).astype(output1_dtype)) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occured")) + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1]) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. 
+ """ + print('Cleaning up...') From fdf08afea27fe5ed1bc40cf08b9a0911b207cad0 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Wed, 28 Sep 2022 17:42:26 -0700 Subject: [PATCH 063/216] Fix the handling for empty GPU tensor (#187) * Fix the handling for empty GPU tensor * Fix start_address initialization --- src/pb_memory.cc | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/pb_memory.cc b/src/pb_memory.cc index 8ca0b48a..2354391f 100644 --- a/src/pb_memory.cc +++ b/src/pb_memory.cc @@ -295,11 +295,15 @@ PbMemory::GetGPUStartAddress() { if (memory_shm_ptr_->memory_type == TRITONSERVER_MEMORY_GPU) { CUDAHandler& cuda_api = CUDAHandler::getInstance(); - CUdeviceptr start_address; - - cuda_api.PointerGetAttribute( - &start_address, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, - reinterpret_cast(data_ptr_)); + CUdeviceptr start_address = 0; + + // Skip this step for empty tensor as the CUDA API 'cuPointerGetAttribute' + // we use in this function does not accept nullptr. + if (data_ptr_) { + cuda_api.PointerGetAttribute( + &start_address, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, + reinterpret_cast(data_ptr_)); + } return reinterpret_cast(start_address); } From 9975a1272ad1f67769a61af863214e7ae5eeb305 Mon Sep 17 00:00:00 2001 From: Sazzad Hossain Date: Thu, 29 Sep 2022 16:02:00 +0600 Subject: [PATCH 064/216] Fix type in the decoupled example (#185) --- examples/decoupled/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/decoupled/README.md b/examples/decoupled/README.md index 5e231c78..6336c21f 100644 --- a/examples/decoupled/README.md +++ b/examples/decoupled/README.md @@ -33,7 +33,7 @@ serving [decoupled models](../../README.md#decoupled-mode) in Python backend. [repeat_model.py](repeat_model.py) and [square_model.py](square_model.py) demonstrate how to write a decoupled model where each request can generate 0 to many responses. -These files are heavily commented to describe each fuinction call. +These files are heavily commented to describe each function call. These example models are designed to show the flexibility available to decoupled models and in no way should be used in production. These examples circumvents the restriction placed by the [instance count](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#instance-groups) From 33c98ba5e60a859a1da2cb9ea1984856923053c4 Mon Sep 17 00:00:00 2001 From: Neal Vaidya Date: Fri, 7 Oct 2022 12:21:33 -0400 Subject: [PATCH 065/216] Fix broken links after refactor (#189) --- README.md | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index d6703b77..377131b4 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ class TritonPythonModel: ---------- auto_complete_model_config : pb_utils.ModelConfig An object containing the existing model configuration. You can build upon - the configuration given by this object when setting the properties for + the configuration given by this object when setting the properties for this model. Returns @@ -274,9 +274,9 @@ class TritonPythonModel: auto_complete_model_config.add_output(output) auto_complete_model_config.set_max_batch_size(0) - - # To enable a dynamic batcher with default settings, you can use - # auto_complete_model_config set_dynamic_batching() function. It is + + # To enable a dynamic batcher with default settings, you can use + # auto_complete_model_config set_dynamic_batching() function. 
It is # commented in this example because the max_batch_size is zero. # # auto_complete_model_config.set_dynamic_batching() @@ -348,22 +348,22 @@ Every Python backend can implement four main functions: ### `auto_complete_config` `auto_complete_config` is called only once when loading the model assuming -the server was not started with [`--disable-auto-complete-config`](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#auto-generated-model-configuration). +the server was not started with [`--disable-auto-complete-config`](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#auto-generated-model-configuration). Implementing this function is optional. No implementation of -`auto_complete_config` will do nothing. This function can be used to set +`auto_complete_config` will do nothing. This function can be used to set [`max_batch_size`]( - https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#maximum-batch-size), + https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#maximum-batch-size), [dynamic_batching]( - https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#dynamic-batcher), + https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#dynamic-batcher), [`input`]( - https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#inputs-and-outputs) and + https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#inputs-and-outputs) and [`output`]( - https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#inputs-and-outputs) + https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#inputs-and-outputs) properties of the model using `set_max_batch_size`, `set_dynamic_batching`, `add_input`, and `add_output`. These properties will allow Triton to load the model with [minimal model configuration]( - https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#minimal-model-configuration) + https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#minimal-model-configuration) in absence of a configuration file. This function returns the `pb_utils.ModelConfig` object with these properties. You can use the `as_dict` function to gain read-only access to the `pb_utils.ModelConfig` object. @@ -463,7 +463,7 @@ This mode allows user to send multiple responses for a request or not send any responses for a request. A model may also send responses out-of-order relative to the order that the request batches are executed. Such models are called *decoupled* models. In -order to use this mode, the [transaction policy](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#model-transaction-policy) +order to use this mode, the [transaction policy](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#model-transaction-policy) in the model configuration must be set to decoupled. @@ -514,7 +514,7 @@ The decoupled mode is powerful and supports various other use cases: The [decoupled examples](examples/decoupled/README.md) demonstrate full power of what can be acheived from decoupled API. 
Read -[Decoupled Backends and Models](https://github.com/triton-inference-server/server/blob/main/docs/decoupled_models.md) +[Decoupled Backends and Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md) for more details on how to host a decoupled model. ##### Known Issues @@ -611,7 +611,7 @@ linked shared libraries. If you use a different Python version, you should see that version instead. You need to copy the `triton_python_backend_stub` to the model directory of the models that want to use the custom Python backend stub. For example, if you have `model_a` in your -[model repository](https://github.com/triton-inference-server/server/blob/main/docs/model_repository.md), +[model repository](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md), the folder structure should look like below: ``` @@ -778,7 +778,7 @@ Because of GIL, it is not possible have multiple threads running in the same Python interpreter simultaneously as each thread requires to acquire the GIL when accessing Python objects which will serialize all the operations. In order to work around this issue, Python backend spawns a separate process for each -[model instance](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#multiple-model-instances). +[model instance](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#multiple-model-instances). This is in contrast with how other Triton backends such as [ONNXRuntime](https://github.com/triton-inference-server/onnxruntime_backend), [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend), and @@ -789,7 +789,7 @@ additional threads instead of spawning separate processes. # Business Logic Scripting Triton's -[ensemble](https://github.com/triton-inference-server/server/blob/main/docs/architecture.md#ensemble-models) +[ensemble](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#ensemble-models) feature supports many use cases where multiple models are composed into a pipeline (or more generally a DAG, directed acyclic graph). However, there are many other use cases that are not supported because as part of the model @@ -906,11 +906,11 @@ that you can execute in your model without running into the out of GPU or shared memory error. Note: Async BLS is not supported on Python 3.6 or lower due to the `async` keyword -and `asyncio.run` being introduced in Python 3.7. +and `asyncio.run` being introduced in Python 3.7. ## Using BLS with Stateful Models -[Stateful models](https://github.com/triton-inference-server/server/blob/main/docs/architecture.md#stateful-models) +[Stateful models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#stateful-models) require setting additional flags in the inference request to indicate the start and of a sequence. The `flags` argument in the `pb_utils.InferenceRequest` object can be used to indicate whether the request is the first or last request @@ -924,7 +924,7 @@ inference_request = pb_utils.InferenceRequest(model_name='model_name', request_id="1", correlation_id=4, flags=pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START) ``` -For indicating the ending of the sequence you can use the +For indicating the ending of the sequence you can use the `pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_END` flag. If the request is both starting and ending a sequence at the same time (i.e. 
the sequence has only a single request), you can use the bitwise OR operator to enable both of the @@ -1084,7 +1084,7 @@ Note that the Triton server's settings determine which log messages appear within the server log. For example, if a model attempts to log a verbose-level message, but Triton is not set to log verbose-level messages, it will not appear in the server log. For more information on Triton's log settings and -how to adjust them dynamically, please see Triton's +how to adjust them dynamically, please see Triton's [logging extension](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_logging.md) documentation. From f0de2110e5221314872c05f8cbe23e0b16162d55 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Fri, 7 Oct 2022 13:59:52 -0400 Subject: [PATCH 066/216] Fix link to the decoupled models (#190) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 377131b4..949af733 100644 --- a/README.md +++ b/README.md @@ -1053,7 +1053,7 @@ You can find the complete example instructions in [examples/preprocessing](examp ## Decoupled Models The examples of decoupled models shows how to develop and serve -[decoupled models](../../README.md#decoupled-mode) in Triton using Python backend. +[decoupled models](#decoupled-mode) in Triton using Python backend. You can find the complete example instructions in [examples/decoupled](examples/decoupled/README.md). # Running with Inferentia From 5225e39eecd35750c4e5872d293a47cd4bc7312f Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Fri, 14 Oct 2022 17:34:40 -0700 Subject: [PATCH 067/216] Update EXECUTION_ENV_PATH documentation (#191) * Improve doc on EXECUTION_ENV_PATH * Remove mentioning experiment feature on doc * Eliminate ambiguity to TRITON_MODEL_DIRECTORY --- README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 949af733..0e30162f 100644 --- a/README.md +++ b/README.md @@ -718,7 +718,16 @@ default version of the stub is Python 3.8. provide the path to the tar file in the `EXECUTION_ENV_PATH` in the `config.pbtxt` of all the models that want to use the execution environment. -4. If you need to compile the Python backend stub, it is recommended that you +4. If `$$TRITON_MODEL_DIRECTORY` is used in the `EXECUTION_ENV_PATH`, the final +`EXECUTION_ENV_PATH` **must not** escape from the `$$TRITON_MODEL_DIRECTORY`, +as the behavior of accessing anywhere outside the `$$TRITON_MODEL_DIRECTORY` is +**undefined**. + +5. If a non-`$$TRITON_MODEL_DIRECTORY` `EXECUTION_ENV_PATH` is used, only local +file system paths are currently supported. The behavior of using cloud paths is +**undefined**. + +6. If you need to compile the Python backend stub, it is recommended that you compile it in the official Triton NGC containers. Otherwise, your compiled stub may use dependencies that are not available in the Triton container that you are using for deployment. For example, compiling the Python backend stub on an OS From 6327d481cf41cc37a16c90e781820571a322edfb Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 31 Oct 2022 12:19:08 -0400 Subject: [PATCH 068/216] Improve the documentation for custom exec envs (#192) --- README.md | 77 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 0e30162f..ef0b12a4 100644 --- a/README.md +++ b/README.md @@ -50,9 +50,9 @@ any C++ code. 
- [Known Issues](#known-issues) - [`finalize`](#finalize) - [Model Config File](#model-config-file) - - [Using Custom Python Execution Environments](#using-custom-python-execution-environments) - - [1. Building Custom Python Backend Stub](#1-building-custom-python-backend-stub) - - [2. Packaging the Conda Environment](#2-packaging-the-conda-environment) + - [Managing Python Runtime and Libraries](#managing-python-runtime-and-libraries) + - [Building Custom Python Backend Stub](#building-custom-python-backend-stub) + - [Creating Custom Execution Environments](#creating-custom-execution-environments) - [Important Notes](#important-notes) - [Error Handling](#error-handling) - [Managing Shared Memory](#managing-shared-memory) @@ -547,21 +547,26 @@ models └── config.pbtxt ``` -## Using Custom Python Execution Environments +## Managing Python Runtime and Libraries Python backend shipped in the [NVIDIA GPU Cloud](https://ngc.nvidia.com/) -containers uses Python 3.8. If your Python model is compatible with Python 3.8 -and requires only modules already included in the Triton container, then you can -skip this section. If you need to use a different version of Python or if you -have additional dependencies, you need to recompile the stub executable and -create an execution environment as described below and include that with your -model. - -### 1. Building Custom Python Backend Stub - -**Important Note: If your Python model and its dependencies use Python 3.8, -you can skip this section and start from section 2 since the Python backend stub -shipped in Triton containers uses Python 3.8 by default.** +containers uses Python 3.8. Python backend is able to use the libaries +that exist in the current Python environment. These libraries can +be installed in a virtualenv, conda environment, or the global system +Python. These libraries will only be used if the Python version matches +the Python version of the Python backend's stub executable. For example, +if you install a set of libraries in a Python 3.9 environment and your +Python backend stub is compiled with Python 3.8 these libraries will NOT +be available in your Python model served using Triton. You would need to +compile the stub executble with Python 3.9 using the instructions in +[Building Custom Python Backend Stub](#building-custom-python-backend-stub) +section. + +### Building Custom Python Backend Stub + +**Important Note: You only need to compile a custom Python backend stub if the +Python version is different from Python 3.8 which is shipped by +default in the Triton containers.** Python backend uses a *stub* process to connect your `model.py` file to the Triton C++ core. This stub process has an embedded Python interpreter with @@ -570,19 +575,19 @@ different version from the default Python backend stub, you need to compile your Python backend stub by following the steps below: 1. Install the software packages below: -* [conda](https://docs.conda.io/en/latest/) * [cmake](https://cmake.org) * rapidjson and libarchive (instructions for installing these packages in Ubuntu or Debian are included in [Building from Source Section](#building-from-source)) +2. Make sure that the expected Python version is available in your environment. -2. Create and activate a [conda](https://docs.conda.io/en/latest/) environment with your desired Python version. 
In this example, we will be using Python 3.6: -```bash -conda create -n python-3-6 python=3.6 -conda activate python-3-6 +If you are using `conda`, you should make sure to activate the environment by +`conda activate `. Note that you don't have to use `conda` and +can install Python however you wish. Python backend relies on +[pybind11](https://github.com/pybind/pybind11) to find the correct Python +version. If you noticed that the correct Python version is not picked up, you +can read more on how +[pybind11 decides which Python to use](https://pybind11.readthedocs.io/en/stable/faq.html?highlight=cmake#cmake-doesn-t-detect-the-right-python-version). -# NumPy is required for Python models -conda install numpy -``` 3. Clone the Python backend repository and compile the Python backend stub (replace \ with the branch name that you want to use, for release branches it should be r\): @@ -595,7 +600,7 @@ $ cmake -DTRITON_ENABLE_GPU=ON -DTRITON_BACKEND_REPO_TAG= -DTRI $ make triton-python-backend-stub ``` -Now, you have access to a Python backend stub with Python 3.6. You can verify +Now, you have access to a Python backend stub with your Python version. You can verify that using `ldd`: ``` @@ -606,10 +611,11 @@ libpython3.6m.so.1.0 => /home/ubuntu/envs/miniconda3/envs/python-3-6/lib/libpyth ``` There are many other shared libraries printed in addition to the library posted -above. However, it is important to see `libpython3.6m.so.1.0` in the list of -linked shared libraries. If you use a different Python version, you should see -that version instead. You need to copy the `triton_python_backend_stub` to the -model directory of the models that want to use the custom Python backend +above. However, it is important to see `libpython.m.so.1.0` in the +list of linked shared libraries. If you use a different Python version, you +should see that version instead. You need to copy the +`triton_python_backend_stub` to the model directory of the models that want to +use the custom Python backend stub. For example, if you have `model_a` in your [model repository](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md), the folder structure should look like below: @@ -625,9 +631,11 @@ models Note the location of `triton_python_backend_stub` in the directory structure above. -### 2. Packaging the Conda Environment +### Creating Custom Execution Environments -It is also required to create a tar file that contains your conda environment. +If you want to create a tar file that contains all your Python dependencies or +you want to use different Python environments for each Python model you need to +create a *Custom Execution Environment* in Python backend. Currently, Python backend only supports [conda-pack](https://conda.github.io/conda-pack/) for this purpose. [conda-pack](https://conda.github.io/conda-pack/) ensures that your conda @@ -648,8 +656,9 @@ have exported [`PYTHONNOUSERSITE`](https://docs.python.org/3/using/cmdline.html# export PYTHONNOUSERSITE=True ``` -If this variable is not exported and similar packages are installed outside your conda environment, -your tar file may not contain all the dependencies required for an isolated Python environment. +If this variable is not exported and similar packages are installed outside your +conda environment, your tar file may not contain all the dependencies required +for an isolated Python environment. 
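For illustration, a sketch of how a packed environment is typically referenced from a model's `config.pbtxt` once the tar file (here assumed to be named `python-3-6.tar.gz` and placed in the model directory) has been created:

```
parameters: {
  key: "EXECUTION_ENV_PATH",
  value: {string_value: "$$TRITON_MODEL_DIRECTORY/python-3-6.tar.gz"}
}
```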
After creating the tar file from the conda environment, you need to tell Python backend to use that environment for your model. You can do this by adding the @@ -707,7 +716,7 @@ storage service. the version of triton_python_backend_stub. 2. If you don't want to use a different Python interpreter, you can skip -[Building Custom Python Backend Stub Step](#1-building-custom-python-backend-stub). +[Building Custom Python Backend Stub](#building-custom-python-backend-stub). In this case you only need to pack your environment using `conda-pack` and provide the path to tar file in the model config. However, the previous note still applies here and the version of the Python interpreter inside the conda From 8466d33ad69bd3f4db56789c6611f39467c3090e Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 31 Oct 2022 16:55:11 -0400 Subject: [PATCH 069/216] Fix broken links Python BE (#193) --- examples/auto_complete/README.md | 10 +++++----- examples/bls/README.md | 2 +- examples/decoupled/README.md | 5 +++-- inferentia/README.md | 9 +++++---- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/examples/auto_complete/README.md b/examples/auto_complete/README.md index 5cfd2f51..f530da3a 100644 --- a/examples/auto_complete/README.md +++ b/examples/auto_complete/README.md @@ -31,15 +31,15 @@ This example shows how to implement [`auto_complete_config`](https://github.com/triton-inference-server/python_backend/#auto_complete_config) function in Python backend to provide -[`max_batch_size`](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#maximum-batch-size), -[`input`](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#inputs-and-outputs) -and [`output`](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#inputs-and-outputs) +[`max_batch_size`](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#maximum-batch-size), +[`input`](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#inputs-and-outputs) +and [`output`](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#inputs-and-outputs) properties. These properties will allow Triton to load the Python model with -[Minimal Model Configuration](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#minimal-model-configuration) +[Minimal Model Configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#minimal-model-configuration) in absence of a configuration file. The -[model repository](https://github.com/triton-inference-server/server/blob/main/docs/model_repository.md) +[model repository](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md) should contain [nobatch_auto_complete](./nobatch_model.py), and [batch_auto_complete](./batch_model.py) models. The max_batch_size of [nobatch_auto_complete](./nobatch_model.py) model is set diff --git a/examples/bls/README.md b/examples/bls/README.md index 3876b027..b772257b 100644 --- a/examples/bls/README.md +++ b/examples/bls/README.md @@ -30,7 +30,7 @@ In this section we demonstrate an end-to-end example for [BLS](../../README.md#business-logic-scripting) in Python backend. 
The -[model repository](https://github.com/triton-inference-server/server/blob/main/docs/model_repository.md) +[model repository](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md) should contain [pytorch](../pytorch), [addsub](../add_sub). The [pytorch](../pytorch) and [addsub](../add_sub) models calculate the sum and difference of the `INPUT0` and `INPUT1` and put the results in `OUTPUT0` and diff --git a/examples/decoupled/README.md b/examples/decoupled/README.md index 6336c21f..22f4f68e 100644 --- a/examples/decoupled/README.md +++ b/examples/decoupled/README.md @@ -36,7 +36,8 @@ how to write a decoupled model where each request can generate 0 to many respons These files are heavily commented to describe each function call. These example models are designed to show the flexibility available to decoupled models and in no way should be used in production. These examples circumvents -the restriction placed by the [instance count](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#instance-groups) +the restriction placed by the +[instance count](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) and allows multiple requests to be in process even for single instance. In real deployment, the model should not allow the caller thread to return from `execute` until that instance is ready to handle another set of requests. @@ -341,4 +342,4 @@ stream stopped... Look how responses were delivered out-of-order of requests. The generated responses can be tracked to their request using -the `id` field. \ No newline at end of file +the `id` field. diff --git a/inferentia/README.md b/inferentia/README.md index f6b12d85..50d443e0 100644 --- a/inferentia/README.md +++ b/inferentia/README.md @@ -239,14 +239,14 @@ their need. To enable dynamic batching, `--enable_dynamic_batching` flag needs to be specified. `gen_triton_model.py` supports following three -options for configuring [Triton's dynamic batching](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md): +options for configuring [Triton's dynamic batching](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md): -1. `--preferred_batch_size`: Please refer to [model configuration documentation](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#preferred-batch-sizes) for details on preferred batch size. To optimize +1. `--preferred_batch_size`: Please refer to [model configuration documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#preferred-batch-sizes) for details on preferred batch size. To optimize performance, this is recommended to be multiples of engaged neuron cores. For example, if each instance is using 2 neuron cores, `preferred_batch_size` could be 2, 4 or 6. 2. `--max_queue_delay_microseconds`: Please refer to - [model configuration documentation](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#delayed-batching) for details. + [model configuration documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#delayed-batching) for details. 3. `--disable_batch_requests_to_neuron`: Enable the non-default way for Triton to handle batched requests. 
Triton backend will send each request to neuron separately, irrespective of if the Triton server requests are batched. @@ -254,7 +254,8 @@ options for configuring [Triton's dynamic batching](https://github.com/triton-in that do not perform well with batching without the flag. Additionally, `--max_batch_size` will affect the maximum batching limit. Please -refer to the [model configuration documentation](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#maximum-batch-size) +refer to the +[model configuration documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#maximum-batch-size) for details. ## Testing Inferentia Setup for Accuracy From c4adbfe55cf29195e29e0d96319c56b7e707d727 Mon Sep 17 00:00:00 2001 From: Tanmay Verma Date: Wed, 2 Nov 2022 12:13:08 -0700 Subject: [PATCH 070/216] Improve logger documentation (#194) --- README.md | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ef0b12a4..844b3d72 100644 --- a/README.md +++ b/README.md @@ -1084,13 +1084,24 @@ Your Python model can log information using the following methods: ```python import triton_python_backend_utils as pb_utils -... -logger = pb_utils.Logger -logger.log_info("Info Msg!") -logger.log_warn("Warning Msg!") -logger.log_error("Error Msg!") -logger.log_verbose("Verbose Msg!") + +class TritonPythonModel: + + def execute(self, requests): + ... + logger = pb_utils.Logger + logger.log_info("Info Msg!") + logger.log_warn("Warning Msg!") + logger.log_error("Error Msg!") + logger.log_verbose("Verbose Msg!") + ``` +*Note:* The logger can be defined and used in following class methods: + +* initialize +* execute +* finalize + Log messages can also be sent with their log-level explcitiy specified: ```python # log-level options: INFO, WARNING, ERROR, VERBOSE From 7668bdaf37dde11b2e9e966e57e9ce67186dd778 Mon Sep 17 00:00:00 2001 From: Hamid Ali Date: Sat, 5 Nov 2022 01:39:20 +0500 Subject: [PATCH 071/216] Logger release info and commands updated (#195) * Logger release info and commands updated README.md updated with following changes Logger release info added for better understanding Unnecessary $ sign removed from commands to support better copy/past using clipboard button * Update README.md Co-authored-by: Iman Tabrizian --- README.md | 54 +++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 844b3d72..3ebb8fec 100644 --- a/README.md +++ b/README.md @@ -80,45 +80,45 @@ any C++ code. 1. Run the Triton Inference Server container. ``` -$ docker run --shm-size=1g --ulimit memlock=-1 -p 8000:8000 -p 8001:8001 -p 8002:8002 --ulimit stack=67108864 -ti nvcr.io/nvidia/tritonserver:-py3 +docker run --shm-size=1g --ulimit memlock=-1 -p 8000:8000 -p 8001:8001 -p 8002:8002 --ulimit stack=67108864 -ti nvcr.io/nvidia/tritonserver:-py3 ``` Replace \ with the Triton version (e.g. 21.05). 2. Inside the container, clone the Python backend repository. ``` -$ git clone https://github.com/triton-inference-server/python_backend -b r +git clone https://github.com/triton-inference-server/python_backend -b r ``` 3. Install example model. 
``` -$ cd python_backend -$ mkdir -p models/add_sub/1/ -$ cp examples/add_sub/model.py models/add_sub/1/model.py -$ cp examples/add_sub/config.pbtxt models/add_sub/config.pbtxt +cd python_backend +mkdir -p models/add_sub/1/ +cp examples/add_sub/model.py models/add_sub/1/model.py +cp examples/add_sub/config.pbtxt models/add_sub/config.pbtxt ``` 4. Start the Triton server. ``` -$ tritonserver --model-repository `pwd`/models +tritonserver --model-repository `pwd`/models ``` 5. In the host machine, start the client container. ``` - docker run -ti --net host nvcr.io/nvidia/tritonserver:-py3-sdk /bin/bash +docker run -ti --net host nvcr.io/nvidia/tritonserver:-py3-sdk /bin/bash ``` 6. In the client container, clone the Python backend repository. ``` -$ git clone https://github.com/triton-inference-server/python_backend -b r +git clone https://github.com/triton-inference-server/python_backend -b r ``` 7. Run the example client. ``` -$ python3 python_backend/examples/add_sub/client.py +python3 python_backend/examples/add_sub/client.py ``` ## Building from Source @@ -145,10 +145,10 @@ sudo apt-get install rapidjson-dev libarchive-dev zlib1g-dev r21.06). ``` -$ mkdir build -$ cd build -$ cmake -DTRITON_ENABLE_GPU=ON -DTRITON_BACKEND_REPO_TAG= -DTRITON_COMMON_REPO_TAG= -DTRITON_CORE_REPO_TAG= -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install .. -$ make install +mkdir build +cd build +cmake -DTRITON_ENABLE_GPU=ON -DTRITON_BACKEND_REPO_TAG= -DTRITON_COMMON_REPO_TAG= -DTRITON_CORE_REPO_TAG= -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install .. +make install ``` The following required Triton repositories will be pulled and used in @@ -167,21 +167,21 @@ this location is `/opt/tritonserver`. 3. Copy example model and configuration ``` -$ mkdir -p models/add_sub/1/ -$ cp examples/add_sub/model.py models/add_sub/1/model.py -$ cp examples/add_sub/config.pbtxt models/add_sub/config.pbtxt +mkdir -p models/add_sub/1/ +cp examples/add_sub/model.py models/add_sub/1/model.py +cp examples/add_sub/config.pbtxt models/add_sub/config.pbtxt ``` 4. Start the Triton Server ``` -$ /opt/tritonserver/bin/tritonserver --model-repository=`pwd`/models +/opt/tritonserver/bin/tritonserver --model-repository=`pwd`/models ``` 5. Use the client app to perform inference ``` -$ python3 examples/add_sub/client.py +python3 examples/add_sub/client.py ``` ## Usage @@ -592,19 +592,19 @@ can read more on how (replace \ with the branch name that you want to use, for release branches it should be r\): ```bash -$ git clone https://github.com/triton-inference-server/python_backend -b +git clone https://github.com/triton-inference-server/python_backend -b -$ cd python_backend -$ mkdir build && cd build -$ cmake -DTRITON_ENABLE_GPU=ON -DTRITON_BACKEND_REPO_TAG= -DTRITON_COMMON_REPO_TAG= -DTRITON_CORE_REPO_TAG= -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install .. -$ make triton-python-backend-stub +cd python_backend +mkdir build && cd build +cmake -DTRITON_ENABLE_GPU=ON -DTRITON_BACKEND_REPO_TAG= -DTRITON_COMMON_REPO_TAG= -DTRITON_CORE_REPO_TAG= -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install .. +make triton-python-backend-stub ``` Now, you have access to a Python backend stub with your Python version. You can verify that using `ldd`: ``` -$ ldd triton_python_backend_stub +ldd triton_python_backend_stub ... libpython3.6m.so.1.0 => /home/ubuntu/envs/miniconda3/envs/python-3-6/lib/libpython3.6m.so.1.0 (0x00007fbb69cf3000) ... @@ -643,7 +643,7 @@ environment is portable. 
You can create a tar file for your conda environment using `conda-pack` command: ``` -$ conda-pack +conda-pack Collecting packages... Packing environment at '/home/iman/miniconda3/envs/python-3-6' to 'python-3-6.tar.gz' [########################################] | 100% Completed | 4.5s @@ -1080,7 +1080,7 @@ Please see the [README.md](https://github.com/triton-inference-server/python_bac # Logging -Your Python model can log information using the following methods: +Starting from 22.09 release, your Python model can log information using the following methods: ```python import triton_python_backend_utils as pb_utils From 685711d2815c073259d74d91f6d6a9dc9f79158a Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 28 Nov 2022 17:54:31 -0500 Subject: [PATCH 072/216] Acquire GIL lock when releasing DLPack tensors (#198) --- CMakeLists.txt | 4 ++-- src/pb_tensor.cc | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ce6275da..3f9855f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,7 +74,7 @@ FetchContent_MakeAvailable(repo-common repo-core repo-backend) FetchContent_Declare( pybind11 GIT_REPOSITORY "https://github.com/pybind/pybind11" - GIT_TAG "v2.6" + GIT_TAG "v2.10" GIT_SHALLOW ON ) FetchContent_MakeAvailable(pybind11) @@ -85,7 +85,7 @@ FetchContent_MakeAvailable(pybind11) FetchContent_Declare( dlpack GIT_REPOSITORY "https://github.com/dmlc/dlpack" - GIT_TAG "v0.5" + GIT_TAG "v0.7" GIT_SHALLOW ON ) FetchContent_MakeAvailable(dlpack) diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc index 248091b1..ae7e9678 100644 --- a/src/pb_tensor.cc +++ b/src/pb_tensor.cc @@ -247,6 +247,9 @@ PbTensor::ToDLPack() dlpack_tensor->dl_tensor.strides = nullptr; dlpack_tensor->manager_ctx = this; dlpack_tensor->deleter = [](DLManagedTensor* m) { + // We need to acquire GIL since the framework that deleted the dlpack tensor + // may not have acquired GIL when calling this function. 
+ py::gil_scoped_acquire gil; if (m->manager_ctx == nullptr) { return; } From 54c693e5eac685ee343a3d11d4d37494e92286a3 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Wed, 14 Dec 2022 18:09:25 -0500 Subject: [PATCH 073/216] Fix GPU buffers when dynamic batching is enabled (#199) --- src/ipc_message.cc | 16 ---------------- src/ipc_message.h | 1 - src/pb_stub.cc | 9 +-------- src/python_be.cc | 34 ++++++++++++++++++---------------- 4 files changed, 19 insertions(+), 41 deletions(-) diff --git a/src/ipc_message.cc b/src/ipc_message.cc index a81e3a2e..ea1dc5b0 100644 --- a/src/ipc_message.cc +++ b/src/ipc_message.cc @@ -133,20 +133,4 @@ IPCMessage::IPCMessage( ipc_message_handle_ = ipc_message_shm_.handle_; } -void -IPCMessage::Release() -{ - if (ipc_message_shm_.data_ != nullptr) { - ipc_message_shm_.data_.release(); - } - - if (response_mutex_shm_.data_ != nullptr) { - response_mutex_shm_.data_.release(); - } - - if (response_cond_shm_.data_ != nullptr) { - response_cond_shm_.data_.release(); - } -} - }}}; // namespace triton::backend::python diff --git a/src/ipc_message.h b/src/ipc_message.h index 3aad4904..8bb5c3d7 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -89,7 +89,6 @@ class IPCMessage { bi::interprocess_mutex* ResponseMutex(); bi::managed_external_buffer::handle_t& Args(); bi::managed_external_buffer::handle_t ShmHandle(); - void Release(); private: AllocatedSharedMemory ipc_message_shm_; diff --git a/src/pb_stub.cc b/src/pb_stub.cc index a85f8b8c..c2e15340 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -25,6 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "pb_stub.h" + #include #include #include @@ -748,22 +749,14 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) str + "'."); } } - - response_batch_shm_ptr->batch_size = response_size; - std::vector> gpu_tensors; for (size_t i = 0; i < batch_size; i++) { InferResponse* infer_response = responses[i].cast(); InferRequest* infer_request = py_request_list[i].cast(); infer_response->PruneOutputTensors(infer_request->RequestedOutputNames()); ProcessResponse(infer_response); - for (auto output_tensor : infer_response->OutputTensors()) { - if (!output_tensor->IsCPU()) { - gpu_tensors.push_back(output_tensor); - } - } responses_shm_handle[i] = infer_response->ShmHandle(); } } diff --git a/src/python_be.cc b/src/python_be.cc index fbd65a49..59941683 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -24,6 +24,7 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "python_be.h" + #include "pb_log.h" namespace triton { namespace backend { namespace python { @@ -400,7 +401,7 @@ ModelInstanceState::LaunchStubProcess() RETURN_IF_ERROR(Stub()->Setup()); StartLogMonitor(); RETURN_IF_ERROR(Stub()->Launch()); - + thread_pool_ = std::make_unique( model_state->StateForBackend()->thread_pool_size); @@ -881,9 +882,10 @@ ModelInstanceState::StartLogMonitor() log_monitor_ = std::thread(&ModelInstanceState::LogMessageQueueMonitor, this); } -void ModelInstanceState::TerminateLogMonitor() +void +ModelInstanceState::TerminateLogMonitor() { - if(log_thread_) { + if (log_thread_) { log_thread_ = false; Stub()->LogMessageQueue()->Push(DUMMY_MESSAGE); log_monitor_.join(); @@ -1252,9 +1254,8 @@ ModelInstanceState::ProcessRequests( std::vector requires_deferred_callback; std::vector> shm_responses; - std::unordered_map< - uint32_t, std::vector, void*>>> - gpu_output_buffers; + std::vector, void*>>> + gpu_output_buffers(request_count); for (uint32_t r = 0; r < request_count; ++r) { NVTX_RANGE(nvtx_, "LoadingResponse " + Name()); @@ -1338,7 +1339,7 @@ ModelInstanceState::ProcessRequests( if (has_gpu_output) { size_t total_gpu_buffers_count = 0; for (auto& gpu_output_buffer : gpu_output_buffers) { - total_gpu_buffers_count += gpu_output_buffer.second.size(); + total_gpu_buffers_count += gpu_output_buffer.size(); } AllocatedSharedMemory gpu_buffers_handle = Stub()->ShmPool()->Construct( @@ -1354,7 +1355,7 @@ ModelInstanceState::ProcessRequests( size_t index = 0; for (auto& gpu_output_buffer : gpu_output_buffers) { - for (auto& buffer_memory_pair : gpu_output_buffer.second) { + for (auto& buffer_memory_pair : gpu_output_buffer) { gpu_buffers_handle_shm[index] = buffer_memory_pair.first->ShmHandle(); ++index; } @@ -1369,12 +1370,12 @@ ModelInstanceState::ProcessRequests( bool cuda_copy = false; index = 0; + uint32_t response_index = 0; for (auto& gpu_output_buffer : gpu_output_buffers) { - for (auto& buffer_memory_pair : gpu_output_buffer.second) { + for (auto& buffer_memory_pair : gpu_output_buffer) { auto& pb_memory = buffer_memory_pair.first; if (pb_memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { - bool cuda_used; - uint32_t response_index = gpu_output_buffer.first; + bool cuda_used = false; void* pointer = buffer_memory_pair.second; GUARDED_RESPOND_IF_ERROR( @@ -1389,6 +1390,7 @@ ModelInstanceState::ProcessRequests( gpu_buffers_handle_shm[index] = pb_memory->ShmHandle(); ++index; } + response_index++; #ifdef TRITON_ENABLE_GPU if (cuda_copy) { cudaStreamSynchronize(stream_); @@ -1910,7 +1912,7 @@ TRITONBACKEND_ModelInstanceExecute( instance_state->TerminateLogMonitor(); instance_state->Stub()->KillStubProcess(); TRITONSERVER_Error* err = instance_state->Stub()->Setup(); - if(err == nullptr) { + if (err == nullptr) { instance_state->StartLogMonitor(); } LOG_IF_ERROR(err, "Failed to restart the stub process."); @@ -2015,11 +2017,11 @@ TRITONBACKEND_GetBackendAttribute( // Other instance groups setting are set to "no value" so that Triton core // will auto-complete them with default policy. 
#ifdef TRITON_ENABLE_GPU - RETURN_IF_ERROR(TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(backend_attributes, - TRITONSERVER_INSTANCEGROUPKIND_GPU, 0, nullptr, 0)); + RETURN_IF_ERROR(TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup( + backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_GPU, 0, nullptr, 0)); #else - RETURN_IF_ERROR(TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(backend_attributes, - TRITONSERVER_INSTANCEGROUPKIND_CPU, 0, nullptr, 0)); + RETURN_IF_ERROR(TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup( + backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_CPU, 0, nullptr, 0)); #endif return nullptr; From 62fe7267df99599aafb7eab09718d18821312719 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Thu, 29 Dec 2022 12:31:15 -0500 Subject: [PATCH 074/216] Make sure to use correct shared memory mapping before deallocation (#200) * Make sure to use correct shared memory mapping before deallocation * Add comment --- src/shm_manager.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/shm_manager.h b/src/shm_manager.h index 705d0872..67156e1f 100644 --- a/src/shm_manager.h +++ b/src/shm_manager.h @@ -179,6 +179,11 @@ class SharedMemoryManager { shm_ownership_data](T* memory) { bool destroy = false; bi::scoped_lock gaurd{*shm_mutex_}; + // Before using any shared memory function you need to make sure that you + // are using the correct mapping. For example, shared memory growth may + // happen between the time an object was created and the time the object + // gets destructed. + GrowIfNeeded(0); shm_ownership_data->ref_count_ -= 1; if (shm_ownership_data->ref_count_ == 0) { destroy = true; From 00468e5f9d4d61c4b2fabf941b87424291e81e36 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 12 Jan 2023 10:26:58 -0800 Subject: [PATCH 075/216] Move child process code to before fork (#204) --- src/stub_launcher.cc | 171 +++++++++++++++++++++---------------------- 1 file changed, 84 insertions(+), 87 deletions(-) diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index c2236a50..db3af88c 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -197,7 +197,6 @@ StubLauncher::Setup() TRITONSERVER_Error* StubLauncher::Launch() { - std::string stub_name; if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { stub_name = model_name_; @@ -205,100 +204,98 @@ StubLauncher::Launch() stub_name = model_instance_name_; } - pid_t pid = fork(); - if (pid < 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - "Failed to fork the stub process for auto-complete."); - } - if (pid == 0) { - const char* stub_args[4]; - stub_args[0] = "bash"; - stub_args[1] = "-c"; - stub_args[3] = nullptr; // Last argument must be nullptr + const char* stub_args[4]; + stub_args[0] = "bash"; + stub_args[1] = "-c"; + stub_args[3] = nullptr; // Last argument must be nullptr - // Default Python backend stub - std::string python_backend_stub = - python_lib_ + "/triton_python_backend_stub"; + // Default Python backend stub + std::string python_backend_stub = + python_lib_ + "/triton_python_backend_stub"; - // Path to alternative Python backend stub - std::string model_python_backend_stub = - std::string(model_repository_path_) + "/triton_python_backend_stub"; + // Path to alternative Python backend stub + std::string model_python_backend_stub = + std::string(model_repository_path_) + "/triton_python_backend_stub"; - if (FileExists(model_python_backend_stub)) { - python_backend_stub = model_python_backend_stub; - } - - std::string 
bash_argument; - - // This shared memory variable indicates whether the stub process should - // revert the LD_LIBRARY_PATH changes to avoid shared library issues in - // executables and libraries. - ipc_control_->uses_env = false; - if (python_execution_env_ != "") { - std::stringstream ss; - - // Need to properly set the LD_LIBRARY_PATH so that Python environments - // using different python versions load properly. - ss << "source " << path_to_activate_ - << " && exec env LD_LIBRARY_PATH=" << path_to_libpython_ - << ":$LD_LIBRARY_PATH " << python_backend_stub << " " << model_path_ - << " " << shm_region_name_ << " " << shm_default_byte_size_ << " " - << shm_growth_byte_size_ << " " << parent_pid_ << " " << python_lib_ - << " " << ipc_control_handle_ << " " << stub_name; - ipc_control_->uses_env = true; - bash_argument = ss.str(); - } else { - std::stringstream ss; - ss << " exec " << python_backend_stub << " " << model_path_ << " " - << shm_region_name_ << " " << shm_default_byte_size_ << " " - << shm_growth_byte_size_ << " " << parent_pid_ << " " << python_lib_ - << " " << ipc_control_handle_ << " " << stub_name; - bash_argument = ss.str(); - } - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("Starting Python backend stub: ") + bash_argument) - .c_str()); - - stub_args[2] = bash_argument.c_str(); - - int stub_status_code = - system((python_backend_stub + "> /dev/null 2>&1").c_str()); - - // If running stub process without any arguments returns any status code, - // other than 1, it can indicate a permission issue as a result of - // downloading the stub process from a cloud object storage service. - if (WEXITSTATUS(stub_status_code) != 1) { - // Give the execute permission for the triton_python_backend_stub to the - // owner. - int error = chmod(python_backend_stub.c_str(), S_IXUSR); - if (error != 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("Failed to give execute permission to " - "triton_python_backend_stub in ") + - python_backend_stub + " " + stub_name + - " Error No.: " + std::to_string(error)) - .c_str()); - } - } + if (FileExists(model_python_backend_stub)) { + python_backend_stub = model_python_backend_stub; + } - if (execvp("bash", (char**)stub_args) != 0) { - std::stringstream ss; - ss << "Failed to run python backend stub. Errno = " << errno << '\n' - << "Python backend stub path: " << python_backend_stub << '\n' - << "Shared Memory Region Name: " << shm_region_name_ << '\n' - << "Shared Memory Default Byte Size: " << shm_default_byte_size_ - << '\n' - << "Shared Memory Growth Byte Size: " << shm_growth_byte_size_ << '\n'; - std::string log_message = ss.str(); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, log_message.c_str()); + std::string bash_argument; + // This shared memory variable indicates whether the stub process should + // revert the LD_LIBRARY_PATH changes to avoid shared library issues in + // executables and libraries. + ipc_control_->uses_env = false; + if (python_execution_env_ != "") { + std::stringstream ss; + + // Need to properly set the LD_LIBRARY_PATH so that Python environments + // using different python versions load properly. 
+ ss << "source " << path_to_activate_ + << " && exec env LD_LIBRARY_PATH=" << path_to_libpython_ + << ":$LD_LIBRARY_PATH " << python_backend_stub << " " << model_path_ + << " " << shm_region_name_ << " " << shm_default_byte_size_ << " " + << shm_growth_byte_size_ << " " << parent_pid_ << " " << python_lib_ + << " " << ipc_control_handle_ << " " << stub_name; + ipc_control_->uses_env = true; + bash_argument = ss.str(); + } else { + std::stringstream ss; + ss << " exec " << python_backend_stub << " " << model_path_ << " " + << shm_region_name_ << " " << shm_default_byte_size_ << " " + << shm_growth_byte_size_ << " " << parent_pid_ << " " << python_lib_ + << " " << ipc_control_handle_ << " " << stub_name; + bash_argument = ss.str(); + } + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("Starting Python backend stub: ") + bash_argument).c_str()); + + stub_args[2] = bash_argument.c_str(); + + int stub_status_code = + system((python_backend_stub + "> /dev/null 2>&1").c_str()); + + // If running stub process without any arguments returns any status code, + // other than 1, it can indicate a permission issue as a result of + // downloading the stub process from a cloud object storage service. + if (WEXITSTATUS(stub_status_code) != 1) { + // Give the execute permission for the triton_python_backend_stub to the + // owner. + int error = chmod(python_backend_stub.c_str(), S_IXUSR); + if (error != 0) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, - (std::string("Failed to initialize ") + stub_name).c_str()); + (std::string("Failed to give execute permission to " + "triton_python_backend_stub in ") + + python_backend_stub + " " + stub_name + + " Error No.: " + std::to_string(error)) + .c_str()); } + } + + pid_t pid = fork(); + if (pid < 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "Failed to fork the stub process for auto-complete."); + } + if (pid == 0) { + // Replace this child process with the new stub process. + execvp("bash", (char**)stub_args); + // execvp() never return if succeeded. Otherwise, an error has occured. + std::stringstream ss; + ss << "Failed to run python backend stub. Errno = " << errno << '\n' + << "Python backend stub path: " << python_backend_stub << '\n' + << "Shared Memory Region Name: " << shm_region_name_ << '\n' + << "Shared Memory Default Byte Size: " << shm_default_byte_size_ << '\n' + << "Shared Memory Growth Byte Size: " << shm_growth_byte_size_ << '\n'; + // Print the error message directly because the underlying mutexes in + // LOG_MESSAGE() could be forked when it is locked by other thread(s). + std::cerr << '\n' << ss.str() << '\n'; + // Terminate the child execution immediately to avoid any issues. 
+ _Exit(1); } else { ScopedDefer _([&] { // Push a dummy message to the message queue so that the stub From 32974481c4875ad22bf318dcb2b80e316cdf5065 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 26 Jan 2023 11:42:37 -0800 Subject: [PATCH 076/216] Atomically increase/read stub process count and fix random boost truncate fail (#205) * Atomically increase and read stub process count * Allow retry if truncate fail * Remove retry and remove shm region before create * Upgrade boost to 1.79.0 --- CMakeLists.txt | 6 +++--- src/shm_manager.cc | 7 +++++-- src/shm_manager.h | 3 ++- src/stub_launcher.cc | 10 +++++----- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3f9855f3..88411b76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -95,8 +95,8 @@ FetchContent_MakeAvailable(dlpack) # ExternalProject_Add( boostorg - URL https://boostorg.jfrog.io/artifactory/main/release/1.76.0/source/boost_1_76_0.tar.gz - URL_HASH SHA256=7bd7ddceec1a1dfdcbdb3e609b60d01739c38390a5f956385a12f3122049f0ca + URL https://boostorg.jfrog.io/artifactory/main/release/1.79.0/source/boost_1_79_0.tar.gz + URL_HASH SHA256=273f1be93238a068aba4f9735a4a2b003019af067b9c183ed227780b8f36062c PREFIX "boost-src" CONFIGURE_COMMAND ${CMAKE_COMMAND} -E copy_directory /boost/ ${CMAKE_BINARY_DIR}/boost diff --git a/src/shm_manager.cc b/src/shm_manager.cc index 2f3b3f96..555bd023 100644 --- a/src/shm_manager.cc +++ b/src/shm_manager.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -43,10 +43,13 @@ SharedMemoryManager::SharedMemoryManager( try { if (create) { + // Remove (if any) and create the region. + bi::shared_memory_object::remove(shm_region_name.c_str()); shm_obj_ = std::make_unique( - bi::open_or_create, shm_region_name.c_str(), bi::read_write); + bi::create_only, shm_region_name.c_str(), bi::read_write); shm_obj_->truncate(shm_size); } else { + // Open the existing region. shm_obj_ = std::make_unique( bi::open_only, shm_region_name.c_str(), bi::read_write); } diff --git a/src/shm_manager.h b/src/shm_manager.h index 67156e1f..108a3a44 100644 --- a/src/shm_manager.h +++ b/src/shm_manager.h @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -35,6 +35,7 @@ #include #include #include +#include #include "pb_exception.h" namespace triton { namespace backend { namespace python { diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index db3af88c..e6e5872f 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -62,12 +62,12 @@ StubLauncher::Initialize(ModelState* model_state) is_decoupled_ = model_state->IsDecoupled(); model_repository_path_ = model_state->RepositoryPath(); - // Increase the stub process count to avoid shared memory region name - // collision - model_state->StateForBackend()->number_of_instance_inits++; + // Atomically increase and read the stub process count to avoid shared memory + // region name collision + int num_init = ++model_state->StateForBackend()->number_of_instance_inits; shm_region_name_ = model_state->StateForBackend()->shared_memory_region_prefix + - std::to_string(model_state->StateForBackend()->number_of_instance_inits); + std::to_string(num_init); model_version_ = model_state->Version(); From 0aef2c4cbee83984ce9302d4563a3c40f351d992 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Fri, 10 Feb 2023 21:30:50 -0800 Subject: [PATCH 077/216] Add decoupled support for BLS (#203) * Add CMAKE_BUILD_TYPE flag to CMakeLists.txt * Add decoupled support for BLS * Add execution timeout to the API * Update copyright * Remove the wrong condition check for exec * Add examples * Use Release as default CMAKE_BUILD_TYPE * Rename variable * Update example models * Add documentation for BLS decoupled support * Returns generator from stream_exec function * Fix for completed response * Set futures in the constructor of InferResponse * Use the server API to set timeout * Format * Add 'decoupled' argument to exec() function. Remove stream_exec() and async_stream_exec() * Address comments * Rename 'execution_timeout' to 'timeout' * Remove unused variable and functions * Make 'timeout' be part of the InferRequest constructor * Move class 'ResponseGenerator' to a new file * Fix up * Update document for 'timeout' changes * Remove the len() function for ResponseGenerator * Remove promise from InferRequest object * Wording * Fix up * Address comment * Fix up * Change the release version --- CMakeLists.txt | 8 + README.md | 363 ++++++++++++++++------ examples/bls_decoupled/README.md | 163 ++++++++++ examples/bls_decoupled/async_client.py | 65 ++++ examples/bls_decoupled/async_config.pbtxt | 45 +++ examples/bls_decoupled/async_model.py | 162 ++++++++++ examples/bls_decoupled/sync_client.py | 64 ++++ examples/bls_decoupled/sync_config.pbtxt | 45 +++ examples/bls_decoupled/sync_model.py | 144 +++++++++ src/infer_payload.cc | 75 +++++ src/infer_payload.h | 49 +++ src/infer_request.cc | 94 ++++-- src/infer_request.h | 17 +- src/infer_response.cc | 27 +- src/infer_response.h | 16 +- src/ipc_message.h | 3 +- src/pb_generator.cc | 62 ++++ src/pb_generator.h | 47 +++ src/pb_stub.cc | 59 +++- src/pb_utils.h | 4 +- src/python_be.cc | 108 +++++-- src/python_be.h | 12 +- src/request_executor.cc | 229 ++++++++------ src/request_executor.h | 9 +- 24 files changed, 1596 insertions(+), 274 deletions(-) create mode 100644 examples/bls_decoupled/README.md create mode 100644 examples/bls_decoupled/async_client.py create mode 100644 examples/bls_decoupled/async_config.pbtxt create mode 100644 examples/bls_decoupled/async_model.py create mode 100644 examples/bls_decoupled/sync_client.py create mode 100644 examples/bls_decoupled/sync_config.pbtxt create mode 100644 examples/bls_decoupled/sync_model.py create mode 100644 src/infer_payload.cc create mode 100644 src/infer_payload.h create mode 100644 src/pb_generator.cc create mode 100644 src/pb_generator.h diff --git 
a/CMakeLists.txt b/CMakeLists.txt index 88411b76..1a5406bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,6 +42,10 @@ set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + # # Dependencies # @@ -170,6 +174,8 @@ set( src/request_executor.h src/stub_launcher.h src/stub_launcher.cc + src/infer_payload.h + src/infer_payload.cc ) list(APPEND @@ -190,6 +196,8 @@ set( src/response_sender.h src/pb_stub.h src/pb_stub.cc + src/pb_generator.h + src/pb_generator.cc ) list(APPEND diff --git a/README.md b/README.md index 3ebb8fec..14ecfe4b 100644 --- a/README.md +++ b/README.md @@ -161,8 +161,8 @@ as the Python backend repository branch that you are trying to compile. * triton-inference-server/core: `-DTRITON_CORE_REPO_TAG=` -Set `-DCMAKE_INSTALL_PREFIX` to the location where the Triton Server is installed. In the released containers, -this location is `/opt/tritonserver`. +Set `-DCMAKE_INSTALL_PREFIX` to the location where the Triton Server is +installed. In the released containers, this location is `/opt/tritonserver`. 3. Copy example model and configuration @@ -200,28 +200,31 @@ class TritonPythonModel: @staticmethod def auto_complete_config(auto_complete_model_config): - """`auto_complete_config` is called only once when loading the model assuming - the server was not started with `--disable-auto-complete-config`. Implementing - this function is optional. No implementation of `auto_complete_config` will - do nothing. This function can be used to set `max_batch_size`, `input` and - `output` properties of the model using `set_max_batch_size`, `add_input`, and - `add_output`. These properties will allow Triton to load the model with minimal - model configuration in absence of a configuration file. This function returns - the `pb_utils.ModelConfig` object with these properties. You can use the `as_dict` - function to gain read-only access to the `pb_utils.ModelConfig` object. - The `pb_utils.ModelConfig` object being returned from here will be used as - the final configuration for the model. - - Note: The Python interpreter used to invoke this function will be destroyed - upon returning from this function and as a result none of the objects created - here will be available in the `initialize`, `execute`, or `finalize` functions. + """`auto_complete_config` is called only once when loading the model + assuming the server was not started with + `--disable-auto-complete-config`. Implementing this function is + optional. No implementation of `auto_complete_config` will do nothing. + This function can be used to set `max_batch_size`, `input` and `output` + properties of the model using `set_max_batch_size`, `add_input`, and + `add_output`. These properties will allow Triton to load the model with + minimal model configuration in absence of a configuration file. This + function returns the `pb_utils.ModelConfig` object with these + properties. You can use the `as_dict` function to gain read-only access + to the `pb_utils.ModelConfig` object. The `pb_utils.ModelConfig` object + being returned from here will be used as the final configuration for + the model. 
+ + Note: The Python interpreter used to invoke this function will be + destroyed upon returning from this function and as a result none of the + objects created here will be available in the `initialize`, `execute`, + or `finalize` functions. Parameters ---------- auto_complete_model_config : pb_utils.ModelConfig - An object containing the existing model configuration. You can build upon - the configuration given by this object when setting the properties for - this model. + An object containing the existing model configuration. You can build + upon the configuration given by this object when setting the + properties for this model. Returns ------- @@ -294,7 +297,8 @@ class TritonPythonModel: Both keys and values are strings. The dictionary keys and values are: * model_config: A JSON string containing the model configuration * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID + * model_instance_device_id: A string containing model instance device + ID * model_repository: Model repository path * model_version: Model version * model_name: Model name @@ -328,7 +332,8 @@ class TritonPythonModel: # make a copy of the underlying NumPy array and store it if it is # required. for request in requests: - # Perform inference on the request and append it to responses list... + # Perform inference on the request and append it to responses + # list... # You must return a list of pb_utils.InferenceResponse. Length # of this list must match the length of `requests` list. @@ -348,7 +353,8 @@ Every Python backend can implement four main functions: ### `auto_complete_config` `auto_complete_config` is called only once when loading the model assuming -the server was not started with [`--disable-auto-complete-config`](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#auto-generated-model-configuration). +the server was not started with +[`--disable-auto-complete-config`](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#auto-generated-model-configuration). Implementing this function is optional. No implementation of `auto_complete_config` will do nothing. This function can be used to set @@ -357,11 +363,13 @@ Implementing this function is optional. No implementation of [dynamic_batching]( https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#dynamic-batcher), [`input`]( - https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#inputs-and-outputs) and + https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#inputs-and-outputs) +and [`output`]( https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#inputs-and-outputs) -properties of the model using `set_max_batch_size`, `set_dynamic_batching`, `add_input`, and -`add_output`. These properties will allow Triton to load the model with +properties of the model using `set_max_batch_size`, `set_dynamic_batching`, +`add_input`, and `add_output`. These properties will allow Triton to load the +model with [minimal model configuration]( https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#minimal-model-configuration) in absence of a configuration file. 
This function returns the @@ -396,11 +404,12 @@ below: ### `execute` -`execute` function is called whenever an inference request is made. Every Python -model must implement `execute` function. In the `execute` function you are given -a list of `InferenceRequest` objects. There are two modes of implementing this -function. The mode you choose should depend on your use case. That is whether -or not you want to return decoupled responses from this model or not. +`execute` function is called whenever an inference request is made. Every +Python model must implement `execute` function. In the `execute` function you +are given a list of `InferenceRequest` objects. There are two modes of +implementing this function. The mode you choose should depend on your use case. +That is whether or not you want to return decoupled responses from this model +or not. #### Default Mode @@ -451,7 +460,8 @@ class TritonPythonModel: if an_error_occurred: # If there is an error, the output_tensors are ignored responses.append(pb_utils.InferenceResponse( - output_tensors=[], error=pb_utils.TritonError("An Error Occurred"))) + output_tensors=[], + error=pb_utils.TritonError("An Error Occurred"))) return responses ``` @@ -463,7 +473,8 @@ This mode allows user to send multiple responses for a request or not send any responses for a request. A model may also send responses out-of-order relative to the order that the request batches are executed. Such models are called *decoupled* models. In -order to use this mode, the [transaction policy](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#model-transaction-policy) +order to use this mode, the +[transaction policy](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#model-transaction-policy) in the model configuration must be set to decoupled. @@ -483,9 +494,10 @@ request. The workflow in this mode may look like: 2. Create and populate pb_utils.InferenceResponse to be sent back. 3. Use InferenceResponseSender.send() to send the above response. If - this is the last request then pass pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL - as a flag with InferenceResponseSender.send(). Otherwise continue with - Step 1 for sending next request. + this is the last request then pass + pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL as a flag with + InferenceResponseSender.send(). Otherwise continue with Step 1 for sending + next request. * The return value for `execute` function in this mode should be None. @@ -534,9 +546,9 @@ necessary functions, you should save this file as `model.py`. ## Model Config File Every Python Triton model must provide a `config.pbtxt` file describing -the model configuration. In order to use this backend you must set the `backend` -field of your model `config.pbtxt` file to `python`. You shouldn't set -`platform` field of the configuration. +the model configuration. In order to use this backend you must set the +`backend` field of your model `config.pbtxt` file to `python`. You +shouldn't set `platform` field of the configuration. Your models directory should look like below: ``` @@ -571,12 +583,14 @@ default in the Triton containers.** Python backend uses a *stub* process to connect your `model.py` file to the Triton C++ core. This stub process has an embedded Python interpreter with a fixed Python version. 
If you intend to use a Python interpreter with -different version from the default Python backend stub, you need to compile your own -Python backend stub by following the steps below: +different version from the default Python backend stub, you need to compile +your own Python backend stub by following the steps below: 1. Install the software packages below: * [cmake](https://cmake.org) -* rapidjson and libarchive (instructions for installing these packages in Ubuntu or Debian are included in [Building from Source Section](#building-from-source)) +* rapidjson and libarchive (instructions for installing these packages in +Ubuntu or Debian are included in +[Building from Source Section](#building-from-source)) 2. Make sure that the expected Python version is available in your environment. @@ -629,7 +643,8 @@ models `-- triton_python_backend_stub ``` -Note the location of `triton_python_backend_stub` in the directory structure above. +Note the location of `triton_python_backend_stub` in the directory structure +above. ### Creating Custom Execution Environments @@ -649,8 +664,10 @@ Packing environment at '/home/iman/miniconda3/envs/python-3-6' to 'python-3-6.ta [########################################] | 100% Completed | 4.5s ``` -**Important Note:** Before installing the packages in your conda environment, make sure that you -have exported [`PYTHONNOUSERSITE`](https://docs.python.org/3/using/cmdline.html#envvar-PYTHONNOUSERSITE) environment variable: +**Important Note:** Before installing the packages in your conda environment, +make sure that you have exported +[`PYTHONNOUSERSITE`](https://docs.python.org/3/using/cmdline.html#envvar-PYTHONNOUSERSITE) +environment variable: ``` export PYTHONNOUSERSITE=True @@ -704,7 +721,8 @@ models | `-- triton_python_backend_stub ``` -In the example above, `$$TRITON_MODEL_DIRECTORY` is resolved to `$pwd/models/model_a`. +In the example above, `$$TRITON_MODEL_DIRECTORY` is resolved to +`$pwd/models/model_a`. This is useful if you want to use S3, GCS, or Azure and you do not have access to the absolute path of the execution env that is stored in the cloud object @@ -723,8 +741,8 @@ still applies here and the version of the Python interpreter inside the conda environment must match the Python version of stub used by Python backend. The default version of the stub is Python 3.8. -3. You can share a single execution environment across multiple models. You need to -provide the path to the tar file in the `EXECUTION_ENV_PATH` in the +3. You can share a single execution environment across multiple models. You +need to provide the path to the tar file in the `EXECUTION_ENV_PATH` in the `config.pbtxt` of all the models that want to use the execution environment. 4. If `$$TRITON_MODEL_DIRECTORY` is used in the `EXECUTION_ENV_PATH`, the final @@ -757,7 +775,8 @@ class TritonPythonModel: def finalize(self): if error_during_finalize: - raise pb_utils.TritonModelException("An error occurred during finalize.") + raise pb_utils.TritonModelException( + "An error occurred during finalize.") ``` ## Managing Shared Memory @@ -776,8 +795,8 @@ You can also configure the timeout used for connecting Triton main process to the Python backend stubs using the `stub-timeout-seconds`. The default value is 30 seconds. 
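As a concrete illustration of the generic form shown next, the stub connection
timeout described above could be raised at server startup as follows (the
60-second value is only an example, not a recommendation):

```
/opt/tritonserver/bin/tritonserver --model-repository=`pwd`/models --backend-config=python,stub-timeout-seconds=60
```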
-The config values described above can be passed to Triton using `--backend-config` -flag: +The config values described above can be passed to Triton using +`--backend-config` flag: ``` /opt/tritonserver/bin/tritonserver --model-repository=`pwd`/models --backend-config=python,= @@ -799,10 +818,10 @@ to work around this issue, Python backend spawns a separate process for each [model instance](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#multiple-model-instances). This is in contrast with how other Triton backends such as [ONNXRuntime](https://github.com/triton-inference-server/onnxruntime_backend), -[TensorFlow](https://github.com/triton-inference-server/tensorflow_backend), and -[PyTorch](https://github.com/triton-inference-server/pytorch_backend) handle -multiple instances. Increasing the instance count for these backends will create -additional threads instead of spawning separate processes. +[TensorFlow](https://github.com/triton-inference-server/tensorflow_backend), +and [PyTorch](https://github.com/triton-inference-server/pytorch_backend) +handle multiple instances. Increasing the instance count for these backends +will create additional threads instead of spawning separate processes. # Business Logic Scripting @@ -817,9 +836,9 @@ call this combination of custom logic and model executions *Business Logic Scripting (BLS)*. Starting from 21.08, you can implement BLS in your Python model. A new set of -utility functions allows you to execute inference requests on other models being -served by Triton as a part of executing your Python model. Example below shows -how to use this feature: +utility functions allows you to execute inference requests on other models +being served by Triton as a part of executing your Python model. Example below +shows how to use this feature: ```python import triton_python_backend_utils as pb_utils @@ -831,35 +850,39 @@ class TritonPythonModel: ... # Create an InferenceRequest object. `model_name`, # `requested_output_names`, and `inputs` are the required arguments and - # must be provided when constructing an InferenceRequest object. Make sure - # to replace `inputs` argument with a list of `pb_utils.Tensor` objects. + # must be provided when constructing an InferenceRequest object. Make + # sure to replace `inputs` argument with a list of `pb_utils.Tensor` + # objects. inference_request = pb_utils.InferenceRequest( model_name='model_name', requested_output_names=['REQUESTED_OUTPUT_1', 'REQUESTED_OUTPUT_2'], inputs=[]) - # `pb_utils.InferenceRequest` supports request_id, correlation_id, and model - # version in addition to the arguments described above. These arguments - # are optional. An example containing all the arguments: + # `pb_utils.InferenceRequest` supports request_id, correlation_id, + # model version and timeout in addition to the arguments described above. + # These arguments are optional. 
An example containing all the arguments: # inference_request = pb_utils.InferenceRequest(model_name='model_name', # requested_output_names=['REQUESTED_OUTPUT_1', 'REQUESTED_OUTPUT_2'], # inputs=[], - # request_id="1", correlation_id=4, model_version=1, flags=0) + # request_id="1", correlation_id=4, model_version=1, flags=0, timeout=5) # Execute the inference_request and wait for the response inference_response = inference_request.exec() # Check if the inference response has an error if inference_response.has_error(): - raise pb_utils.TritonModelException(inference_response.error().message()) + raise pb_utils.TritonModelException( + inference_response.error().message()) else: # Extract the output tensors from the inference response. - output1 = pb_utils.get_output_tensor_by_name(inference_response, 'REQUESTED_OUTPUT_1') - output2 = pb_utils.get_output_tensor_by_name(inference_response, 'REQUESTED_OUTPUT_2') - - # Decide the next steps for model execution based on the received output - # tensors. It is possible to use the same output tensors to for the final - # inference response too. + output1 = pb_utils.get_output_tensor_by_name( + inference_response, 'REQUESTED_OUTPUT_1') + output2 = pb_utils.get_output_tensor_by_name( + inference_response, 'REQUESTED_OUTPUT_2') + + # Decide the next steps for model execution based on the received + # output tensors. It is possible to use the same output tensors + # to for the final inference response too. ``` @@ -884,8 +907,9 @@ class TritonPythonModel: ... # Create an InferenceRequest object. `model_name`, # `requested_output_names`, and `inputs` are the required arguments and - # must be provided when constructing an InferenceRequest object. Make sure - # to replace `inputs` argument with a list of `pb_utils.Tensor` objects. + # must be provided when constructing an InferenceRequest object. Make + # sure to replace `inputs` argument with a list of `pb_utils.Tensor` + # objects. inference_request = pb_utils.InferenceRequest( model_name='model_name', requested_output_names=['REQUESTED_OUTPUT_1', 'REQUESTED_OUTPUT_2'], @@ -904,27 +928,155 @@ class TritonPythonModel: for infer_response in infer_responses: # Check if the inference response has an error if inference_response.has_error(): - raise pb_utils.TritonModelException(inference_response.error().message()) + raise pb_utils.TritonModelException( + inference_response.error().message()) else: # Extract the output tensors from the inference response. - output1 = pb_utils.get_output_tensor_by_name(inference_response, 'REQUESTED_OUTPUT_1') - output2 = pb_utils.get_output_tensor_by_name(inference_response, 'REQUESTED_OUTPUT_2') + output1 = pb_utils.get_output_tensor_by_name( + inference_response, 'REQUESTED_OUTPUT_1') + output2 = pb_utils.get_output_tensor_by_name( + inference_response, 'REQUESTED_OUTPUT_2') - # Decide the next steps for model execution based on the received output - # tensors. + # Decide the next steps for model execution based on the received + # output tensors. ``` A complete example for sync and async BLS in Python backend is included in the [Examples](#examples) section. +Starting from 23.03 release, you can execute inference requests on decoupled +models in both [default mode](#default-mode) and +[decoupled mode](#decoupled-mode). By setting the `decoupled` parameter to +`True`, the `exec` and `async_exec` function will return a +[generator](https://docs.python.org/3/glossary.html#term-generator) of +inference responses returned by a decoupled model. 
If the `decoupled` parameter +is set to `False`, the `exec` and `async_exec` function will return a single +response as shown in the example above. + +Besides, you can set the timeout via the parameter 'timeout' in microseconds +within the constructor of `InferenceRequest`. If the request times out, the +request will respond with an error. The default of 'timeout' is 0 which +indicates that the request has no timeout. Example below shows how to use this +feature: + +```python +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + ... + def execute(self, requests): + ... + # Create an InferenceRequest object. `model_name`, + # `requested_output_names`, and `inputs` are the required arguments and + # must be provided when constructing an InferenceRequest object. Make + # sure to replace `inputs` argument with a list of `pb_utils.Tensor` + # objects. + inference_request = pb_utils.InferenceRequest( + model_name='model_name', + requested_output_names=['REQUESTED_OUTPUT_1', 'REQUESTED_OUTPUT_2'], + inputs=[]) + + # `pb_utils.InferenceRequest` supports request_id, correlation_id, + # model version and timeout in addition to the arguments described above. + # These arguments are optional. An example containing all the arguments: + # inference_request = pb_utils.InferenceRequest(model_name='model_name', + # requested_output_names=['REQUESTED_OUTPUT_1', 'REQUESTED_OUTPUT_2'], + # inputs=[], + # request_id="1", correlation_id=4, model_version=1, flags=0, timeout=5) + + # Execute the inference_request and wait for the response. Here we are + # running a BLS request on a decoupled model, hence setting the parameter + # 'decoupled' to 'True'. + inference_responses = inference_request.exec(decoupled=True) + + for inference_response in inference_responses: + # Check if the inference response has an error + if inference_response.has_error(): + raise pb_utils.TritonModelException( + inference_response.error().message()) + else: + # Extract the output tensors from the inference response. + output1 = pb_utils.get_output_tensor_by_name( + inference_response, 'REQUESTED_OUTPUT_1') + output2 = pb_utils.get_output_tensor_by_name( + inference_response, 'REQUESTED_OUTPUT_2') + + # Decide the next steps for model execution based on the received + # output tensors. It is possible to use the same output tensors to + # for the final inference response too. +``` + + +In addition to the `inference_request.exec(decoupled=True)` function that +allows you to execute blocking inference requests on decoupled models, +`inference_request.async_exec(decoupled=True)` allows you to perform async +inference requests. This can be useful when you do not need the result of the +inference immediately. Using `async_exec` function, it is possible to have +multiple inflight inference requests and wait for the responses only when +needed. Example below shows how to use `async_exec`: + +```python +import triton_python_backend_utils as pb_utils +import asyncio + + +class TritonPythonModel: + ... + + # You must add the Python 'async' keyword to the beginning of `execute` + # function if you want to use `async_exec` function. + async def execute(self, requests): + ... + # Create an InferenceRequest object. `model_name`, + # `requested_output_names`, and `inputs` are the required arguments and + # must be provided when constructing an InferenceRequest object. Make + # sure to replace `inputs` argument with a list of `pb_utils.Tensor` + # objects. 
+ inference_request = pb_utils.InferenceRequest( + model_name='model_name', + requested_output_names=['REQUESTED_OUTPUT_1', 'REQUESTED_OUTPUT_2'], + inputs=[]) + + infer_response_awaits = [] + for i in range(4): + # async_exec function returns an + # [Awaitable](https://docs.python.org/3/library/asyncio-task.html#awaitables) + # object. + infer_response_awaits.append( + inference_request.async_exec(decoupled=True)) + + # Wait for all of the inference requests to complete. + async_responses = await asyncio.gather(*infer_response_awaits) + + for infer_responses in async_responses: + for infer_response in infer_responses: + # Check if the inference response has an error + if inference_response.has_error(): + raise pb_utils.TritonModelException( + inference_response.error().message()) + else: + # Extract the output tensors from the inference response. + output1 = pb_utils.get_output_tensor_by_name( + inference_response, 'REQUESTED_OUTPUT_1') + output2 = pb_utils.get_output_tensor_by_name( + inference_response, 'REQUESTED_OUTPUT_2') + + # Decide the next steps for model execution based on the received + # output tensors. +``` + +A complete example for sync and async BLS for decoupled models is included in +the [Examples](#examples) section. + Starting from the 22.04 release, the lifetime of the BLS output tensors have been improved such that if a tensor is no longer needed in your Python model it will be automatically deallocated. This can increase the number of BLS requests -that you can execute in your model without running into the out of GPU or shared -memory error. +that you can execute in your model without running into the out of GPU or +shared memory error. -Note: Async BLS is not supported on Python 3.6 or lower due to the `async` keyword -and `asyncio.run` being introduced in Python 3.7. +Note: Async BLS is not supported on Python 3.6 or lower due to the `async` +keyword and `asyncio.run` being introduced in Python 3.7. ## Using BLS with Stateful Models @@ -939,7 +1091,8 @@ sequence: inference_request = pb_utils.InferenceRequest(model_name='model_name', requested_output_names=['REQUESTED_OUTPUT_1', 'REQUESTED_OUTPUT_2'], inputs=[], - request_id="1", correlation_id=4, flags=pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START) + request_id="1", correlation_id=4, + flags=pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START) ``` For indicating the ending of the sequence you can use the @@ -954,13 +1107,13 @@ flags = pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START | pb_utils.TRITONSERVE ## Limitation -- You need to make sure that the inference requests performed as a part of your model -do not create a circular dependency. For example, if model A performs an inference request -on itself and there are no more model instances ready to execute the inference request, the -model will block on the inference execution forever. - -- Currently, BLS can not run inference on a decoupled model. +- You need to make sure that the inference requests performed as a part of your +model do not create a circular dependency. For example, if model A performs an +inference request on itself and there are no more model instances ready to +execute the inference request, the model will block on the inference execution +forever. +- Async BLS is not supported when running a Python model in decoupled mode. 
# Interoperability and GPU Support @@ -1036,7 +1189,8 @@ parameters: { key: "FORCE_CPU_ONLY_INPUT_TENSORS" value: {string_value:"no"}} # Examples For using the Triton Python client in these examples you need to install -the [Triton Python Client Library](https://github.com/triton-inference-server/client#getting-the-client-libraries-and-examples). +the +[Triton Python Client Library](https://github.com/triton-inference-server/client#getting-the-client-libraries-and-examples). The Python client for each of the examples is in the `client.py` file. ## AddSub in NumPy @@ -1048,39 +1202,50 @@ find the files in [examples/add_sub](examples/add_sub). ## AddSubNet in PyTorch In order to use this model, you need to install PyTorch. We recommend using -`pip` method mentioned in the [PyTorch website](https://pytorch.org/get-started/locally/). +`pip` method mentioned in the +[PyTorch website](https://pytorch.org/get-started/locally/). Make sure that PyTorch is available in the same Python environment as other -dependencies. Alternatively, you can create a [Python Execution Environment](#using-custom-python-execution-environments). +dependencies. Alternatively, you can create a +[Python Execution Environment](#using-custom-python-execution-environments). You can find the files for this example in [examples/pytorch](examples/pytorch). ## AddSub in JAX The JAX example shows how to serve JAX in Triton using Python Backend. -You can find the complete example instructions in [examples/jax](examples/jax/README.md). +You can find the complete example instructions in +[examples/jax](examples/jax/README.md). ## Business Logic Scripting The BLS example needs the dependencies required for both of the above examples. -You can find the complete example instructions in [examples/bls](examples/bls/README.md). +You can find the complete example instructions in +[examples/bls](examples/bls/README.md) and +[examples/bls_decoupled](examples/bls_decoupled/README.md). ## Preprocessing -The Preprocessing example shows how to use Python Backend to do model preprocessing. -You can find the complete example instructions in [examples/preprocessing](examples/preprocessing/README.md). +The Preprocessing example shows how to use Python Backend to do model +preprocessing. +You can find the complete example instructions in +[examples/preprocessing](examples/preprocessing/README.md). ## Decoupled Models The examples of decoupled models shows how to develop and serve [decoupled models](#decoupled-mode) in Triton using Python backend. -You can find the complete example instructions in [examples/decoupled](examples/decoupled/README.md). +You can find the complete example instructions in +[examples/decoupled](examples/decoupled/README.md). # Running with Inferentia -Please see the [README.md](https://github.com/triton-inference-server/python_backend/tree/main/inferentia/README.md) located in the python_backend/inferentia sub folder. +Please see the +[README.md](https://github.com/triton-inference-server/python_backend/tree/main/inferentia/README.md) +located in the python_backend/inferentia sub folder. 
# Logging -Starting from 22.09 release, your Python model can log information using the following methods: +Starting from 22.09 release, your Python model can log information using the +following methods: ```python import triton_python_backend_utils as pb_utils diff --git a/examples/bls_decoupled/README.md b/examples/bls_decoupled/README.md new file mode 100644 index 00000000..6f5fc57b --- /dev/null +++ b/examples/bls_decoupled/README.md @@ -0,0 +1,163 @@ + + +# Example of using BLS with decoupled models + +In this section we demonstrate an end-to-end example for +[BLS](../../README.md#business-logic-scripting) in Python backend. The +[model repository](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md) +should contain [square](../decoupled) model. The [square](../decoupled) model +will send 'n' responses where 'n' is the value of input `IN`. For each response, +output `OUT` will equal the value of `IN`. This example is broken into two +sections. The first section demonstrates how to perform synchronous BLS requests +and the second section shows how to execute asynchronous BLS requests. + +## Synchronous BLS Requests with Decoupled Models + +The goal of `bls_decoupled_sync` model is to caculate the sum of the responses +returned from the [square](../decoupled) model and return the summation as the final response. The value of input 'IN' will be passed as an input to the +[square](../decoupled) model which determines how many responses the +[square](../decoupled) model will generate. + +1. Create the model repository: + +```console +mkdir -p models/bls_decoupled_sync/1 +mkdir -p models/square_int32/1 + +# Copy the Python models +cp examples/bls_decoupled/sync_model.py models/bls_decoupled_sync/1/model.py +cp examples/bls_decoupled/sync_config.pbtxt models/bls_decoupled_sync/config.pbtxt +cp examples/decoupled/square_model.py models/square_int32/1/model.py +cp examples/decoupled/square_config.pbtxt models/square_int32/config.pbtxt +``` + +2. Start the tritonserver: + +``` +tritonserver --model-repository `pwd`/models +``` + +3. Send inference requests to server: + +``` +python3 examples/bls_decoupled/sync_client.py +``` + +You should see an output similar to the output below: + +``` +==========model result========== +The square value of [4] is [16] + +==========model result========== +The square value of [2] is [4] + +==========model result========== +The square value of [0] is [0] + +==========model result========== +The square value of [1] is [1] + +PASS: BLS Decoupled Sync +``` + +The [sync_model.py](./sync_model.py) model file is heavily commented with +explanations about each of the function calls. + +### Explanation of the Client Output + +The [client.py](./sync_client.py) sends 4 inference requests to the +`bls_decoupled_sync` model with the input as: [4], [2], [0] and [1] +respectively. In compliance with the behavior of the sync BLS model, +it will expect the output to be the square value of the input. + +## Asynchronous BLS Requests with Decoupled Models + +In this section we explain how to send multiple BLS requests without waiting for +their response. Asynchronous execution of BLS requests will not block your +model execution and can lead to speedups under certain conditions. + +The `bls_decoupled_async` model will perform two async BLS requests on the +[square](../decoupled) model. Then, it will wait until the inference requests +are completed. 
It will calculate the sum of the output `OUT` from the
+[square](../decoupled) model in both requests to construct the final
+inference response object using these tensors.
+
+1. Create the model repository:
+
+```console
+mkdir -p models/bls_decoupled_async/1
+mkdir -p models/square_int32/1
+
+# Copy the Python models
+cp examples/bls_decoupled/async_model.py models/bls_decoupled_async/1/model.py
+cp examples/bls_decoupled/async_config.pbtxt models/bls_decoupled_async/config.pbtxt
+cp examples/decoupled/square_model.py models/square_int32/1/model.py
+cp examples/decoupled/square_config.pbtxt models/square_int32/config.pbtxt
+```
+
+2. Start the tritonserver:
+
+```
+tritonserver --model-repository `pwd`/models
+```
+
+3. Send inference requests to server:
+
+```
+python3 examples/bls_decoupled/async_client.py
+```
+
+You should see an output similar to the output below:
+
+```
+==========model result==========
+Two times the square value of [4] is [32]
+
+==========model result==========
+Two times the square value of [2] is [8]
+
+==========model result==========
+Two times the square value of [0] is [0]
+
+==========model result==========
+Two times the square value of [1] is [2]
+
+PASS: BLS Decoupled Async
+```
+
+The [async_model.py](./async_model.py) model file is heavily commented with
+explanations about each of the function calls.
+
+### Explanation of the Client Output
+
+The [client.py](./async_client.py) sends 4 inference requests to the
+'bls_decoupled_async' model with the input as: [4], [2], [0] and [1]
+respectively. In compliance with the behavior of the async BLS model,
+it will expect the output to be two times the square value of the input.
diff --git a/examples/bls_decoupled/async_client.py b/examples/bls_decoupled/async_client.py
new file mode 100644
index 00000000..aede17b4
--- /dev/null
+++ b/examples/bls_decoupled/async_client.py
@@ -0,0 +1,65 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +from tritonclient.utils import * +import tritonclient.http as httpclient +import numpy as np +import sys + +model_name = "bls_decoupled_async" +shape = [1] + +with httpclient.InferenceServerClient("localhost:8000") as client: + in_values = [4, 2, 0, 1] + + for in_value in in_values: + input_data = np.array([in_value], dtype=np.int32) + inputs = [ + httpclient.InferInput("IN", input_data.shape, + np_to_triton_dtype(input_data.dtype)) + ] + inputs[0].set_data_from_numpy(input_data) + outputs = [httpclient.InferRequestedOutput("SUM")] + + response = client.infer(model_name, + inputs, + request_id=str(1), + outputs=outputs) + + result = response.get_response() + # output_data contains two times of the square value of the input value. + output_data = response.as_numpy("SUM") + print("==========model result==========") + print("Two times the square value of {} is {}\n".format(input_data, output_data)) + + if not np.allclose((2*input_data*input_data), output_data): + print( + "BLS Decoupled Async example error: incorrect output value. Expected {}, got {}." + .format((2*input_data*input_data), output_data)) + sys.exit(1) + + print('PASS: BLS Decoupled Async') + sys.exit(0) diff --git a/examples/bls_decoupled/async_config.pbtxt b/examples/bls_decoupled/async_config.pbtxt new file mode 100644 index 00000000..fb999104 --- /dev/null +++ b/examples/bls_decoupled/async_config.pbtxt @@ -0,0 +1,45 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "bls_decoupled_async" +backend: "python" + +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "SUM" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/examples/bls_decoupled/async_model.py b/examples/bls_decoupled/async_model.py new file mode 100644 index 00000000..d94e772f --- /dev/null +++ b/examples/bls_decoupled/async_model.py @@ -0,0 +1,162 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils +import numpy as np +import asyncio +import json + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + + This model demonstrates how to use BLS with decoupled models. + + This model has a single input and a single output. The model does not + support batching. + - Input 'IN' shape must be equal to [1], datatype must be INT32. + - For each response, output 'SUM' shape must be equal to [1], datatype + must be INT32. + + For every request, the model will send a single response that contains an + output named 'SUM'. We will send two BLS requests to the square model and + the 'SUM' will contain the summation of the 'OUT' response output returned + by the square model in the two BLS requests. The input 'IN' determines how + many responses the square model will generate. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to intialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + # You must parse model_config. 
JSON string is not parsed here + self.model_config = json.loads(args['model_config']) + + # You must add the Python 'async' keyword to the beginning of `execute` + # function if you want to use `async_exec` function. + async def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + # This model does not support batching, so 'request_count' should + # always be 1. + if len(requests) != 1: + raise pb_utils.TritonModelException("unsupported batch size " + + len(requests)) + + response_num = pb_utils.get_input_tensor_by_name(requests[0], "IN") + + # List of awaitables containing inflight inference responses. + inference_response_awaits = [] + + # For detailed explanation about the inputs of the repeat model, refer + # to the example below: + # https://github.com/triton-inference-server/python_backend/blob/r22.12/examples/decoupled/square_model.py + # Construct two BLS requests + for _ in range(2): + infer_request = pb_utils.InferenceRequest( + model_name="square_int32", + inputs=[response_num], + requested_output_names=["OUT"]) + # Store the awaitable inside the array. We don't need + # the inference response immediately so we do not `await` + # here. + inference_response_awaits.append( + infer_request.async_exec(decoupled=True)) + + # Wait for all the inference requests to finish. The execution + # of the Python script will be blocked until all the awaitables + # are resolved. + async_responses = await asyncio.gather( + *inference_response_awaits) + + # The variable that will store the sum of the responses. + response_sum = np.array([0]) + + # Iterate over the list of generators of responses returned by the BLS + # request. This interface can support zero, one, and many inference + # responses per request. + for infer_responses in async_responses: + for infer_response in infer_responses: + # If inference response has an error, raise an exception + if infer_response.has_error(): + raise pb_utils.TritonModelException( + infer_response.error().message()) + + response_sum += pb_utils.get_output_tensor_by_name( + infer_response, "OUT").as_numpy() + + response = [ + pb_utils.InferenceResponse( + output_tensors=[pb_utils.Tensor("SUM", response_sum)]) + ] + + # Since the model is using the default mode in this example, we + # will be returning a single response. + return response + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. 
+        """
+        print('Cleaning up...')
diff --git a/examples/bls_decoupled/sync_client.py b/examples/bls_decoupled/sync_client.py
new file mode 100644
index 00000000..10fd4ffa
--- /dev/null
+++ b/examples/bls_decoupled/sync_client.py
@@ -0,0 +1,64 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from tritonclient.utils import *
+import tritonclient.http as httpclient
+import numpy as np
+import sys
+
+model_name = "bls_decoupled_sync"
+shape = [1]
+
+with httpclient.InferenceServerClient("localhost:8000") as client:
+    in_values = [4, 2, 0, 1]
+
+    for in_value in in_values:
+        input_data = np.array([in_value], dtype=np.int32)
+        inputs = [
+            httpclient.InferInput("IN", input_data.shape,
+                                  np_to_triton_dtype(input_data.dtype))
+        ]
+        inputs[0].set_data_from_numpy(input_data)
+        outputs = [httpclient.InferRequestedOutput("SUM")]
+
+        response = client.infer(model_name,
+                                inputs,
+                                request_id=str(1),
+                                outputs=outputs)
+
+        result = response.get_response()
+        output_data = response.as_numpy("SUM")
+        print("==========model result==========")
+        print("The square value of {} is {}\n".format(input_data, output_data))
+
+        if not np.allclose(input_data * input_data, output_data):
+            print(
+                "BLS Decoupled Sync example error: incorrect output value. Expected {}, got {}."
+                .format(input_data * input_data, output_data))
+            sys.exit(1)
+
+    print('PASS: BLS Decoupled Sync')
+    sys.exit(0)
diff --git a/examples/bls_decoupled/sync_config.pbtxt b/examples/bls_decoupled/sync_config.pbtxt
new file mode 100644
index 00000000..f9fe85ea
--- /dev/null
+++ b/examples/bls_decoupled/sync_config.pbtxt
@@ -0,0 +1,45 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "bls_decoupled_sync" +backend: "python" + +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "SUM" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/examples/bls_decoupled/sync_model.py b/examples/bls_decoupled/sync_model.py new file mode 100644 index 00000000..6d6765e3 --- /dev/null +++ b/examples/bls_decoupled/sync_model.py @@ -0,0 +1,144 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. 
+import triton_python_backend_utils as pb_utils +import numpy as np +import json + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + + This model demonstrates how to use BLS with decoupled models. + + This model has a single input and a single output. The model does not + support batching. + - Input 'IN' shape must be equal to [1], datatype must be INT32. + - For each response, output 'SUM' shape must be equal to [1], datatype + must be INT32. + + For every request, the model will send a single response that contains an + output named 'SUM'. The 'SUM' will contain the summation of the 'OUT' + response output returned by the square model. The input 'IN' determines how + many responses the square model will generate. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to intialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + # You must parse model_config. JSON string is not parsed here + self.model_config = json.loads(args['model_config']) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + # This model does not support batching, so 'request_count' should + # always be 1. + if len(requests) != 1: + raise pb_utils.TritonModelException("unsupported batch size " + + len(requests)) + + response_num = pb_utils.get_input_tensor_by_name(requests[0], "IN") + + # For detailed explanation about the inputs of the repeat model, refer + # to the example below: + # https://github.com/triton-inference-server/python_backend/blob/r22.12/examples/decoupled/square_model.py + # Construct the BLS request + infer_request = pb_utils.InferenceRequest( + model_name="square_int32", + inputs=[response_num], + requested_output_names=["OUT"]) + + # The variable that will store the sum of the responses. + response_sum = np.array([0]) + + # Iterate over the generator of responses returned by the BLS request. + # This interface can support zero, one, and many inference responses + # per request. 
+ infer_responses = infer_request.exec(decoupled=True) + + for infer_response in infer_responses: + # If inference response has an error, raise an exception + if infer_response.has_error(): + raise pb_utils.TritonModelException( + infer_response.error().message()) + + response_sum += pb_utils.get_output_tensor_by_name( + infer_response, "OUT").as_numpy() + + response = [ + pb_utils.InferenceResponse( + output_tensors=[pb_utils.Tensor("SUM", response_sum)]) + ] + + # Since the model is using the default mode in this example, we + # will be returning a single response. + return response + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. + """ + print('Cleaning up...') diff --git a/src/infer_payload.cc b/src/infer_payload.cc new file mode 100644 index 00000000..6cc74b59 --- /dev/null +++ b/src/infer_payload.cc @@ -0,0 +1,75 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
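For comparison with the `sync_model.py` example above, the same BLS call can
also be made from a coroutine-style model. The snippet below is only a minimal
sketch and is not part of the patch itself: it assumes the
`async_exec(decoupled=True)` binding added in `pb_stub.cc` in this change and
the coroutine `execute` support described in the main README, and it reuses the
`square_int32`, `IN`, `OUT`, and `SUM` names from the example above.

```
# Minimal sketch of the asynchronous variant of the BLS decoupled call.
import json

import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:

    def initialize(self, args):
        self.model_config = json.loads(args['model_config'])

    async def execute(self, requests):
        responses = []
        for request in requests:
            in_tensor = pb_utils.get_input_tensor_by_name(request, "IN")
            infer_request = pb_utils.InferenceRequest(
                model_name="square_int32",
                inputs=[in_tensor],
                requested_output_names=["OUT"])

            # Awaiting `async_exec(decoupled=True)` yields the same iterable
            # of responses that `exec(decoupled=True)` returns synchronously.
            infer_responses = await infer_request.async_exec(decoupled=True)

            response_sum = np.array([0])
            for infer_response in infer_responses:
                if infer_response.has_error():
                    raise pb_utils.TritonModelException(
                        infer_response.error().message())
                response_sum += pb_utils.get_output_tensor_by_name(
                    infer_response, "OUT").as_numpy()

            responses.append(
                pb_utils.InferenceResponse(
                    output_tensors=[pb_utils.Tensor("SUM", response_sum)]))
        return responses
```

The summation logic is identical to the synchronous example; only the way the
BLS call is awaited differs.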
+ +#include "infer_payload.h" + +namespace triton { namespace backend { namespace python { + +InferPayload::InferPayload(const bool is_decoupled) + : is_decoupled_(is_decoupled) +{ + prev_promise_.reset(new std::promise>()); +} + +InferPayload::~InferPayload() +{ + prev_promise_.reset(); +} + +void +InferPayload::SetPrevPromise( + std::promise>** promise) +{ + prev_promise_.reset(std::move(*promise)); +} + +void +InferPayload::SetValueForPrevPromise( + std::unique_ptr infer_response) +{ + prev_promise_->set_value(std::move(infer_response)); +} + +void +InferPayload::ResetPrevPromise() +{ + prev_promise_.reset(); +} + +void +InferPayload::SetFuture( + std::future>& response_future) +{ + response_future = prev_promise_->get_future(); +} + +bool +InferPayload::IsDecoupled() +{ + return is_decoupled_; +} + +}}} // namespace triton::backend::python diff --git a/src/infer_payload.h b/src/infer_payload.h new file mode 100644 index 00000000..03e9336b --- /dev/null +++ b/src/infer_payload.h @@ -0,0 +1,49 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include "infer_response.h" + +namespace triton { namespace backend { namespace python { + +class InferPayload { + public: + InferPayload(const bool is_decoupled); + ~InferPayload(); + + void SetPrevPromise(std::promise>** promise); + void SetValueForPrevPromise(std::unique_ptr infer_response); + void ResetPrevPromise(); + void SetFuture(std::future>& response_future); + bool IsDecoupled(); + + private: + std::unique_ptr>> prev_promise_; + bool is_decoupled_; +}; + +}}} // namespace triton::backend::python diff --git a/src/infer_request.cc b/src/infer_request.cc index de034aed..47ad9b0d 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -40,11 +40,11 @@ InferRequest::InferRequest( const std::vector>& inputs, const std::set& requested_output_names, const std::string& model_name, const int64_t model_version, - const uint32_t flags, const intptr_t response_factory_address, - const intptr_t request_address) + const uint32_t flags, const int32_t timeout, + const intptr_t response_factory_address, const intptr_t request_address) : request_id_(request_id), correlation_id_(correlation_id), inputs_(inputs), requested_output_names_(requested_output_names), model_name_(model_name), - model_version_(model_version), flags_(flags), + model_version_(model_version), flags_(flags), timeout_(timeout), response_factory_address_(response_factory_address), request_address_(request_address) { @@ -133,6 +133,24 @@ InferRequest::ShmHandle() return shm_handle_; } +int32_t +InferRequest::Timeout() +{ + return timeout_; +} + +void +InferRequest::SetIsDecoupled(const bool is_decoupled) +{ + is_decoupled_ = is_decoupled; +} + +bool +InferRequest::IsDecoupled() +{ + return is_decoupled_; +} + void InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) { @@ -154,6 +172,8 @@ InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) infer_request_shm_ptr_->flags = Flags(); infer_request_shm_ptr_->address = request_address_; infer_request_shm_ptr_->response_factory_address = response_factory_address_; + infer_request_shm_ptr_->is_decoupled = is_decoupled_; + infer_request_shm_ptr_->timeout = timeout_; output_names_handle_shm_ptr_ = reinterpret_cast( @@ -312,6 +332,8 @@ InferRequest::InferRequest( correlation_id_ = infer_request_shm_ptr_->correlation_id; request_address_ = infer_request_shm_ptr_->address; response_factory_address_ = infer_request_shm_ptr_->response_factory_address; + is_decoupled_ = infer_request_shm_ptr_->is_decoupled; + timeout_ = infer_request_shm_ptr_->timeout; #ifdef TRITON_PB_STUB response_sender_ = std::make_shared( @@ -348,15 +370,15 @@ InferRequest::GetResponseSender() return response_sender_; } - -std::shared_ptr -InferRequest::Exec() +std::vector> +InferRequest::Exec(const bool is_decoupled) { ResponseBatch* response_batch = nullptr; bool responses_is_set = false; std::unique_ptr& stub = Stub::GetOrCreateInstance(); std::unique_ptr& shm_pool = stub->SharedMemory(); bi::managed_external_buffer::handle_t* response_handle = nullptr; + std::vector> infer_responses; PythonBackendException pb_exception(std::string{}); std::unique_ptr ipc_message; @@ -374,8 +396,13 @@ InferRequest::Exec() bool has_exception = false; PythonBackendException pb_exception(std::string{}); - ipc_message->Command() = - PYTHONSTUB_CommandType::PYTHONSTUB_InferExecRequest; + if (is_decoupled) { + ipc_message->Command() = + PYTHONSTUB_CommandType::PYTHONSTUB_InferStreamExecRequest; + } else { + ipc_message->Command() = + PYTHONSTUB_CommandType::PYTHONSTUB_InferExecRequest; + } request_batch = shm_pool->Construct( sizeof(RequestBatch) + sizeof(bi::managed_external_buffer::handle_t)); @@ -472,45 +499,62 @@ InferRequest::Exec() if (response_batch->is_error_set) { std::unique_ptr pb_string = PbString::LoadFromSharedMemory(shm_pool, response_batch->error); - return std::make_unique( + auto error_response = std::make_unique( std::vector>{}, std::make_shared(pb_string->String())); + infer_responses.emplace_back(std::move(error_response)); + + return infer_responses; } else { - return std::make_unique( + auto 
error_response = std::make_unique( std::vector>{}, std::make_shared( "An error occurred while performing BLS request.")); + infer_responses.emplace_back(std::move(error_response)); + + return infer_responses; } } } catch (const PythonBackendException& pb_exception) { - return std::make_unique( + auto error_response = std::make_unique( std::vector>{}, std::make_shared(pb_exception.what())); + infer_responses.emplace_back(std::move(error_response)); + + return infer_responses; } if (responses_is_set) { - std::unique_ptr infer_response = - InferResponse::LoadFromSharedMemory( - shm_pool, *response_handle, true /* open cuda handle */); + uint32_t response_count = response_batch->response_size; auto& memory_manager_message_queue = stub->MemoryManagerQueue(); - - for (auto& output_tensor : infer_response->OutputTensors()) { - if (!output_tensor->IsCPU()) { - uint64_t memory_release_id = output_tensor->Memory()->MemoryReleaseId(); - output_tensor->Memory()->SetMemoryReleaseCallback( - [&memory_manager_message_queue, memory_release_id]() { - memory_manager_message_queue->Push(memory_release_id); - }); + for (size_t idx = 0; idx < response_count; idx++) { + std::unique_ptr response = + InferResponse::LoadFromSharedMemory( + shm_pool, response_handle[idx], true /* open cuda handle */); + + for (auto& output_tensor : response->OutputTensors()) { + if (!output_tensor->IsCPU()) { + uint64_t memory_release_id = + output_tensor->Memory()->MemoryReleaseId(); + output_tensor->Memory()->SetMemoryReleaseCallback( + [&memory_manager_message_queue, memory_release_id]() { + memory_manager_message_queue->Push(memory_release_id); + }); + } } + infer_responses.emplace_back(std::move(response)); } - return infer_response; + return infer_responses; } else { - return std::make_unique( + auto error_response = std::make_unique( std::vector>{}, std::make_shared( "An error occurred while performing BLS request.")); + infer_responses.emplace_back(std::move(error_response)); + + return infer_responses; } } diff --git a/src/infer_request.h b/src/infer_request.h index 5fc188a3..0d2fdc93 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,6 +26,7 @@ #pragma once +#include #include #include "infer_response.h" #include "pb_tensor.h" @@ -36,6 +37,8 @@ namespace triton { namespace backend { namespace python { +class Stub; + // // Inference Request // @@ -47,6 +50,8 @@ struct InferRequestShm { uint32_t flags; intptr_t address; intptr_t response_factory_address; + bool is_decoupled; + int32_t timeout; }; class InferRequest { @@ -56,7 +61,8 @@ class InferRequest { const std::vector>& inputs, const std::set& requested_output_names, const std::string& model_name, const int64_t model_version, - const uint32_t flags = 0, const intptr_t response_factory_address = 0, + const uint32_t flags = 0, const int32_t timeout = 0, + const intptr_t response_factory_address = 0, const intptr_t request_address = 0); const std::vector>& Inputs(); @@ -68,9 +74,12 @@ class InferRequest { void SetFlags(uint32_t flags); const std::set& RequestedOutputNames(); bi::managed_external_buffer::handle_t ShmHandle(); + int32_t Timeout(); + bool IsDecoupled(); + void SetIsDecoupled(const bool is_decoupled); #ifdef TRITON_PB_STUB - std::shared_ptr Exec(); + std::vector> Exec(const bool is_decoupled); std::shared_ptr GetResponseSender(); #endif @@ -116,8 +125,10 @@ class InferRequest { std::string model_name_; int64_t model_version_; uint32_t flags_; + int32_t timeout_; intptr_t response_factory_address_; intptr_t request_address_; + bool is_decoupled_; // Shared Memory Data Structures AllocatedSharedMemory infer_request_shm_; diff --git a/src/infer_response.cc b/src/infer_response.cc index 2a4cd3a7..7df8bfd2 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -39,6 +39,22 @@ namespace triton { namespace backend { namespace python { InferResponse::InferResponse( const std::vector>& output_tensors, std::shared_ptr error) + : error_(error), next_response_future_(nullptr) +{ + for (auto& output : output_tensors) { + if (!output) { + throw PythonBackendException( + "Output tensor for inference response should not be empty."); + } + } + + output_tensors_ = output_tensors; +} + +InferResponse::InferResponse( + const std::vector>& output_tensors, + std::promise>* promise, + std::shared_ptr error) : error_(error) { for (auto& output : output_tensors) { @@ -49,6 +65,9 @@ InferResponse::InferResponse( } output_tensors_ = output_tensors; + next_response_future_ = + std::make_unique>>( + promise->get_future()); } std::vector>& @@ -179,6 +198,12 @@ InferResponse::Error() return error_; } +std::unique_ptr>> +InferResponse::GetNextResponse() +{ + return std::move(next_response_future_); +} + #ifndef TRITON_PB_STUB std::shared_ptr InferResponse::Send( diff --git a/src/infer_response.h b/src/infer_response.h index f3b60847..60f87e00 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,6 +26,7 @@ #pragma once +#include #include "pb_error.h" #include "pb_tensor.h" #include "pb_utils.h" @@ -68,6 +69,10 @@ class InferResponse { InferResponse( const std::vector>& output_tensors, std::shared_ptr error = nullptr); + InferResponse( + const std::vector>& output_tensors, + std::promise>* promise, + std::shared_ptr error = nullptr); std::vector>& OutputTensors(); void SaveToSharedMemory( std::unique_ptr& shm_pool, bool copy_gpu = true); @@ -79,6 +84,11 @@ class InferResponse { std::shared_ptr& Error(); bi::managed_external_buffer::handle_t ShmHandle(); void PruneOutputTensors(const std::set& requested_output_names); + std::unique_ptr>> + GetNextResponse(); + void SetNextResponseHandle( + bi::managed_external_buffer::handle_t next_response_handle); + bi::managed_external_buffer::handle_t NextResponseHandle(); #ifndef TRITON_PB_STUB /// Send an inference response. If the response has a GPU tensor, sending the @@ -110,5 +120,9 @@ class InferResponse { AllocatedSharedMemory response_shm_; std::vector, void*>> gpu_output_buffers_; std::unique_ptr deferred_send_callback_; + + std::unique_ptr>> + next_response_future_; }; + }}} // namespace triton::backend::python diff --git a/src/ipc_message.h b/src/ipc_message.h index 8bb5c3d7..a7438133 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -44,6 +44,7 @@ typedef enum PYTHONSTUB_commandtype_enum { PYTHONSTUB_FinalizeResponse, PYTHONSTUB_LoadGPUBuffers, PYTHONSTUB_InferExecRequest, + PYTHONSTUB_InferStreamExecRequest, PYTHONSTUB_InferExecResponse, PYTHONSTUB_ResponseSend, PYTHONSTUB_ResponseClose, diff --git a/src/pb_generator.cc b/src/pb_generator.cc new file mode 100644 index 00000000..19b70e0f --- /dev/null +++ b/src/pb_generator.cc @@ -0,0 +1,62 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "pb_generator.h" + +#include +namespace py = pybind11; + +namespace triton { namespace backend { namespace python { + +ResponseGenerator::ResponseGenerator( + const std::vector>& responses) + : responses_(responses), index_(0) +{ +} + +std::shared_ptr +ResponseGenerator::Next() +{ + if (index_ == responses_.size()) { + throw py::stop_iteration("Iteration is done for the responses."); + } + + return responses_[index_++]; +} + +std::vector>::iterator +ResponseGenerator::Begin() +{ + return responses_.begin(); +} + +std::vector>::iterator +ResponseGenerator::End() +{ + return responses_.end(); +} + +}}} // namespace triton::backend::python diff --git a/src/pb_generator.h b/src/pb_generator.h new file mode 100644 index 00000000..f17808d4 --- /dev/null +++ b/src/pb_generator.h @@ -0,0 +1,47 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include "infer_response.h" + +namespace triton { namespace backend { namespace python { + +class ResponseGenerator { + public: + ResponseGenerator( + const std::vector>& responses); + + std::shared_ptr Next(); + std::vector>::iterator Begin(); + std::vector>::iterator End(); + + private: + std::vector> responses_; + size_t index_; +}; + +}}} // namespace triton::backend::python diff --git a/src/pb_stub.cc b/src/pb_stub.cc index c2e15340..113bb9d4 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -43,6 +43,7 @@ #include #include "infer_response.h" #include "pb_error.h" +#include "pb_generator.h" #include "pb_map.h" #include "pb_string.h" #include "pb_utils.h" @@ -1065,14 +1066,15 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) const std::vector>& inputs, const std::vector& requested_output_names, const std::string& model_name, - const int64_t model_version, const uint32_t flags) { + const int64_t model_version, const uint32_t flags, + const int32_t timeout) { std::set requested_outputs; for (auto& requested_output_name : requested_output_names) { requested_outputs.emplace(requested_output_name); } return std::make_shared( request_id, correlation_id, inputs, requested_outputs, - model_name, model_version, flags); + model_name, model_version, flags, timeout); }), py::arg("request_id").none(false) = "", py::arg("correlation_id").none(false) = 0, @@ -1080,7 +1082,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::arg("requested_output_names").none(false), py::arg("model_name").none(false), py::arg("model_version").none(false) = -1, - py::arg("flags").none(false) = 0) + py::arg("flags").none(false) = 0, py::arg("timeout").none(false) = 0) .def( "inputs", &InferRequest::Inputs, py::return_value_policy::reference_internal) @@ -1088,10 +1090,27 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) .def("correlation_id", &InferRequest::CorrelationId) .def("flags", &InferRequest::Flags) .def("set_flags", &InferRequest::SetFlags) - .def("exec", &InferRequest::Exec) + .def("timeout", &InferRequest::Timeout) + .def( + "exec", + [](std::shared_ptr& infer_request, + const bool decoupled) { + std::vector> responses = + infer_request->Exec(decoupled); + py::object response_object; + if (decoupled) { + response_object = py::cast(ResponseGenerator(responses)); + } else { + response_object = py::cast(responses[0]); + } + + return response_object; + }, + py::arg("decoupled").none(false) = false) .def( "async_exec", - [](std::shared_ptr& infer_request) { + [](std::shared_ptr& infer_request, + const bool decoupled) { std::unique_ptr& stub = Stub::GetOrCreateInstance(); if (stub->IsDecoupled()) { throw PythonBackendException( @@ -1100,14 +1119,23 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) } py::object loop = py::module_::import("asyncio").attr("get_running_loop")(); - py::cpp_function callback = [infer_request]() { - auto response = infer_request->Exec(); - return response; + py::cpp_function callback = [infer_request, decoupled]() { + std::vector> responses = + infer_request->Exec(decoupled); + py::object response_object; + if (decoupled) { + response_object = py::cast(ResponseGenerator(responses)); + } else { + response_object = py::cast(responses[0]); + } + + return response_object; }; py::object future = loop.attr("run_in_executor")(py::none(), callback); return future; - }) + }, + py::arg("decoupled").none(false) = false) .def( "requested_output_names", &InferRequest::RequestedOutputNames, py::return_value_policy::reference_internal) @@ -1149,6 +1177,17 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) "send", &ResponseSender::Send, py::arg("response") = nullptr, py::arg("flags") = 0); + py::class_>( + module, "ResponseGenerator") + .def(py::init>&>()) + .def( + "__iter__", + [](ResponseGenerator& self) { + return 
py::make_iterator(self.Begin(), self.End()); + }, + py::keep_alive<0, 1>()) + .def("__next__", &ResponseGenerator::Next); + py::class_ logger(module, "Logger"); py::enum_(logger, "LogLevel") .value("INFO", LogLevel::INFO) diff --git a/src/pb_utils.h b/src/pb_utils.h index 2fde0cf8..6ab1920b 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -152,6 +152,8 @@ struct ResponseBatch { // Indicates whether this error has a message or not. bool is_error_set; + + uint32_t response_size; }; struct LogSendMessageBase { diff --git a/src/python_be.cc b/src/python_be.cc index 59941683..d13e360a 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -25,6 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "python_be.h" +#include "infer_payload.h" #include "pb_log.h" namespace triton { namespace backend { namespace python { @@ -373,12 +374,13 @@ ModelInstanceState::SaveRequestsToSharedMemory( infer_request = std::make_unique( id, correlation_id, pb_input_tensors, requested_output_names, model_state->Name(), model_state->Version(), flags, - reinterpret_cast(factory_ptr), + 0 /* BLS request timeout*/, reinterpret_cast(factory_ptr), reinterpret_cast(request)); } else { infer_request = std::make_unique( id, correlation_id, pb_input_tensors, requested_output_names, - model_state->Name(), model_state->Version(), flags, 0, + model_state->Name(), model_state->Version(), flags, + 0 /* BLS request timeout*/, 0 /* response_factory_address */, reinterpret_cast(request)); } @@ -595,15 +597,33 @@ ModelInstanceState::GetInputTensor( } void -ModelInstanceState::ExecuteBLSRequest(std::shared_ptr ipc_message) +ModelInstanceState::GetBLSResponses( + std::vector>& responses, + std::future> future) +{ + responses.push_back(future.get()); + size_t size = responses.size(); + for (size_t i = 0; i < size; i++) { + if (responses[i]) { + auto next_future = responses[i]->GetNextResponse(); + if (next_future) { + responses.push_back(next_future->get()); + size++; + } + } + } +} + +void +ModelInstanceState::ExecuteBLSRequest( + std::shared_ptr ipc_message, const bool is_decoupled) { ModelState* model_state = reinterpret_cast(Model()); auto request_executor = std::make_unique( Stub()->ShmPool(), model_state->TritonServer()); bool is_response_batch_set = false; - std::unique_ptr infer_response; + std::vector> infer_responses; ResponseBatch* response_batch; - TRITONSERVER_InferenceResponse* inference_response = nullptr; std::unique_ptr pb_error_message; std::unique_ptr bls_response; AllocatedSharedMemory response_batch_shm; @@ -633,6 +653,7 @@ ModelInstanceState::ExecuteBLSRequest(std::shared_ptr ipc_message) response_batch->has_error = false; response_batch->is_error_set = false; response_batch->cleanup = false; + response_batch->response_size = 1; is_response_batch_set = true; bool has_gpu_tensor = false; @@ -713,15 +734,44 @@ 
ModelInstanceState::ExecuteBLSRequest(std::shared_ptr ipc_message) } if (pb_exception.what() != nullptr) { - infer_response = - request_executor->Infer(infer_request, &inference_response); + std::shared_ptr infer_payload = + std::make_shared(is_decoupled); + auto response_future = + request_executor->Infer(infer_request, infer_payload); + GetBLSResponses(infer_responses, std::move(response_future)); + + size_t response_length = infer_responses.size(); + // It is possible that the last response from the decoupled model is an + // empty response. + if (infer_responses.back() == nullptr) { + response_length--; + } + + if (response_length != 1) { + // Construct the response_batch_shm based on the length of the + // responses for decoupled support. + response_batch_shm = Stub()->ShmPool()->Construct( + sizeof(ResponseBatch) + + response_length * sizeof(bi::managed_external_buffer::handle_t)); + response_batch = + reinterpret_cast(response_batch_shm.data_.get()); + response_handle = + reinterpret_cast( + response_batch_shm.data_.get() + sizeof(ResponseBatch)); + bls_response->Args() = response_batch_shm.handle_; + response_batch->batch_size = 1; + response_batch->has_error = false; + response_batch->is_error_set = false; + response_batch->cleanup = false; + response_batch->response_size = response_length; + } - if (infer_response) { - infer_response->SaveToSharedMemory(Stub()->ShmPool()); + for (size_t i = 0; i < response_length; i++) { + infer_responses[i]->SaveToSharedMemory(Stub()->ShmPool()); - for (auto& output_tensor : infer_response->OutputTensors()) { - // For GPU tensors we need to store the memory release id in memory - // manager. + for (auto& output_tensor : infer_responses[i]->OutputTensors()) { + // For GPU tensors we need to store the memory release id in + // memory manager. if (!output_tensor->IsCPU()) { #ifdef TRITON_ENABLE_GPU std::unique_ptr gpu_memory_record = @@ -734,9 +784,8 @@ ModelInstanceState::ExecuteBLSRequest(std::shared_ptr ipc_message) #endif } } - *response_handle = infer_response->ShmHandle(); + response_handle[i] = infer_responses[i]->ShmHandle(); } - } else { throw pb_exception; } @@ -766,12 +815,6 @@ ModelInstanceState::ExecuteBLSRequest(std::shared_ptr ipc_message) ipc_message->ResponseCondition()->notify_all(); ipc_message->ResponseCondition()->wait(lock); } - - if (inference_response != nullptr) { - LOG_IF_ERROR( - TRITONSERVER_InferenceResponseDelete(inference_response), - " failed to release BLS inference response."); - } } void @@ -800,10 +843,15 @@ ModelInstanceState::DecoupledMessageQueueMonitor() std::future future = boost::asio::post(*thread_pool_, std::move(task)); futures_.emplace_back(std::move(future)); - } else if (message->Command() == PYTHONSTUB_InferExecRequest) { + } else if ( + message->Command() == PYTHONSTUB_InferExecRequest || + message->Command() == PYTHONSTUB_InferStreamExecRequest) { std::shared_ptr bls_execute = std::move(message); - std::packaged_task task( - [this, bls_execute] { ExecuteBLSRequest(bls_execute); }); + std::packaged_task task([this, bls_execute] { + ExecuteBLSRequest( + bls_execute, + (bls_execute->Command() == PYTHONSTUB_InferStreamExecRequest)); + }); std::future future = boost::asio::post(*thread_pool_, std::move(task)); futures_.emplace_back(std::move(future)); @@ -1188,9 +1236,15 @@ ModelInstanceState::ProcessRequests( // requests to execute. Otherwise, the Python backend will continuosly execute // BLS requests pushed to the message queue. 
while (ipc_message->Command() == - PYTHONSTUB_CommandType::PYTHONSTUB_InferExecRequest) { - std::packaged_task task( - [this, ipc_message] { ExecuteBLSRequest(ipc_message); }); + PYTHONSTUB_CommandType::PYTHONSTUB_InferExecRequest || + ipc_message->Command() == + PYTHONSTUB_CommandType::PYTHONSTUB_InferStreamExecRequest) { + std::packaged_task task([this, ipc_message] { + ExecuteBLSRequest( + ipc_message, + (ipc_message->Command() == + PYTHONSTUB_CommandType::PYTHONSTUB_InferStreamExecRequest)); + }); std::future future = boost::asio::post(*thread_pool_, std::move(task)); futures_.emplace_back(std::move(future)); diff --git a/src/python_be.h b/src/python_be.h index bdb35b57..9aa1dbd8 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -309,7 +309,7 @@ class ModelInstanceState : public BackendModelInstance { void DecoupledMessageQueueMonitor(); // This function is executed on a separate thread and monitors the log message - // queue. When it receives a message from the stub, it will load it from + // queue. When it receives a message from the stub, it will load it from // shared memory and log it using the triton server core logging facilities. void LogMessageQueueMonitor(); @@ -333,7 +333,8 @@ class ModelInstanceState : public BackendModelInstance { bool ExistsInClosedRequests(intptr_t closed_request); // Execute a BLS Request - void ExecuteBLSRequest(std::shared_ptr ipc_message); + void ExecuteBLSRequest( + std::shared_ptr ipc_message, const bool is_stream); // Cleanup BLS responses void CleanupBLSResponses(); @@ -341,6 +342,11 @@ class ModelInstanceState : public BackendModelInstance { // Wait for BLS requests to complete void WaitForBLSRequestsToFinish(); + // Get BLS responses + void GetBLSResponses( + std::vector>& responses, + std::future> future); + // Check the incoming requests for errors TRITONSERVER_Error* CheckIncomingRequests( TRITONBACKEND_Request** requests, const uint32_t request_count, diff --git a/src/request_executor.cc b/src/request_executor.cc index 4e1212df..8e020596 100644 --- a/src/request_executor.cc +++ b/src/request_executor.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -56,12 +56,112 @@ void InferResponseComplete( TRITONSERVER_InferenceResponse* response, const uint32_t flags, void* userp) { + auto p = reinterpret_cast*>(userp); + std::unique_ptr infer_response; + std::vector> output_tensors; + std::shared_ptr pb_error; + if (response != nullptr) { - // Send 'response' to the future. 
- std::promise* p = - reinterpret_cast*>(userp); - p->set_value(response); - delete p; + try { + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceResponseError(response)); + + uint32_t output_count; + THROW_IF_TRITON_ERROR( + TRITONSERVER_InferenceResponseOutputCount(response, &output_count)); + + for (uint32_t idx = 0; idx < output_count; ++idx) { + const char* cname; + TRITONSERVER_DataType datatype; + const int64_t* shape; + uint64_t dim_count; + const void* base; + size_t byte_size; + TRITONSERVER_MemoryType memory_type; + int64_t memory_type_id; + void* userp; + + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceResponseOutput( + response, idx, &cname, &datatype, &shape, &dim_count, &base, + &byte_size, &memory_type, &memory_type_id, &userp)); + std::string sname = cname; + std::vector dims_vector{shape, shape + dim_count}; + + // userp is only set for the CPU tensors + if (memory_type != TRITONSERVER_MEMORY_GPU) { + if (byte_size != 0) { + std::shared_ptr pb_tensor = std::make_shared( + sname, dims_vector, datatype, memory_type, memory_type_id, + const_cast(base), byte_size, + nullptr /* DLManagedTensor */); + + // Load the data so that it is deallocated automatically. + std::unique_ptr pb_memory( + reinterpret_cast(userp)); + pb_tensor->SetMemory(std::move(pb_memory)); + output_tensors.push_back(pb_tensor); + } else { + output_tensors.push_back(std::make_shared( + sname, dims_vector, datatype, memory_type, memory_type_id, + const_cast(base), byte_size, + nullptr /* DLManagedTensor */)); + } + } else { + output_tensors.push_back(std::make_shared( + sname, dims_vector, datatype, memory_type, memory_type_id, + const_cast(base), byte_size, + nullptr /* DLManagedTensor */)); + } + } + } + catch (const PythonBackendException& pb_exception) { + if (response != nullptr) { + LOG_IF_ERROR( + TRITONSERVER_InferenceResponseDelete(response), + "Failed to delete inference response."); + + response = nullptr; + } + pb_error = std::make_shared(pb_exception.what()); + output_tensors.clear(); + } + + if (!(*p)->IsDecoupled()) { + infer_response = + std::make_unique(output_tensors, pb_error); + (*p)->SetValueForPrevPromise(std::move(infer_response)); + (*p)->ResetPrevPromise(); + } else { + if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { + // Not the last reponse. Need to store the promise associated with the + // next future. + auto promise = new std::promise>(); + infer_response = + std::make_unique(output_tensors, promise, pb_error); + (*p)->SetValueForPrevPromise(std::move(infer_response)); + (*p)->SetPrevPromise(&promise); + } else { + // The last response. + infer_response = + std::make_unique(output_tensors, pb_error); + (*p)->SetValueForPrevPromise(std::move(infer_response)); + (*p)->ResetPrevPromise(); + } + } + + LOG_IF_ERROR( + TRITONSERVER_InferenceResponseDelete(response), + "Failed to release BLS inference response."); + } else if ( + (*p)->IsDecoupled() && + (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) { + // An empty response may be the last reponse for decoupled models. 
+ (*p)->SetValueForPrevPromise(std::unique_ptr{}); + (*p)->ResetPrevPromise(); + } else { + pb_error = std::make_shared("Unexpected empty response."); + infer_response = std::make_unique(output_tensors, pb_error); + (*p)->SetValueForPrevPromise(std::move(infer_response)); + (*p)->ResetPrevPromise(); } } @@ -171,21 +271,16 @@ RequestExecutor::RequestExecutor( response_allocator_ = allocator; } -std::unique_ptr +std::future> RequestExecutor::Infer( - const std::shared_ptr& infer_request, - TRITONSERVER_InferenceResponse** triton_response) + std::shared_ptr& infer_request, + std::shared_ptr& infer_payload) { + std::future> response_future; std::unique_ptr infer_response; bool is_ready = false; const char* model_name = infer_request->ModelName().c_str(); TRITONSERVER_InferenceRequest* irequest = nullptr; - TRITONSERVER_InferenceResponse* response = nullptr; - - // This variable indicates whether the InferenceRequest should be deleted as a - // part of the catch block or it will be automatically deleted using the - // InferResponseComplete callback. - bool delete_inference_request = true; try { int64_t model_version = infer_request->ModelVersion(); @@ -202,13 +297,17 @@ RequestExecutor::Infer( uint32_t txn_flags; THROW_IF_TRITON_ERROR(TRITONSERVER_ServerModelTransactionProperties( server_, model_name, model_version, &txn_flags, nullptr /* voidp */)); + infer_request->SetIsDecoupled( + (txn_flags & TRITONSERVER_TXN_DECOUPLED) != 0); - // Decoupled API is not supported in the current BLS interface - if ((txn_flags & TRITONSERVER_TXN_DECOUPLED) != 0) { + if (!infer_payload->IsDecoupled() && infer_request->IsDecoupled()) { + // Decoupled API is only supported by using stream API throw PythonBackendException( std::string("Model ") + model_name + - " is using the decoupled. BLS doesn't support models using the " - "decoupled transaction policy."); + " is using the decoupled. The current BLS request call doesn't " + "support models using the decoupled transaction policy. Please use " + "stream API 'stream_exec()' or 'async_stream_exec() for decoupled " + "models.'"); } // Inference @@ -224,6 +323,9 @@ RequestExecutor::Infer( THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetFlags( irequest, infer_request->Flags())); + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetTimeoutMicroseconds( + irequest, infer_request->Timeout())); + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetReleaseCallback( irequest, InferRequestComplete, nullptr /* request_release_userp */)); @@ -245,98 +347,27 @@ RequestExecutor::Infer( } { - auto p = new std::promise(); - std::future completed = p->get_future(); + infer_payload->SetFuture(response_future); THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetResponseCallback( irequest, response_allocator_, shm_pool_.get(), InferResponseComplete, - reinterpret_cast(p))); + reinterpret_cast(&infer_payload))); THROW_IF_TRITON_ERROR(TRITONSERVER_ServerInferAsync( server_, irequest, nullptr /* trace */)); - - // Wait for the inference to complete. 
- response = completed.get(); - *triton_response = response; - delete_inference_request = false; - THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceResponseError(response)); - - uint32_t output_count; - THROW_IF_TRITON_ERROR( - TRITONSERVER_InferenceResponseOutputCount(response, &output_count)); - - std::vector> output_tensors; - for (uint32_t idx = 0; idx < output_count; ++idx) { - const char* cname; - TRITONSERVER_DataType datatype; - const int64_t* shape; - uint64_t dim_count; - const void* base; - size_t byte_size; - TRITONSERVER_MemoryType memory_type; - int64_t memory_type_id; - void* userp; - - THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceResponseOutput( - response, idx, &cname, &datatype, &shape, &dim_count, &base, - &byte_size, &memory_type, &memory_type_id, &userp)); - std::string sname = cname; - std::vector dims_vector{shape, shape + dim_count}; - - // userp is only set for the CPU tensors - if (memory_type != TRITONSERVER_MEMORY_GPU) { - if (byte_size != 0) { - std::shared_ptr pb_tensor = std::make_shared( - sname, dims_vector, datatype, memory_type, memory_type_id, - const_cast(base), byte_size, - nullptr /* DLManagedTensor */); - - // Load the data so that it is deallocated automatically. - std::unique_ptr pb_memory( - reinterpret_cast(userp)); - pb_tensor->SetMemory(std::move(pb_memory)); - output_tensors.push_back(pb_tensor); - } else { - output_tensors.push_back(std::make_shared( - sname, dims_vector, datatype, memory_type, memory_type_id, - const_cast(base), byte_size, - nullptr /* DLManagedTensor */)); - } - } else { - output_tensors.push_back(std::make_shared( - sname, dims_vector, datatype, memory_type, memory_type_id, - const_cast(base), byte_size, - nullptr /* DLManagedTensor */)); - } - } - - std::shared_ptr pb_error; - infer_response = - std::make_unique(output_tensors, pb_error); } } catch (const PythonBackendException& pb_exception) { - if (response != nullptr) { - LOG_IF_ERROR( - TRITONSERVER_InferenceResponseDelete(response), - "Failed to delete inference response."); - - *triton_response = nullptr; - } - - if (delete_inference_request) { - LOG_IF_ERROR( - TRITONSERVER_InferenceRequestDelete(irequest), - "Failed to delete inference request."); - } + LOG_IF_ERROR( + TRITONSERVER_InferenceRequestDelete(irequest), + "Failed to delete inference request."); - std::shared_ptr pb_error = - std::make_shared(pb_exception.what()); - infer_response = std::make_unique( - std::vector>{}, pb_error); + throw PythonBackendException( + std::string("Model ") + model_name + + " - Error when running inference: " + pb_exception.what()); } - return infer_response; + return response_future; } RequestExecutor::~RequestExecutor() diff --git a/src/request_executor.h b/src/request_executor.h index bb3e1e60..56ed5ca5 100644 --- a/src/request_executor.h +++ b/src/request_executor.h @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -27,6 +27,7 @@ #pragma once #include +#include "infer_payload.h" #include "infer_request.h" #include "infer_response.h" @@ -41,9 +42,9 @@ class RequestExecutor { std::unique_ptr& shm_pool_; public: - std::unique_ptr Infer( - const std::shared_ptr& infer_request, - TRITONSERVER_InferenceResponse** response); + std::future> Infer( + std::shared_ptr& infer_request, + std::shared_ptr& infer_payload); RequestExecutor( std::unique_ptr& shm_pool, From 2bd5a05226b773dcc46369497444a5dae6b3fe2a Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 6 Mar 2023 15:19:04 -0500 Subject: [PATCH 078/216] Add documentation for shared memory region name prefix (#210) * Add documentation for shared memory region name prefix * Review edit --- README.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.md b/README.md index 14ecfe4b..4e5098e2 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ any C++ code. - [Error Handling](#error-handling) - [Managing Shared Memory](#managing-shared-memory) - [Multiple Model Instance Support](#multiple-model-instance-support) + - [Running Multiple Instances of Triton Server](#running-multiple-instances-of-triton-server) - [Business Logic Scripting](#business-logic-scripting) - [Using BLS with Stateful Models](#using-bls-with-stateful-models) - [Limitation](#limitation) @@ -823,6 +824,26 @@ and [PyTorch](https://github.com/triton-inference-server/pytorch_backend) handle multiple instances. Increasing the instance count for these backends will create additional threads instead of spawning separate processes. +## Running Multiple Instances of Triton Server + +Python backend uses shared memory to transfer requests to the stub process. +When running multiple instances of Triton Server on the same machine that use +Python models, there would be shared memory region name conflicts that can +result in segmentation faults or hangs. In order to avoid this issue, you need +to specify different `shm-region-prefix-name` using the `--backend-config` flag. + +``` +# Triton instance 1 +tritonserver --model-repository=/models --backend-config=python,shm-region-prefix-name=prefix1 + +# Triton instance 2 +tritonserver --model-repository=/models --backend-config=python,shm-region-prefix-name=prefix2 +``` + +Note that the hangs would only occur if the `/dev/shm` is shared between +the two instances of the server. If you run the servers in different containers that +don't share this location, you don't need to specify `shm-region-prefix-name`. + # Business Logic Scripting Triton's From 23d2b8f073b73def7a926bfd29aa5b0cb5bf4a4d Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Wed, 8 Mar 2023 08:10:49 -0800 Subject: [PATCH 079/216] Enhancement for BLS decoupled support (#208) * WIP: use bls_response_queue to initiate sending decoupled responses from PYBE to stub * Return single InferResponse object from InferRequest.exec() * Complete the implementation * Rename the logging-specific objects * Clean up completed object from bls decoupled response * Renaming * Move the GIL * Address comment * Remove timeout and while loop when pushing message to ParentToStubMessageQueue * Fix up * Make request_executor a member variable * Return self in __iter__. Make it possible to call for loop multiple times to get responses. 
* Rename ResponseGenerator to ResponseIterator * Address comment * Revert previous changes to retrieve all the responses in ResponseIterator destructor * Return the last empty response --- CMakeLists.txt | 4 +- README.md | 30 +- examples/bls_decoupled/async_model.py | 6 +- examples/bls_decoupled/sync_model.py | 6 +- src/infer_payload.cc | 31 +- src/infer_payload.h | 12 +- src/infer_request.cc | 45 +-- src/infer_request.h | 2 +- src/infer_response.cc | 50 ++- src/infer_response.h | 21 +- src/ipc_message.h | 4 +- src/pb_generator.cc | 62 --- src/pb_log.cc | 6 +- src/pb_response_iterator.cc | 147 +++++++ ...{pb_generator.h => pb_response_iterator.h} | 21 +- src/pb_stub.cc | 320 ++++++++++++--- src/pb_stub.h | 64 ++- src/pb_utils.h | 24 +- src/python_be.cc | 380 +++++++++++------- src/python_be.h | 48 ++- src/request_executor.cc | 39 +- src/stub_launcher.cc | 45 ++- src/stub_launcher.h | 23 +- 23 files changed, 917 insertions(+), 473 deletions(-) delete mode 100644 src/pb_generator.cc create mode 100644 src/pb_response_iterator.cc rename src/{pb_generator.h => pb_response_iterator.h} (80%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a5406bc..26221055 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -196,8 +196,8 @@ set( src/response_sender.h src/pb_stub.h src/pb_stub.cc - src/pb_generator.h - src/pb_generator.cc + src/pb_response_iterator.h + src/pb_response_iterator.cc ) list(APPEND diff --git a/README.md b/README.md index 4e5098e2..5332f6c4 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ + +# Model Instance Kind Example + +Triton model configuration allows users to provide kind to [instance group +settings.](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) +A python backend model can be written to respect the kind setting to control +the execution of a model instance either on CPU or GPU. + +In this example, we demonstrate how this can be achieved for your python model. +We will use a `ResNet50` model as our base model for this example. + +## Create a ResNet50 model repository + +We will use the files that come with this example to create the model +repository. + +First, download the [client.py](client.py), [config.pbtxt](config.pbtxt), +[resnet50_labels.txt](resnet50_labels.txt), and [model.py](model.py) +to your local machine. + +Next, in the same directory with the four aformentioned files, create the model +repository with the following commands: +``` +mkdir -p models/resnet50/1 && +mv model.py models/resnet50/1/ && +mv config.pbtxt models/resnet50/ +``` + +## Pull the Triton Docker images + +We need to install Docker and NVIDIA Container Toolkit before proceeding, refer +to the +[installation steps](https://github.com/triton-inference-server/server/tree/main/docs#installation). + +To pull the latest containers, run the following commands: +``` +docker pull nvcr.io/nvidia/tritonserver:-py3 +docker pull nvcr.io/nvidia/tritonserver:-py3-sdk +``` +See the installation steps above for the `` version. + +For example, if the latest version is `23.01`, the above commands translate +to the following: +``` +docker pull nvcr.io/nvidia/tritonserver:23.01-py3 +docker pull nvcr.io/nvidia/tritonserver:23.01-py3-sdk +``` + +Be sure to replace the `` with the version pulled for all the remaining +parts of this example. 
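+Before moving on, it may help to verify the layout produced by the
+repository-creation commands earlier in this example. Assuming the four
+downloaded files were kept in one working directory, it should look roughly
+like this:
+
+```
+.
+├── client.py
+├── resnet50_labels.txt
+└── models
+    └── resnet50
+        ├── 1
+        │   └── model.py
+        └── config.pbtxt
+```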
+ +## Start the Triton Server + +At the directory where we copied our resnet50 model (at where the "models" +folder is located), run the following command: +``` +docker run --gpus all --shm-size 1G -it --rm -p 8000:8000 -v `pwd`:/instance_kind nvcr.io/nvidia/tritonserver:-py3 /bin/bash +``` + +Inside the container, we need to install `torch` and `pillow` to run +this example. We recommend to use `pip` method for the installation: + +``` +pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html pillow +``` + +Finally, we need to start the Triton Server: +``` +tritonserver --model-repository /instance_kind/models +``` + +To leave the container for the next step, press: `CTRL + P + Q`. + +## Start the Triton SDK Container and Test Inference + +To start the sdk container, run the following command: +``` +docker run --gpus all --network=host --pid=host --ipc=host -v `pwd`:/instance_kind -ti nvcr.io/nvidia/tritonserver:-py3-sdk /bin/bash +``` + +The `client.py` requires the following packages to be installed: `torch`, +`torchvision`, `pillow` and `validators`. Similarly, we recommend to use `pip` +method for the installation: + +``` +pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html torchvision==0.14.0+cu117 pillow validators +``` + +Finally, let's test an inference call with the following command: +``` +python client.py +``` +On a first run, a successful inference will print the following at the end: +``` +Downloading: "https://github.com/NVIDIA/DeepLearningExamples/zipball/torchhub" to /root/.cache/torch/hub/torchhub.zip +Results is class: TABBY +PASS: ResNet50 +``` +It may take some time due to `torchhub` downloads, but any future calls +will be quicker, since the client will use already downloaded artifacts. + +## Test Instance Kind + +Provided `config.pbtxt` sets the instance group setting to `KIND_CPU`, +which enables the execution of a model on the CPU. +To test that your model is actually loaded onto CPU, run the following: +``` +python client.py -v +``` +The `-v` argument asks the client to request model's confiuration from +the server and prints it in your console: +``` +{ + ..., + "instance_group": [ + { + "name": "resnet50_0", + "kind": "KIND_CPU", + "count": 1, + "gpus": [], + "secondary_devices": [], + "profile": [], + "passive": false, + "host_policy": "" + } + ], + ... +} +Results is class: TABBY +PASS: ResNet50 instance kind +``` + +Based on the printed model config, we can see that `instance_group` field +has `kind` entry, which is set to `KIND_CPU`. + +To change an `instance_group` parameter to `KIND_GPU`, a user can simply replace +`KIND_CPU` with `KIND_GPU` in the `config.pbtxt`. After restarting the server +with an updated config file, a successful inference request with `-v` argument +will result into the similar output, but with an updated `instance_group` entry: +``` +{ + ..., + "instance_group": [ + { + "name": "resnet50_0", + "kind": "KIND_GPU", + "count": 1, + "gpus": [ + 0 + ], + "secondary_devices": [], + "profile": [], + "passive": false, + "host_policy": "" + } + ], + ... +} +Results is class: TABBY +PASS: ResNet50 instance kind +``` +It is also possible to load multiple model instances on CPU and GPU +if neccessary. + +Below the instance group setting will create two model instances, +one on CPU and other on GPU. 
+``` +instance_group [{ kind: KIND_CPU }, { kind: KIND_GPU}] +``` + +For more information on possible model configurations, +check out the Triton Server documentation [here](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#model-configuration) \ No newline at end of file diff --git a/examples/instance_kind/client.py b/examples/instance_kind/client.py new file mode 100644 index 00000000..376ee47f --- /dev/null +++ b/examples/instance_kind/client.py @@ -0,0 +1,109 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import json +import sys +import warnings + +import numpy as np +import torch +import tritonclient.http as httpclient +from tritonclient.utils import * + +warnings.filterwarnings('ignore') + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model_name", + type=str, + required=False, + default="resnet50", + help="Model name") + parser.add_argument("--image_url", + type=str, + required=False, + default=\ + "http://images.cocodataset.org/test2017/000000557146.jpg", + help=\ + "Image URL. Default is:\ + http://images.cocodataset.org/test2017/000000557146.jpg" + ) + parser.add_argument("--url", + type=str, + required=False, + default="localhost:8000", + help="Inference server URL. 
Default is localhost:8000.") + parser.add_argument('-v', + "--verbose", + action="http://github.com/store_true", + required=False, + default=False, + help='Enable verbose output') + parser.add_argument("--label_file", + type=str, + required=False, + default="./resnet50_labels.txt", + help="Path to the file with text representation \ + of available labels") + args = parser.parse_args() + + utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', + 'nvidia_convnets_processing_utils', + skip_validation=True) + + try: + triton_client = httpclient.InferenceServerClient(args.url) + except Exception as e: + print("channel creation failed: " + str(e)) + sys.exit(1) + + with open(args.label_file) as f: + labels_dict = {idx: line.strip() for idx, line in enumerate(f)} + + if args.verbose: + print( + json.dumps(triton_client.get_model_config(args.model_name), + indent=4)) + + input_name = "INPUT" + output_name = "OUTPUT" + batch = np.asarray(utils.prepare_input_from_uri(args.image_url)) + + input = httpclient.InferInput(input_name, batch.shape, "FP32") + output = httpclient.InferRequestedOutput(output_name) + + input.set_data_from_numpy(batch) + results = triton_client.infer(model_name=args.model_name, + inputs=[input], + outputs=[output]) + + output_data = results.as_numpy(output_name) + max_id = np.argmax(output_data, axis=1)[0] + print("Results is class: {}".format(labels_dict[max_id])) + + print('PASS: ResNet50 instance kind') + sys.exit(0) diff --git a/examples/instance_kind/config.pbtxt b/examples/instance_kind/config.pbtxt new file mode 100755 index 00000000..f3aee058 --- /dev/null +++ b/examples/instance_kind/config.pbtxt @@ -0,0 +1,42 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "resnet50" +backend: "python" +max_batch_size: 128 +input { + name: "INPUT" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [ 3, 224, 224 ] + } +output { + name: "OUTPUT" + data_type: TYPE_FP32 + dims: [ 1000 ] + } + +instance_group [{ kind: KIND_CPU }] diff --git a/examples/instance_kind/model.py b/examples/instance_kind/model.py new file mode 100644 index 00000000..6ebfb6bc --- /dev/null +++ b/examples/instance_kind/model.py @@ -0,0 +1,70 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import torch +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import to_dlpack + + +class TritonPythonModel: + + def initialize(self, args): + """ + This function initializes pre-trained ResNet50 model, + depending on the value specified by an `instance_group` parameter + in `config.pbtxt`. + + Depending on what `instance_group` was specified in + the config.pbtxt file (KIND_CPU or KIND_GPU), the model instance + will be initialised on a cpu, a gpu, or both. If `instance_group` was + not specified in the config file, then models will be loaded onto + the default device of the framework. + """ + self.device = 'cuda' if args["model_instance_kind"] == "GPU" else 'cpu' + self.model = torch.hub.load("pytorch/vision", + "resnet50", + weights="IMAGENET1K_V2", + skip_validation=True)\ + .to(self.device)\ + .eval() + + def execute(self, requests): + """ + This function receives a list of requests (`pb_utils.InferenceRequest`), + performs inference on every request and appends it to responses. 
+ """ + responses = [] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT") + with torch.no_grad(): + result = self.model( + torch.as_tensor(input_tensor.as_numpy(), + device=self.device)) + out_tensor = pb_utils.Tensor.from_dlpack("OUTPUT", + to_dlpack(result)) + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses diff --git a/examples/instance_kind/resnet50_labels.txt b/examples/instance_kind/resnet50_labels.txt new file mode 100755 index 00000000..e59113f7 --- /dev/null +++ b/examples/instance_kind/resnet50_labels.txt @@ -0,0 +1,1000 @@ +TENCH +GOLDFISH +WHITE SHARK +TIGER SHARK +HAMMERHEAD SHARK +ELECTRIC RAY +STINGRAY +ROOSTER +HEN +OSTRICH +BRAMBLING +GOLDFINCH +HOUSE FINCH +SNOWBIRD +INDIGO FINCH +ROBIN +BULBUL +JAY +MAGPIE +CHICKADEE +WATER OUZEL +KITE +BALD EAGLE +VULTURE +GREAT GREY OWL +FIRE SALAMANDER +NEWT +EFT +SPOTTED SALAMANDER +AXOLOTL +BULL FROG +TREE FROG +TAILED FROG +LOGGERHEAD +LEATHERBACK TURTLE +MUD TURTLE +TERRAPIN +BOX TURTLE +BANDED GECKO +COMMON IGUANA +AMERICAN CHAMELEON +WHIPTAIL +AGAMA +FRILLED LIZARD +ALLIGATOR LIZARD +GILA MONSTER +GREEN LIZARD +AFRICAN CHAMELEON +KOMODO DRAGON +AFRICAN CROCODILE +AMERICAN ALLIGATOR +TRICERATOPS +THUNDER SNAKE +RINGNECK SNAKE +HOGNOSE SNAKE +GREEN SNAKE +KING SNAKE +GARTER SNAKE +WATER SNAKE +VINE SNAKE +NIGHT SNAKE +BOA +ROCK PYTHON +COBRA +GREEN MAMBA +SEA SNAKE +HORNED VIPER +DIAMONDBACK +SIDEWINDER +TRILOBITE +HARVESTMAN +SCORPION +GARDEN SPIDER +BARN SPIDER +GARDEN SPIDER +BLACK WIDOW +TARANTULA +WOLF SPIDER +TICK +CENTIPEDE +GROUSE +PTARMIGAN +RUFFED GROUSE +PRAIRIE CHICKEN +PEACOCK +QUAIL +PARTRIDGE +AFRICAN GREY +MACAW +COCKATOO +LORIKEET +COUCAL +BEE EATER +HORNBILL +HUMMINGBIRD +JACAMAR +TOUCAN +DRAKE +MERGANSER +GOOSE +BLACK SWAN +TUSKER +ECHIDNA +PLATYPUS +WALLABY +KOALA +WOMBAT +JELLYFISH +SEA ANEMONE +BRAIN CORAL +FLATWORM +NEMATODE +CONCH +SNAIL +SLUG +SEA SLUG +CHITON +CHAMBERED NAUTILUS +DUNGENESS CRAB +ROCK CRAB +FIDDLER CRAB +KING CRAB +AMERICAN LOBSTER +SPINY LOBSTER +CRAYFISH +HERMIT CRAB +ISOPOD +WHITE STORK +BLACK STORK +SPOONBILL +FLAMINGO +LITTLE BLUE HERON +AMERICAN EGRET +BITTERN +CRANE +LIMPKIN +EUROPEAN GALLINULE +AMERICAN COOT +BUSTARD +RUDDY TURNSTONE +RED-BACKED SANDPIPER +REDSHANK +DOWITCHER +OYSTERCATCHER +PELICAN +KING PENGUIN +ALBATROSS +GREY WHALE +KILLER WHALE +DUGONG +SEA LION +CHIHUAHUA +JAPANESE SPANIEL +MALTESE DOG +PEKINESE +SHIH-TZU +BLENHEIM SPANIEL +PAPILLON +TOY TERRIER +RHODESIAN RIDGEBACK +AFGHAN HOUND +BASSET +BEAGLE +BLOODHOUND +BLUETICK +COONHOUND +WALKER HOUND +ENGLISH FOXHOUND +REDBONE +BORZOI +IRISH WOLFHOUND +ITALIAN GREYHOUND +WHIPPET +IBIZAN HOUND +NORWEGIAN ELKHOUND +OTTERHOUND +SALUKI +SCOTTISH DEERHOUND +WEIMARANER +STAFFORDSHIRE BULLTERRIER +STAFFORDSHIRE TERRIER +BEDLINGTON TERRIER +BORDER TERRIER +KERRY BLUE TERRIER +IRISH TERRIER +NORFOLK TERRIER +NORWICH TERRIER +YORKSHIRE TERRIER +WIRE-HAIRED FOX TERRIER +LAKELAND TERRIER +SEALYHAM TERRIER +AIREDALE +CAIRN +AUSTRALIAN TERRIER +DANDIE DINMONT +BOSTON BULL +MINIATURE SCHNAUZER +GIANT SCHNAUZER +STANDARD SCHNAUZER +SCOTCH TERRIER +TIBETAN TERRIER +SILKY TERRIER +WHEATEN TERRIER +WHITE TERRIER +LHASA +RETRIEVER +CURLY-COATED RETRIEVER +GOLDEN RETRIEVER +LABRADOR RETRIEVER +CHESAPEAKE BAY RETRIEVER +SHORT-HAIRED POINTER +VISLA +ENGLISH SETTER +IRISH SETTER +GORDON SETTER +BRITTANY SPANIEL +CLUMBER +ENGLISH SPRINGER +WELSH SPRINGER SPANIEL +COCKER SPANIEL +SUSSEX SPANIEL +IRISH WATERSPANIEL +KUVASZ +SCHIPPERKE +GROENENDAEL +MALINOIS +BRIARD 
+KELPIE +KOMONDOR +OLD ENGLISH SHEEPDOG +SHETLAND SHEEPDOG +COLLIE +BORDER COLLIE +BOUVIER DES FLANDRES +ROTTWEILER +GERMAN SHEPHERD +DOBERMAN +MINIATURE PINSCHER +GREATER SWISS MOUNTAIN DOG +BERNESE MOUNTAIN DOG +APPENZELLER +ENTLEBUCHER +BOXER +BULL MASTIFF +TIBETAN MASTIFF +FRENCH BULLDOG +GREAT DANE +SAINT BERNARD +ESKIMO DOG +MALAMUTE +SIBERIAN HUSKY +DALMATIAN +AFFENPINSCHER +BASENJI +PUG +LEONBERG +NEWFOUNDLAND +GREAT PYRENEES +SAMOYED +POMERANIAN +CHOW +KEESHOND +BRABANCON GRIFFON +PEMBROKE +CARDIGAN +TOY POODLE +MINIATURE POODLE +STANDARD POODLE +MEXICAN HAIRLESS +TIMBER WOLF +WHITE WOLF +RED WOLF +COYOTE +DINGO +DHOLE +AFRICAN HUNTING DOG +HYENA +RED FOX +KIT FOX +ARCTIC FOX +GREY FOX +TABBY +TIGER CAT +PERSIAN CAT +SIAMESE CAT +EGYPTIAN CAT +COUGAR +LYNX +LEOPARD +SNOW LEOPARD +JAGUAR +LION +TIGER +CHEETAH +BROWN BEAR +AMERICAN BLACK BEAR +ICE BEAR +SLOTH BEAR +MONGOOSE +MEERKAT +TIGER BEETLE +LADYBUG +GROUND BEETLE +LONG-HORNED BEETLE +LEAF BEETLE +DUNG BEETLE +RHINOCEROS BEETLE +WEEVIL +FLY +BEE +ANT +GRASSHOPPER +CRICKET +WALKING STICK +COCKROACH +MANTIS +CICADA +LEAFHOPPER +LACEWING +DRAGONFLY +DAMSELFLY +ADMIRAL +RINGLET +MONARCH +CABBAGE BUTTERFLY +SULPHUR BUTTERFLY +LYCAENID +STARFISH +SEA URCHIN +SEA CUCUMBER +WOOD RABBIT +HARE +ANGORA +HAMSTER +PORCUPINE +FOX SQUIRREL +MARMOT +BEAVER +GUINEA PIG +SORREL +ZEBRA +HOG +WILD BOAR +WARTHOG +HIPPOPOTAMUS +OX +WATER BUFFALO +BISON +RAM +BIGHORN +IBEX +HARTEBEEST +IMPALA +GAZELLE +ARABIAN CAMEL +LLAMA +WEASEL +MINK +POLECAT +BLACK-FOOTED FERRET +OTTER +SKUNK +BADGER +ARMADILLO +THREE-TOED SLOTH +ORANGUTAN +GORILLA +CHIMPANZEE +GIBBON +SIAMANG +GUENON +PATAS +BABOON +MACAQUE +LANGUR +COLOBUS +PROBOSCIS MONKEY +MARMOSET +CAPUCHIN +HOWLER MONKEY +TITI +SPIDER MONKEY +SQUIRREL MONKEY +MADAGASCAR CAT +INDRI +INDIAN ELEPHANT +AFRICAN ELEPHANT +LESSER PANDA +GIANT PANDA +BARRACOUTA +EEL +COHO +ROCK BEAUTY +ANEMONE FISH +STURGEON +GAR +LIONFISH +PUFFER +ABACUS +ABAYA +ACADEMIC GOWN +ACCORDION +ACOUSTIC GUITAR +AIRCRAFT CARRIER +AIRLINER +AIRSHIP +ALTAR +AMBULANCE +AMPHIBIAN +ANALOG CLOCK +APIARY +APRON +ASHCAN +ASSAULT RIFLE +BACKPACK +BAKERY +BALANCE BEAM +BALLOON +BALLPOINT +BAND AID +BANJO +BANNISTER +BARBELL +BARBER CHAIR +BARBERSHOP +BARN +BAROMETER +BARREL +BARROW +BASEBALL +BASKETBALL +BASSINET +BASSOON +BATHING CAP +BATH TOWEL +BATHTUB +BEACH WAGON +BEACON +BEAKER +BEARSKIN +BEER BOTTLE +BEER GLASS +BELL COTE +BIB +BICYCLE-BUILT-FOR-TWO +BIKINI +BINDER +BINOCULARS +BIRDHOUSE +BOATHOUSE +BOBSLED +BOLO TIE +BONNET +BOOKCASE +BOOKSHOP +BOTTLECAP +BOW +BOW TIE +BRASS +BRASSIERE +BREAKWATER +BREASTPLATE +BROOM +BUCKET +BUCKLE +BULLETPROOF VEST +BULLET TRAIN +BUTCHER SHOP +CAB +CALDRON +CANDLE +CANNON +CANOE +CAN OPENER +CARDIGAN +CAR MIRROR +CAROUSEL +CARPENTERS KIT +CARTON +CAR WHEEL +CASH MACHINE +CASSETTE +CASSETTE PLAYER +CASTLE +CATAMARAN +CD PLAYER +CELLO +CELLULAR TELEPHONE +CHAIN +CHAINLINK FENCE +CHAIN MAIL +CHAIN SAW +CHEST +CHIFFONIER +CHIME +CHINA CABINET +CHRISTMAS STOCKING +CHURCH +CINEMA +CLEAVER +CLIFF DWELLING +CLOAK +CLOG +COCKTAIL SHAKER +COFFEE MUG +COFFEEPOT +COIL +COMBINATION LOCK +COMPUTER KEYBOARD +CONFECTIONERY +CONTAINER SHIP +CONVERTIBLE +CORKSCREW +CORNET +COWBOY BOOT +COWBOY HAT +CRADLE +CRANE +CRASH HELMET +CRATE +CRIB +CROCK POT +CROQUET BALL +CRUTCH +CUIRASS +DAM +DESK +DESKTOP COMPUTER +DIAL TELEPHONE +DIAPER +DIGITAL CLOCK +DIGITAL WATCH +DINING TABLE +DISHRAG +DISHWASHER +DISK BRAKE +DOCK +DOGSLED +DOME +DOORMAT +DRILLING PLATFORM +DRUM +DRUMSTICK +DUMBBELL +DUTCH OVEN +ELECTRIC FAN +ELECTRIC 
GUITAR +ELECTRIC LOCOMOTIVE +ENTERTAINMENT CENTER +ENVELOPE +ESPRESSO MAKER +FACE POWDER +FEATHER BOA +FILE +FIREBOAT +FIRE ENGINE +FIRE SCREEN +FLAGPOLE +FLUTE +FOLDING CHAIR +FOOTBALL HELMET +FORKLIFT +FOUNTAIN +FOUNTAIN PEN +FOUR-POSTER +FREIGHT CAR +FRENCH HORN +FRYING PAN +FUR COAT +GARBAGE TRUCK +GASMASK +GAS PUMP +GOBLET +GO-KART +GOLF BALL +GOLFCART +GONDOLA +GONG +GOWN +GRAND PIANO +GREENHOUSE +GRILLE +GROCERY STORE +GUILLOTINE +HAIR SLIDE +HAIR SPRAY +HALF TRACK +HAMMER +HAMPER +HAND BLOWER +HAND-HELD COMPUTER +HANDKERCHIEF +HARD DISC +HARMONICA +HARP +HARVESTER +HATCHET +HOLSTER +HOME THEATER +HONEYCOMB +HOOK +HOOPSKIRT +HORIZONTAL BAR +HORSE CART +HOURGLASS +IPOD +IRON +JACK-O-LANTERN +JEAN +JEEP +JERSEY +JIGSAW PUZZLE +JINRIKISHA +JOYSTICK +KIMONO +KNEE PAD +KNOT +LAB COAT +LADLE +LAMPSHADE +LAPTOP +LAWN MOWER +LENS CAP +LETTER OPENER +LIBRARY +LIFEBOAT +LIGHTER +LIMOUSINE +LINER +LIPSTICK +LOAFER +LOTION +LOUDSPEAKER +LOUPE +LUMBERMILL +MAGNETIC COMPASS +MAILBAG +MAILBOX +MAILLOT +MAILLOT +MANHOLE COVER +MARACA +MARIMBA +MASK +MATCHSTICK +MAYPOLE +MAZE +MEASURING CUP +MEDICINE CHEST +MEGALITH +MICROPHONE +MICROWAVE +MILITARY UNIFORM +MILK CAN +MINIBUS +MINISKIRT +MINIVAN +MISSILE +MITTEN +MIXING BOWL +MOBILE HOME +MODEL T +MODEM +MONASTERY +MONITOR +MOPED +MORTAR +MORTARBOARD +MOSQUE +MOSQUITO NET +MOTOR SCOOTER +MOUNTAIN BIKE +MOUNTAIN TENT +MOUSE +MOUSETRAP +MOVING VAN +MUZZLE +NAIL +NECK BRACE +NECKLACE +NIPPLE +NOTEBOOK +OBELISK +OBOE +OCARINA +ODOMETER +OIL FILTER +ORGAN +OSCILLOSCOPE +OVERSKIRT +OXCART +OXYGEN MASK +PACKET +PADDLE +PADDLEWHEEL +PADLOCK +PAINTBRUSH +PAJAMA +PALACE +PANPIPE +PAPER TOWEL +PARACHUTE +PARALLEL BARS +PARK BENCH +PARKING METER +PASSENGER CAR +PATIO +PAY-PHONE +PEDESTAL +PENCIL BOX +PENCIL SHARPENER +PERFUME +PETRI DISH +PHOTOCOPIER +PICK +PICKELHAUBE +PICKET FENCE +PICKUP +PIER +PIGGY BANK +PILL BOTTLE +PILLOW +PING-PONG BALL +PINWHEEL +PIRATE +PITCHER +PLANE +PLANETARIUM +PLASTIC BAG +PLATE RACK +PLOW +PLUNGER +POLAROID CAMERA +POLE +POLICE VAN +PONCHO +POOL TABLE +POP BOTTLE +POT +POTTERS WHEEL +POWER DRILL +PRAYER RUG +PRINTER +PRISON +PROJECTILE +PROJECTOR +PUCK +PUNCHING BAG +PURSE +QUILL +QUILT +RACER +RACKET +RADIATOR +RADIO +RADIO TELESCOPE +RAIN BARREL +RECREATIONAL VEHICLE +REEL +REFLEX CAMERA +REFRIGERATOR +REMOTE CONTROL +RESTAURANT +REVOLVER +RIFLE +ROCKING CHAIR +ROTISSERIE +RUBBER ERASER +RUGBY BALL +RULE +RUNNING SHOE +SAFE +SAFETY PIN +SALTSHAKER +SANDAL +SARONG +SAX +SCABBARD +SCALE +SCHOOL BUS +SCHOONER +SCOREBOARD +SCREEN +SCREW +SCREWDRIVER +SEAT BELT +SEWING MACHINE +SHIELD +SHOE SHOP +SHOJI +SHOPPING BASKET +SHOPPING CART +SHOVEL +SHOWER CAP +SHOWER CURTAIN +SKI +SKI MASK +SLEEPING BAG +SLIDE RULE +SLIDING DOOR +SLOT +SNORKEL +SNOWMOBILE +SNOWPLOW +SOAP DISPENSER +SOCCER BALL +SOCK +SOLAR DISH +SOMBRERO +SOUP BOWL +SPACE BAR +SPACE HEATER +SPACE SHUTTLE +SPATULA +SPEEDBOAT +SPIDER WEB +SPINDLE +SPORTS CAR +SPOTLIGHT +STAGE +STEAM LOCOMOTIVE +STEEL ARCH BRIDGE +STEEL DRUM +STETHOSCOPE +STOLE +STONE WALL +STOPWATCH +STOVE +STRAINER +STREETCAR +STRETCHER +STUDIO COUCH +STUPA +SUBMARINE +SUIT +SUNDIAL +SUNGLASS +SUNGLASSES +SUNSCREEN +SUSPENSION BRIDGE +SWAB +SWEATSHIRT +SWIMMING TRUNKS +SWING +SWITCH +SYRINGE +TABLE LAMP +TANK +TAPE PLAYER +TEAPOT +TEDDY +TELEVISION +TENNIS BALL +THATCH +THEATER CURTAIN +THIMBLE +THRESHER +THRONE +TILE ROOF +TOASTER +TOBACCO SHOP +TOILET SEAT +TORCH +TOTEM POLE +TOW TRUCK +TOYSHOP +TRACTOR +TRAILER TRUCK +TRAY +TRENCH COAT +TRICYCLE +TRIMARAN +TRIPOD +TRIUMPHAL ARCH +TROLLEYBUS +TROMBONE 
+TUB +TURNSTILE +TYPEWRITER KEYBOARD +UMBRELLA +UNICYCLE +UPRIGHT +VACUUM +VASE +VAULT +VELVET +VENDING MACHINE +VESTMENT +VIADUCT +VIOLIN +VOLLEYBALL +WAFFLE IRON +WALL CLOCK +WALLET +WARDROBE +WARPLANE +WASHBASIN +WASHER +WATER BOTTLE +WATER JUG +WATER TOWER +WHISKEY JUG +WHISTLE +WIG +WINDOW SCREEN +WINDOW SHADE +WINDSOR TIE +WINE BOTTLE +WING +WOK +WOODEN SPOON +WOOL +WORM FENCE +WRECK +YAWL +YURT +WEB SITE +COMIC BOOK +CROSSWORD PUZZLE +STREET SIGN +TRAFFIC LIGHT +BOOK JACKET +MENU +PLATE +GUACAMOLE +CONSOMME +HOT POT +TRIFLE +ICE CREAM +ICE LOLLY +FRENCH LOAF +BAGEL +PRETZEL +CHEESEBURGER +HOTDOG +MASHED POTATO +HEAD CABBAGE +BROCCOLI +CAULIFLOWER +ZUCCHINI +SPAGHETTI SQUASH +ACORN SQUASH +BUTTERNUT SQUASH +CUCUMBER +ARTICHOKE +BELL PEPPER +CARDOON +MUSHROOM +GRANNY SMITH +STRAWBERRY +ORANGE +LEMON +FIG +PINEAPPLE +BANANA +JACKFRUIT +CUSTARD APPLE +POMEGRANATE +HAY +CARBONARA +CHOCOLATE SAUCE +DOUGH +MEAT LOAF +PIZZA +POTPIE +BURRITO +RED WINE +ESPRESSO +CUP +EGGNOG +ALP +BUBBLE +CLIFF +CORAL REEF +GEYSER +LAKESIDE +PROMONTORY +SANDBAR +SEASHORE +VALLEY +VOLCANO +BALLPLAYER +GROOM +SCUBA DIVER +RAPESEED +DAISY +LADY SLIPPER +CORN +ACORN +HIP +BUCKEYE +CORAL FUNGUS +AGARIC +GYROMITRA +STINKHORN +EARTHSTAR +HEN-OF-THE-WOODS +BOLETE +EAR +TOILET TISSUE From f007255d977dc913aacff651d72a942335a37280 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Fri, 10 Mar 2023 13:10:55 -0800 Subject: [PATCH 081/216] Improve error message when BLS is used in 'initialize' or 'finalize' function (#211) * Improve error message when BLS is used in 'initialize' or 'finalize' function * Address comment --- src/infer_request.cc | 11 ++++++++--- src/pb_stub.cc | 14 ++++++++++++++ src/pb_stub.h | 7 +++++++ 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index 5620f342..52a723f3 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -373,9 +373,15 @@ InferRequest::GetResponseSender() std::shared_ptr InferRequest::Exec(const bool is_decoupled) { + // BLS should not be used in "initialize" or "finalize" function. + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + if (!stub->IsInitialized() || stub->IsFinalizing()) { + throw PythonBackendException( + "BLS is only supported during the 'execute' function."); + } + ResponseBatch* response_batch = nullptr; bool responses_is_set = false; - std::unique_ptr& stub = Stub::GetOrCreateInstance(); std::unique_ptr& shm_pool = stub->SharedMemory(); bi::managed_external_buffer::handle_t* response_handle = nullptr; @@ -529,8 +535,7 @@ InferRequest::Exec(const bool is_decoupled) for (auto& output_tensor : error_response->OutputTensors()) { if (!output_tensor->IsCPU()) { - uint64_t memory_release_id = - output_tensor->Memory()->MemoryReleaseId(); + uint64_t memory_release_id = output_tensor->Memory()->MemoryReleaseId(); output_tensor->Memory()->SetMemoryReleaseCallback( [&memory_manager_message_queue, memory_release_id]() { memory_manager_message_queue->Push(memory_release_id); diff --git a/src/pb_stub.cc b/src/pb_stub.cc index b3af6311..b718ba6c 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -84,6 +84,7 @@ Stub::Instantiate( name_ = name; health_mutex_ = nullptr; initialized_ = false; + finalizing_ = false; stub_to_parent_thread_ = false; parent_to_stub_thread_ = false; @@ -799,6 +800,7 @@ Stub::UpdateHealth() void Stub::Finalize() { + finalizing_ = true; // Call finalize if exists. 
if (initialized_ && py::hasattr(model_instance_, "finalize")) { try { @@ -1120,6 +1122,18 @@ Stub::SaveResponseIterator(std::shared_ptr response_iterator) response_iterator->Id(), response_iterator)); } +bool +Stub::IsInitialized() +{ + return initialized_; +} + +bool +Stub::IsFinalizing() +{ + return finalizing_; +} + std::unique_ptr Logger::log_instance_; std::unique_ptr& diff --git a/src/pb_stub.h b/src/pb_stub.h index b7dc83d0..c321bcce 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -256,6 +256,12 @@ class Stub { /// Add cleanup id to queue void EnqueueCleanupId(void* id); + /// Is the stub initialized + bool IsInitialized(); + + /// Is the stub in the finalize stage + bool IsFinalizing(); + private: bi::interprocess_mutex* stub_mutex_; @@ -282,6 +288,7 @@ class Stub { parent_to_stub_mq_; std::unique_ptr> memory_manager_message_queue_; bool initialized_; + bool finalizing_; static std::unique_ptr stub_instance_; std::vector> gpu_tensors_; std::queue> log_request_buffer_; From 7f5f32ebaa12bdacb70dcedd5e25da10d814063d Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Wed, 15 Mar 2023 15:51:10 -0400 Subject: [PATCH 082/216] Add request parameters to Python models (#213) * Add request parameters to Python models * Add documentation about the inference request parameters --- README.md | 12 ++++++++++++ src/infer_request.cc | 38 +++++++++++++++++++++++++++++++------- src/infer_request.h | 10 +++++++--- src/pb_stub.cc | 6 +++++- src/python_be.cc | 37 +++++++++++++++++++++++++++++++++++-- 5 files changed, 90 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 28356e39..d8367b2c 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ any C++ code. - [Known Issues](#known-issues) - [`finalize`](#finalize) - [Model Config File](#model-config-file) + - [Inference Request Parameters](#inference-request-parameters) - [Managing Python Runtime and Libraries](#managing-python-runtime-and-libraries) - [Building Custom Python Backend Stub](#building-custom-python-backend-stub) - [Creating Custom Execution Environments](#creating-custom-execution-environments) @@ -560,6 +561,17 @@ models └── config.pbtxt ``` +## Inference Request Parameters + +You can retrieve the parameters associated with an inference request +using the `inference_request.parameters()` function. This function +returns a JSON object where the keys are the keys of the parameters +object and the values are the values for the parameters field. + +You can read more about the inference request parameters in the [parameters +extension](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_parameters.md) +documentation. 
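+
+As a minimal sketch, assuming the client set a string parameter named
+`"my_key"` (a hypothetical name used only for illustration), a model's
+`execute` function could inspect the request parameters like this. The value
+returned by `parameters()` is parsed defensively in case it arrives as a
+JSON-serialized string:
+
+```python
+import json
+
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+
+    def execute(self, requests):
+        responses = []
+        for request in requests:
+            # Request parameters set by the client (empty if none were sent).
+            params = request.parameters()
+            if isinstance(params, str):
+                params = json.loads(params)
+            # "my_key" is a hypothetical parameter name used for illustration.
+            print("my_key =", params.get("my_key"), flush=True)
+            # A real model would compute and return output tensors here.
+            responses.append(pb_utils.InferenceResponse(output_tensors=[]))
+        return responses
+```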
+ ## Managing Python Runtime and Libraries Python backend shipped in the [NVIDIA GPU Cloud](https://ngc.nvidia.com/) diff --git a/src/infer_request.cc b/src/infer_request.cc index 52a723f3..6af771a0 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -27,6 +27,7 @@ #include "infer_request.h" #include + #include "pb_utils.h" #include "scoped_defer.h" #ifdef TRITON_PB_STUB @@ -40,12 +41,12 @@ InferRequest::InferRequest( const std::vector>& inputs, const std::set& requested_output_names, const std::string& model_name, const int64_t model_version, - const uint32_t flags, const int32_t timeout, + const std::string& parameters, const uint32_t flags, const int32_t timeout, const intptr_t response_factory_address, const intptr_t request_address) : request_id_(request_id), correlation_id_(correlation_id), inputs_(inputs), requested_output_names_(requested_output_names), model_name_(model_name), - model_version_(model_version), flags_(flags), timeout_(timeout), - response_factory_address_(response_factory_address), + model_version_(model_version), parameters_(parameters), flags_(flags), + timeout_(timeout), response_factory_address_(response_factory_address), request_address_(request_address) { for (auto& input : inputs) { @@ -79,6 +80,12 @@ InferRequest::Inputs() return inputs_; } +const std::string& +InferRequest::Parameters() +{ + return parameters_; +} + const std::string& InferRequest::RequestId() { @@ -160,7 +167,8 @@ InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) sizeof(bi::managed_external_buffer::handle_t)) + (Inputs().size() * sizeof(bi::managed_external_buffer::handle_t)) + PbString::ShmStructSize(ModelName()) + - PbString::ShmStructSize(RequestId())); + PbString::ShmStructSize(RequestId()) + + PbString::ShmStructSize(Parameters())); infer_request_shm_ptr_ = reinterpret_cast(infer_request_shm.data_.get()); @@ -222,10 +230,18 @@ InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) reinterpret_cast(infer_request_shm_ptr_) + request_id_offset, infer_request_shm.handle_ + request_id_offset); + size_t parameters_offset = + request_id_offset + PbString::ShmStructSize(RequestId()); + std::unique_ptr parameters_shm = PbString::Create( + Parameters(), + reinterpret_cast(infer_request_shm_ptr_) + parameters_offset, + infer_request_shm.handle_ + parameters_offset); + // Save the references to shared memory. 
infer_request_shm_ = std::move(infer_request_shm); request_id_shm_ = std::move(request_id_shm); model_name_shm_ = std::move(model_name_shm); + parameters_shm_ = std::move(parameters_shm); shm_handle_ = infer_request_shm_.handle_; requested_output_names_shm_ = std::move(requested_output_names_shm); } @@ -286,9 +302,14 @@ InferRequest::LoadFromSharedMemory( request_handle + request_id_offset, reinterpret_cast(infer_request_shm_ptr) + request_id_offset); + size_t parameters_offset = request_id_offset + request_id_shm->Size(); + std::unique_ptr parameters_shm = PbString::LoadFromSharedMemory( + request_handle + request_id_offset, + reinterpret_cast(infer_request_shm_ptr) + parameters_offset); + return std::unique_ptr(new InferRequest( infer_request_shm, request_id_shm, requested_output_names_shm, - model_name_shm, input_tensors)); + model_name_shm, input_tensors, parameters_shm)); } InferRequest::InferRequest( @@ -296,11 +317,13 @@ InferRequest::InferRequest( std::unique_ptr& request_id_shm, std::vector>& requested_output_names_shm, std::unique_ptr& model_name_shm, - std::vector>& input_tensors) + std::vector>& input_tensors, + std::unique_ptr& parameters_shm) : infer_request_shm_(std::move(infer_request_shm)), request_id_shm_(std::move(request_id_shm)), requested_output_names_shm_(std::move(requested_output_names_shm)), - model_name_shm_(std::move(model_name_shm)) + model_name_shm_(std::move(model_name_shm)), + parameters_shm_(std::move(parameters_shm)) { infer_request_shm_ptr_ = reinterpret_cast(infer_request_shm_.data_.get()); @@ -325,6 +348,7 @@ InferRequest::InferRequest( } request_id_ = request_id_shm_->String(); + parameters_ = parameters_shm_->String(); requested_output_names_ = std::move(requested_output_names); model_name_ = model_name_shm_->String(); flags_ = infer_request_shm_ptr_->flags; diff --git a/src/infer_request.h b/src/infer_request.h index dc18435a..6792e54a 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -61,12 +61,13 @@ class InferRequest { const std::vector>& inputs, const std::set& requested_output_names, const std::string& model_name, const int64_t model_version, - const uint32_t flags = 0, const int32_t timeout = 0, - const intptr_t response_factory_address = 0, + const std::string& parameters, const uint32_t flags = 0, + const int32_t timeout = 0, const intptr_t response_factory_address = 0, const intptr_t request_address = 0); const std::vector>& Inputs(); const std::string& RequestId(); + const std::string& Parameters(); uint64_t CorrelationId(); const std::string& ModelName(); int64_t ModelVersion(); @@ -116,7 +117,8 @@ class InferRequest { std::unique_ptr& request_id_shm, std::vector>& requested_output_names_shm, std::unique_ptr& model_name_shm, - std::vector>& input_tensors); + std::vector>& input_tensors, + std::unique_ptr& parameters_shm); std::string request_id_; uint64_t correlation_id_; @@ -124,6 +126,7 @@ class InferRequest { std::set requested_output_names_; std::string model_name_; int64_t model_version_; + std::string parameters_; uint32_t flags_; int32_t timeout_; intptr_t response_factory_address_; @@ -140,6 +143,7 @@ class InferRequest { bi::managed_external_buffer::handle_t* output_names_handle_shm_ptr_; bi::managed_external_buffer::handle_t* input_tensors_handle_ptr_; bi::managed_external_buffer::handle_t shm_handle_; + std::unique_ptr parameters_shm_; #ifdef TRITON_PB_STUB std::shared_ptr response_sender_; diff --git a/src/pb_stub.cc b/src/pb_stub.cc index b718ba6c..c33cc1a5 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -29,6 
+29,7 @@ #include #include #include + #include #include #include @@ -41,6 +42,7 @@ #include #include #include + #include "infer_response.h" #include "pb_error.h" #include "pb_map.h" @@ -1272,9 +1274,10 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) for (auto& requested_output_name : requested_output_names) { requested_outputs.emplace(requested_output_name); } + // FIXME: InferenceRequest parameters are not supported in BLS now. return std::make_shared( request_id, correlation_id, inputs, requested_outputs, - model_name, model_version, flags, timeout); + model_name, model_version, "" /*parameters*/, flags, timeout); }), py::arg("request_id").none(false) = "", py::arg("correlation_id").none(false) = 0, @@ -1291,6 +1294,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) .def("flags", &InferRequest::Flags) .def("set_flags", &InferRequest::SetFlags) .def("timeout", &InferRequest::Timeout) + .def("parameters", &InferRequest::Parameters) .def( "exec", [](std::shared_ptr& infer_request, diff --git a/src/python_be.cc b/src/python_be.cc index d5d9bfc1..d2332a17 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -356,6 +356,39 @@ ModelInstanceState::SaveRequestsToSharedMemory( requested_output_names.emplace(requested_output_name); } + triton::common::TritonJson::Value parameters_json( + triton::common::TritonJson::ValueType::OBJECT); + uint32_t parameter_count; + RETURN_IF_ERROR( + TRITONBACKEND_RequestParameterCount(request, ¶meter_count)); + for (size_t i = 0; i < parameter_count; i++) { + const char* name; + TRITONSERVER_ParameterType type; + const void* vvalue; + RETURN_IF_ERROR( + TRITONBACKEND_RequestParameter(request, i, &name, &type, &vvalue)); + if (type == TRITONSERVER_PARAMETER_INT) { + RETURN_IF_ERROR(parameters_json.AddInt( + name, *(reinterpret_cast(vvalue)))); + } else if (type == TRITONSERVER_PARAMETER_BOOL) { + RETURN_IF_ERROR(parameters_json.AddBool( + name, *(reinterpret_cast(vvalue)))); + } else if (type == TRITONSERVER_PARAMETER_STRING) { + std::string string = reinterpret_cast(vvalue); + RETURN_IF_ERROR(parameters_json.AddString(name, string)); + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + (std::string("Unsupported parameter type for parameter '") + name + + "'.") + .c_str()); + } + } + + triton::common::TritonJson::WriteBuffer buffer; + RETURN_IF_ERROR(parameters_json.Write(&buffer)); + const auto& parameters_string = buffer.Contents(); + // request id const char* id; RETURN_IF_ERROR(TRITONBACKEND_RequestId(request, &id)); @@ -373,13 +406,13 @@ ModelInstanceState::SaveRequestsToSharedMemory( RETURN_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request)); infer_request = std::make_unique( id, correlation_id, pb_input_tensors, requested_output_names, - model_state->Name(), model_state->Version(), flags, + model_state->Name(), model_state->Version(), parameters_string, flags, 0 /* BLS request timeout*/, reinterpret_cast(factory_ptr), reinterpret_cast(request)); } else { infer_request = std::make_unique( id, correlation_id, pb_input_tensors, requested_output_names, - model_state->Name(), model_state->Version(), flags, + model_state->Name(), model_state->Version(), parameters_string, flags, 0 /* BLS request timeout*/, 0 /* response_factory_address */, reinterpret_cast(request)); } From 62f846ee1a11fe0dc8670f20fb28fe6becf1f567 Mon Sep 17 00:00:00 2001 From: Tanmay Verma Date: Thu, 16 Mar 2023 15:28:32 -0700 Subject: [PATCH 083/216] Re-extract environment if the archive has been updated (#212) * Re-extract 
environment if the archive has been updated * Add verbose logging * Use the identical destination directory when re-extracting --- src/pb_env.cc | 60 +++++++++++++++++++++++++++++++++++++++++++++------ src/pb_env.h | 2 +- 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/src/pb_env.cc b/src/pb_env.cc index b0bc5578..2065e6db 100644 --- a/src/pb_env.cc +++ b/src/pb_env.cc @@ -157,6 +157,19 @@ FileExists(std::string& path) return stat(path.c_str(), &buffer) == 0; } +void +LastModifiedTime(const std::string& path, time_t* last_modified_time) +{ + struct stat result; + if (stat(path.c_str(), &result) == 0) { + *last_modified_time = result.st_mtime; + } else { + throw PythonBackendException(std::string( + "LastModifiedTime() failed as file \'" + path + + std::string("\' does not exists."))); + } +} + void RecursiveDirectoryDelete(const char* dir) @@ -233,10 +246,39 @@ EnvironmentManager::ExtractIfNotExtracted(std::string env_path) std::string("Failed to get the canonical path for ") + env_path + "."); } + time_t last_modified_time; + LastModifiedTime(canonical_env_path, &last_modified_time); + + bool env_extracted = false; + bool re_extraction = false; + const auto env_itr = env_map_.find(canonical_env_path); + if (env_itr != env_map_.end()) { + // Check if the environment has been modified and would + // need to be extracted again. + if (env_itr->second.second == last_modified_time) { + env_extracted = true; + } else { + // Environment file has been updated. Need to clear + // the previously extracted environment and extract + // the environment to the same destination directory. + RecursiveDirectoryDelete(env_itr->second.first.c_str()); + re_extraction = true; + } + } + // Extract only if the env has not been extracted yet. - if (env_map_.find(canonical_env_path) == env_map_.end()) { - std::string dst_env_path( - std::string(base_path_) + "/" + std::to_string(env_map_.size())); + if (!env_extracted) { + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("Extracting Python execution env ") + canonical_env_path) + .c_str()); + std::string dst_env_path; + if (re_extraction) { + dst_env_path = env_map_[canonical_env_path].first; + } else { + dst_env_path = + std::string(base_path_) + "/" + std::to_string(env_map_.size()); + } std::string canonical_env_path_str(canonical_env_path); @@ -249,12 +291,16 @@ EnvironmentManager::ExtractIfNotExtracted(std::string env_path) std::string("Failed to create environment directory for '") + dst_env_path.c_str() + "'."); } - - // Add the path to the list of environments - env_map_.insert({canonical_env_path, dst_env_path}); + if (re_extraction) { + // Just update the last modified timestamp + env_map_[canonical_env_path].second = last_modified_time; + } else { + // Add the path to the list of environments + env_map_.insert({canonical_env_path, {dst_env_path, last_modified_time}}); + } return dst_env_path; } else { - return env_map_.find(canonical_env_path)->second; + return env_map_.find(canonical_env_path)->second.first; } } diff --git a/src/pb_env.h b/src/pb_env.h index 9e2e5750..668d05ef 100644 --- a/src/pb_env.h +++ b/src/pb_env.h @@ -40,7 +40,7 @@ bool FileExists(std::string& path); // A class that manages Python environments // class EnvironmentManager { - std::map env_map_; + std::map> env_map_; char base_path_[PATH_MAX + 1]; std::mutex mutex_; From bdf75da3dfb95947245417966d7793be02e3355d Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Fri, 17 Mar 2023 15:57:49 -0700 Subject: [PATCH 084/216] Fix shared_ptr thread-safe issue for 
InferResponseComplete callback (#215) * Remove unused variables * Pass InferPayload pointer as userp instead of shared_ptr --- src/python_be.h | 2 -- src/request_executor.cc | 15 +++++++-------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/python_be.h b/src/python_be.h index 32669e2d..b1bef0a4 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -257,8 +257,6 @@ class ModelInstanceState : public BackendModelInstance { TRITONBACKEND_Model* triton_model_; std::unique_ptr model_instance_stub_; - std::vector bls_inference_responses_; - std::mutex bls_responses_mutex_; std::vector closed_requests_; std::mutex closed_requests_mutex_; diff --git a/src/request_executor.cc b/src/request_executor.cc index d3c11772..ba183cab 100644 --- a/src/request_executor.cc +++ b/src/request_executor.cc @@ -56,7 +56,7 @@ void InferResponseComplete( TRITONSERVER_InferenceResponse* response, const uint32_t flags, void* userp) { - auto p = reinterpret_cast*>(userp); + auto p = reinterpret_cast(userp); std::unique_ptr infer_response; std::vector> output_tensors; std::shared_ptr pb_error; @@ -125,7 +125,7 @@ InferResponseComplete( output_tensors.clear(); } - if (!(*p)->IsDecoupled()) { + if (!p->IsDecoupled()) { infer_response = std::make_unique( output_tensors, pb_error, true /* is_last_response */); } else { @@ -146,8 +146,7 @@ InferResponseComplete( TRITONSERVER_InferenceResponseDelete(response), "Failed to release BLS inference response."); } else if ( - (*p)->IsDecoupled() && - (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) { + p->IsDecoupled() && (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) { // An empty response may be the last reponse for decoupled models. infer_response = std::make_unique( output_tensors, pb_error, true /* is_last_response */, userp /* id */); @@ -159,10 +158,10 @@ InferResponseComplete( // Only set value to the promise with the first response. Call the callback // function to send decoupled response to the stub. 
- if ((*p)->IsPromiseSet()) { - (*p)->Callback(std::move(infer_response)); + if (p->IsPromiseSet()) { + p->Callback(std::move(infer_response)); } else { - (*p)->SetValueForPrevPromise(std::move(infer_response)); + p->SetValueForPrevPromise(std::move(infer_response)); } } @@ -352,7 +351,7 @@ RequestExecutor::Infer( THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetResponseCallback( irequest, response_allocator_, shm_pool_.get(), InferResponseComplete, - reinterpret_cast(&infer_payload))); + reinterpret_cast(infer_payload.get()))); THROW_IF_TRITON_ERROR(TRITONSERVER_ServerInferAsync( server_, irequest, nullptr /* trace */)); From 63cc43797877039fc7d0fa19dc4c3a9d84e8f48f Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Tue, 21 Mar 2023 17:18:25 -0700 Subject: [PATCH 085/216] Fix L0_backend_python timeout issue (#218) * Fix up missing mutex * Rename variable * Fix for the case where returning the first decoupled response is slower than the following responses * Address comment --- src/infer_request.cc | 6 ++--- src/pb_response_iterator.cc | 19 ++++++++++++-- src/pb_response_iterator.h | 3 ++- src/pb_stub.cc | 51 ++++++++++++++++++++++++------------- src/pb_stub.h | 6 ++--- 5 files changed, 59 insertions(+), 26 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index 6af771a0..5a71ee33 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -553,11 +553,11 @@ InferRequest::Exec(const bool is_decoupled) if (responses_is_set) { auto& memory_manager_message_queue = stub->MemoryManagerQueue(); - std::unique_ptr error_response = + std::unique_ptr return_response = InferResponse::LoadFromSharedMemory( shm_pool, *response_handle, true /* open cuda handle */); - for (auto& output_tensor : error_response->OutputTensors()) { + for (auto& output_tensor : return_response->OutputTensors()) { if (!output_tensor->IsCPU()) { uint64_t memory_release_id = output_tensor->Memory()->MemoryReleaseId(); output_tensor->Memory()->SetMemoryReleaseCallback( @@ -567,7 +567,7 @@ InferRequest::Exec(const bool is_decoupled) } } - return error_response; + return return_response; } else { auto error_response = std::make_unique( std::vector>{}, diff --git a/src/pb_response_iterator.cc b/src/pb_response_iterator.cc index f81a6200..27a6c64b 100644 --- a/src/pb_response_iterator.cc +++ b/src/pb_response_iterator.cc @@ -114,11 +114,11 @@ ResponseIterator::Iter() } void -ResponseIterator::EnqueueResponse(std::unique_ptr infer_response) +ResponseIterator::EnqueueResponse(std::shared_ptr infer_response) { { std::lock_guard lock{mu_}; - response_buffer_.push(std::move(infer_response)); + response_buffer_.push(infer_response); } cv_.notify_one(); } @@ -144,4 +144,19 @@ ResponseIterator::Clear() is_cleared_ = true; } +std::vector> +ResponseIterator::GetExistingResponses() +{ + std::vector> responses; + std::unique_lock lock{mu_}; + while (!response_buffer_.empty()) { + responses.push_back(response_buffer_.front()); + response_buffer_.pop(); + } + is_finished_ = true; + is_cleared_ = true; + + return responses; +} + }}} // namespace triton::backend::python diff --git a/src/pb_response_iterator.h b/src/pb_response_iterator.h index b11f8e45..98351369 100644 --- a/src/pb_response_iterator.h +++ b/src/pb_response_iterator.h @@ -38,9 +38,10 @@ class ResponseIterator { std::shared_ptr Next(); py::iterator Iter(); - void EnqueueResponse(std::unique_ptr infer_response); + void EnqueueResponse(std::shared_ptr infer_response); void* Id(); void Clear(); + std::vector> GetExistingResponses(); private: std::vector> 
responses_; diff --git a/src/pb_stub.cc b/src/pb_stub.cc index c33cc1a5..5fa48f39 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -918,8 +918,11 @@ Stub::ServiceStubToParentRequests() break; } else { bls_response_cleanup_buffer_.pop(); + { + std::lock_guard lock(response_iterator_map_mu_); + response_iterator_map_.erase(id); + } SendCleanupId(id); - response_iterator_map_.erase(id); } } } @@ -1093,7 +1096,11 @@ Stub::ParentToStubMQMonitor() response_iterator_map_[infer_response->Id()]->EnqueueResponse( std::move(infer_response)); } else { - LOG_INFO << "Failed to enqueue the response to its response iterator."; + auto response_iterator = + std::make_shared(std::move(infer_response)); + response_iterator_map_.insert( + std::pair>( + response_iterator->Id(), response_iterator)); } } @@ -1115,13 +1122,31 @@ Stub::ParentToStubServiceActive() return parent_to_stub_thread_; } -void -Stub::SaveResponseIterator(std::shared_ptr response_iterator) +std::shared_ptr +Stub::GetResponseIterator(std::shared_ptr infer_response) { std::lock_guard lock(response_iterator_map_mu_); - response_iterator_map_.insert( - std::pair>( - response_iterator->Id(), response_iterator)); + if (response_iterator_map_.find(infer_response->Id()) != + response_iterator_map_.end()) { + // Need to re-construct the 'ResponseIterator' and update the + // 'response_iterator_map_' to make sure the 'ResponseIterator' object has + // the correct first response. + auto response_iterator = std::make_shared(infer_response); + std::vector> existing_responses = + response_iterator_map_[infer_response->Id()]->GetExistingResponses(); + for (auto& response : existing_responses) { + response_iterator->EnqueueResponse(response); + } + + response_iterator_map_[infer_response->Id()] = response_iterator; + } else { + auto response_iterator = std::make_shared(infer_response); + response_iterator_map_.insert( + std::pair>( + response_iterator->Id(), response_iterator)); + } + + return response_iterator_map_[infer_response->Id()]; } bool @@ -1304,12 +1329,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) infer_request->Exec(decoupled); py::object response_object; if (decoupled) { - auto response_iterator = - std::make_shared(response); + auto response_iterator = stub->GetResponseIterator(response); response_object = py::cast(response_iterator); - if (response_iterator->Id() != nullptr) { - stub->SaveResponseIterator(response_iterator); - } } else { response_object = py::cast(response); } @@ -1334,12 +1355,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) infer_request->Exec(decoupled); py::object response_object; if (decoupled) { - auto response_iterator = - std::make_shared(response); + auto response_iterator = stub->GetResponseIterator(response); response_object = py::cast(response_iterator); - if (response_iterator->Id() != nullptr) { - stub->SaveResponseIterator(response_iterator); - } } else { response_object = py::cast(response); } diff --git a/src/pb_stub.h b/src/pb_stub.h index c321bcce..24d94eb6 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -246,9 +246,9 @@ class Stub { /// Thread process void ParentToStubMQMonitor(); - /// Keep track of the ResponseIterator object - void SaveResponseIterator( - std::shared_ptr response_iterator); + /// Get the ResponseIterator object associated with the infer response + std::shared_ptr GetResponseIterator( + std::shared_ptr infer_response); /// Send the id to the python backend for object cleanup void SendCleanupId(void* id); From aa685c4715d87a7360bc5494c4975d5f363d9d8d Mon 
Sep 17 00:00:00 2001 From: dyastremsky <58150256+dyastremsky@users.noreply.github.com> Date: Thu, 23 Mar 2023 07:58:59 -0700 Subject: [PATCH 086/216] Add codeql static analysis (#219) * Add codeql static analysis * Run daily jobs Sunday-Friday 6pm * Move codeql to right folder --- .github/workflows/codeql.yml | 90 ++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 .github/workflows/codeql.yml diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 00000000..a724718d --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,90 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "CodeQL" + +on: + push: + branches: [ 'main' ] + pull_request: + # The branches below must be a subset of the branches above + branches: [ 'main' ] + schedule: + - cron: '0 1 * * 1-6' + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'python' ] + # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] + # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. 
+ + # Details on CodeQL's query packs refer to: + # https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + queries: +security-and-quality + + + # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v2 + + # Command-line programs to run using the OS shell. + # See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + + # If the Autobuild fails above, remove it and uncomment the following three lines. + # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. + + # - run: | + # echo "Run, Build Application using script" + # ./location_of_script_within_repo/buildscript.sh + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 + with: + category: "/language:${{matrix.language}}" From 076e2f7d3f57d77e36f9f4c146429db087b49c71 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 27 Mar 2023 17:49:47 -0400 Subject: [PATCH 087/216] Initialize CUDA driver API before using it (#222) * Initialize CUDA driver before using it * Fix copyright --- src/pb_tensor.cc | 3 ++- src/pb_utils.cc | 21 ++++++++++++++++++++- src/pb_utils.h | 1 + 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc index ae7e9678..c4b08b7f 100644 --- a/src/pb_tensor.cc +++ b/src/pb_tensor.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -388,6 +388,7 @@ PbTensor::FromDLPack(const std::string& name, const py::capsule& dlpack_tensor) PbTensor::~PbTensor() noexcept(false) { + pb_memory_.reset(); DeleteDLPack(); } diff --git a/src/pb_utils.cc b/src/pb_utils.cc index db6f83a4..3c607dea 100644 --- a/src/pb_utils.cc +++ b/src/pb_utils.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -78,6 +78,25 @@ CUDAHandler::CUDAHandler() dlerror()); } *((void**)&cu_get_error_string_fn_) = cu_get_error_string_fn; + + void* cu_init_fn = dlsym(dl_open_handle_, "cuInit"); + if (cu_init_fn == nullptr) { + throw PythonBackendException( + std::string("Failed to dlsym 'cuInit'. Error: ") + dlerror()); + } + *((void**)&cu_init_fn_) = cu_init_fn; + + // Initialize the driver API. 
+ CUresult cuda_err = (*cu_init_fn_)(0 /* flags */); + if (cuda_err != CUDA_SUCCESS) { + const char* error_string; + (*cu_get_error_string_fn_)(cuda_err, &error_string); + throw PythonBackendException( + std::string( + "failed to get cuda pointer device attribute: " + + std::string(error_string)) + .c_str()); + } } } diff --git a/src/pb_utils.h b/src/pb_utils.h index d7e360cf..20f17795 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -225,6 +225,7 @@ class CUDAHandler { CUresult (*cu_pointer_get_attribute_fn_)( CUdeviceptr*, CUpointer_attribute, CUdeviceptr) = nullptr; CUresult (*cu_get_error_string_fn_)(CUresult, const char**) = nullptr; + CUresult (*cu_init_fn_)(unsigned int) = nullptr; CUDAHandler(); ~CUDAHandler() noexcept(false); From 3d85a2c90b41143d086d4d1d4013b2ecd3e9a61c Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Tue, 28 Mar 2023 10:05:10 -0700 Subject: [PATCH 088/216] Adding repo tag to torch.hub.load to fix compatibility issues (#224) * Adding repo tag to torch.hub.load to fix compatibility issues * Added comments to clarify tag in torch.hub.load --- examples/instance_kind/model.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/instance_kind/model.py b/examples/instance_kind/model.py index 6ebfb6bc..1db6e57b 100644 --- a/examples/instance_kind/model.py +++ b/examples/instance_kind/model.py @@ -45,7 +45,12 @@ def initialize(self, args): the default device of the framework. """ self.device = 'cuda' if args["model_instance_kind"] == "GPU" else 'cpu' - self.model = torch.hub.load("pytorch/vision", + # This example is configured to work with torch=1.13 + # and torchvision=0.14. Thus, we need to provide a proper tag `0.14.1` + # to make sure loaded Resnet50 is compatible with + # installed `torchvision`. + # Refer to README for installation instructions. 
+ self.model = torch.hub.load("pytorch/vision:v0.14.1", "resnet50", weights="IMAGENET1K_V2", skip_validation=True)\ From 4f2308672b6208b87c3ed6b2551d97d507e46cce Mon Sep 17 00:00:00 2001 From: Tanmay Verma Date: Mon, 20 Mar 2023 14:45:39 -0700 Subject: [PATCH 089/216] Link properly with dlclose and dlopen libraries --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 26221055..5be0ed8e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -240,7 +240,7 @@ target_link_libraries( dlpack Threads::Threads triton-backend-utils # from repo-backend - -ldl # dlopen + ${CMAKE_DL_LIBS} # dlopen and dlclose -lrt # shared memory triton-core-serverstub # from repo-core ZLIB::ZLIB @@ -253,6 +253,7 @@ target_link_libraries( dlpack Threads::Threads triton-backend-utils # from repo-backend + ${CMAKE_DL_LIBS} # dlopen and dlclose pybind11::embed -lrt # shared memory -larchive # libarchive From 1b6fa772c4390ea036a3d911634699656eff47b8 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Thu, 30 Mar 2023 16:54:08 -0700 Subject: [PATCH 090/216] Add healthiness check to avoid hanging during model initialization (#221) * Add healthiness check to avoid hanging during model initialization * Address comment * Fix the type of the argument --- src/python_be.cc | 42 ++------------------------------ src/python_be.h | 7 ++---- src/stub_launcher.cc | 58 +++++++++++++++++++++++++++++++++++++++++++- src/stub_launcher.h | 4 +++ 4 files changed, 65 insertions(+), 46 deletions(-) diff --git a/src/python_be.cc b/src/python_be.cc index d2332a17..fa33dbae 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -165,7 +165,7 @@ ModelInstanceState::SendMessageAndReceiveResponse( } bi::managed_external_buffer::handle_t response_message; - error = ReceiveMessageFromStub(response_message); + error = Stub()->ReceiveMessageFromStub(response_message); if (error != nullptr) { restart = true; RespondErrorToAllRequests( @@ -215,44 +215,6 @@ ModelInstanceState::SendMessageToStub( return nullptr; // success } -TRITONSERVER_Error* -ModelInstanceState::ReceiveMessageFromStub( - bi::managed_external_buffer::handle_t& message) -{ - bool success = false; - while (!success) { - uint64_t timeout_miliseconds = 1000; - { - boost::posix_time::ptime timeout = - boost::get_system_time() + - boost::posix_time::milliseconds(timeout_miliseconds); - - bi::scoped_lock lock( - *Stub()->HealthMutex(), timeout); - - // Check if lock has been acquired. - if (lock) { - Stub()->IpcControl()->stub_health = false; - } else { - // If it failed to obtain the lock, it means that the stub has been - // stuck or exited while holding the health mutex lock. 
- return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, "Failed to obtain the health mutex."); - } - } - - message = Stub()->ParentMessageQueue()->Pop( - timeout_miliseconds /* duration ms */, success); - - if (!success && !IsStubProcessAlive()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, "Stub process is not healthy."); - } - } - - return nullptr; // success -} - void ModelInstanceState::RespondErrorToAllRequests( const char* message, @@ -1251,7 +1213,7 @@ ModelInstanceState::ProcessRequests( boost::asio::post(*thread_pool_, std::move(task)); futures_.emplace_back(std::move(future)); - auto error = ReceiveMessageFromStub(response_message); + auto error = Stub()->ReceiveMessageFromStub(response_message); if (error != nullptr) { restart = true; RespondErrorToAllRequests( diff --git a/src/python_be.h b/src/python_be.h index b1bef0a4..bc9fb187 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -289,9 +289,6 @@ class ModelInstanceState : public BackendModelInstance { // Checks whether the stub process is live bool IsStubProcessAlive(); - // Get a message from the stub process - TRITONSERVER_Error* ReceiveMessageFromStub(off_t& message); - // Get a message from the stub process void SendMessageAndReceiveResponse( off_t message, off_t& response, bool& restart, @@ -365,10 +362,10 @@ class ModelInstanceState : public BackendModelInstance { // Model instance stub std::unique_ptr& Stub() { return model_instance_stub_; } - // Stop the log monitor threads + // Stop the stub_to_parent_queue_monitor thread void TerminateMonitor(); - // Start the log monitor threads + // Start the stub_to_parent_queue_monitor thread void StartMonitor(); // Send bls decoupled response to the stub process diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index d43ecee8..545c528f 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -428,8 +428,11 @@ StubLauncher::ModelInstanceStubProcess() initialize_message->Args() = initialize_map_handle; stub_message_queue_->Push(initialize_message->ShmHandle()); + bi::managed_external_buffer::handle_t message; + RETURN_IF_ERROR(ReceiveMessageFromStub(message)); + std::unique_ptr initialize_response_message = - IPCMessage::LoadFromSharedMemory(shm_pool_, parent_message_queue_->Pop()); + IPCMessage::LoadFromSharedMemory(shm_pool_, message); if (initialize_response_message->Command() != PYTHONSTUB_InitializeResponse) { return TRITONSERVER_ErrorNew( @@ -535,4 +538,57 @@ StubLauncher::KillStubProcess() stub_pid_ = 0; } +TRITONSERVER_Error* +StubLauncher::ReceiveMessageFromStub( + bi::managed_external_buffer::handle_t& message) +{ + bool success = false; + while (!success) { + uint64_t timeout_miliseconds = 1000; + { + boost::posix_time::ptime timeout = + boost::get_system_time() + + boost::posix_time::milliseconds(timeout_miliseconds); + + bi::scoped_lock lock(*health_mutex_, timeout); + + // Check if lock has been acquired. + if (lock) { + ipc_control_->stub_health = false; + } else { + // If it failed to obtain the lock, it means that the stub has been + // stuck or exited while holding the health mutex lock. 
+ return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, "Failed to obtain the health mutex."); + } + } + + message = parent_message_queue_->Pop( + timeout_miliseconds /* duration ms */, success); + + bool is_stub_alive = false; + { + boost::posix_time::ptime timeout = + boost::get_system_time() + boost::posix_time::seconds(1); + bi::scoped_lock lock(*health_mutex_, timeout); + if (lock) { + is_stub_alive = ipc_control_->stub_health; + } else { + // If It failed to obtain the lock, it means that the stub has been + // stuck or exited while holding the health mutex lock. + is_stub_alive = false; + } + } + + if (!success && !is_stub_alive) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("Stub process '") + model_instance_name_ + + "' is not healthy.") + .c_str()); + } + } + + return nullptr; // success +} }}}; // namespace triton::backend::python diff --git a/src/stub_launcher.h b/src/stub_launcher.h index 3edd6729..c3da400b 100644 --- a/src/stub_launcher.h +++ b/src/stub_launcher.h @@ -145,6 +145,10 @@ class StubLauncher { // Kill stub process void KillStubProcess(); + // Get a message from the stub process + TRITONSERVER_Error* ReceiveMessageFromStub( + bi::managed_external_buffer::handle_t& message); + private: pid_t parent_pid_; pid_t stub_pid_; From be5113804c2680f3aa79533cf704a4fd65942a75 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Wed, 5 Apr 2023 21:44:04 -0700 Subject: [PATCH 091/216] Add configuration for BLS to choose the device of output tensors (#227) * Add configuration to choose the device of output tensors * Fix up for decoupled BLS * Add documentation for preferred memory setting * Address comment * Fix up documentation * Rename to TRITONSERVER_MEMORY_* --- CMakeLists.txt | 1 + README.md | 42 +++++++++++++++++++++-------- src/infer_payload.cc | 14 ++++++++++ src/infer_payload.h | 15 +++++++++++ src/infer_request.cc | 13 +++++++-- src/infer_request.h | 8 +++++- src/pb_preferred_memory.h | 57 +++++++++++++++++++++++++++++++++++++++ src/pb_stub.cc | 33 ++++++++++++++++++++--- src/request_executor.cc | 48 +++++++++++++++++++++++++++++---- 9 files changed, 209 insertions(+), 22 deletions(-) create mode 100644 src/pb_preferred_memory.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 5be0ed8e..dcc248bf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -158,6 +158,7 @@ set( src/shm_manager.cc src/shm_manager.h src/pb_exception.h + src/pb_preferred_memory.h ) set( diff --git a/README.md b/README.md index d8367b2c..0841dd0d 100644 --- a/README.md +++ b/README.md @@ -892,12 +892,16 @@ class TritonPythonModel: inputs=[]) # `pb_utils.InferenceRequest` supports request_id, correlation_id, - # model version and timeout in addition to the arguments described above. + # model version, timeout and preferred_memory in addition to the + # arguments described above. # These arguments are optional. 
An example containing all the arguments: # inference_request = pb_utils.InferenceRequest(model_name='model_name', # requested_output_names=['REQUESTED_OUTPUT_1', 'REQUESTED_OUTPUT_2'], # inputs=[], - # request_id="1", correlation_id=4, model_version=1, flags=0, timeout=5) + # request_id="1", correlation_id=4, model_version=1, flags=0, timeout=5, + # preferred_memory=pb_utils.PreferredMemory( + # pb_utils.TRITONSERVER_MEMORY_GPU, # or pb_utils.TRITONSERVER_MEMORY_CPU + # 0)) # Execute the inference_request and wait for the response inference_response = inference_request.exec() @@ -984,13 +988,25 @@ models in both [default mode](#default-mode) and [iterator](https://docs.python.org/3/glossary.html#term-iterator) of inference responses returned by a decoupled model. If the `decoupled` parameter is set to `False`, the `exec` and `async_exec` function will return a single -response as shown in the example above. - -Besides, you can set the timeout via the parameter 'timeout' in microseconds -within the constructor of `InferenceRequest`. If the request times out, the -request will respond with an error. The default of 'timeout' is 0 which -indicates that the request has no timeout. Example below shows how to use this -feature: +response as shown in the example above. Besides, you can set the timeout via +the parameter 'timeout' in microseconds within the constructor of +`InferenceRequest`. If the request times out, the request will respond with an +error. The default of 'timeout' is 0 which indicates that the request has no +timeout. + +Additionally, starting from the 23.04 release, you have the flexibility to +select a specific device to receive output tensors from BLS calls. This +can be achieved by setting the optional `preferred_memory` parameter within the +`InferenceRequest` constructor. To do this, you can create a `PreferredMemory` +object and specify the `preferred_memory_type` as either +`TRITONSERVER_MEMORY_GPU` or `TRITONSERVER_MEMORY_CPU`, as well as the +`preferred_device_id` as an integer to indicate the memory type and device ID +on which you wish to receive output tensors. If you do not specify the +`preferred_memory` parameter, the output tensors will be allocated on the +same device where the output tensors were received from the model to which the +BLS call is made. + +Example below shows how to use this feature: ```python import triton_python_backend_utils as pb_utils @@ -1011,12 +1027,16 @@ class TritonPythonModel: inputs=[]) # `pb_utils.InferenceRequest` supports request_id, correlation_id, - # model version and timeout in addition to the arguments described above. + # model version, timeout and preferred_memory in addition to the + # arguments described above. # These arguments are optional. An example containing all the arguments: # inference_request = pb_utils.InferenceRequest(model_name='model_name', # requested_output_names=['REQUESTED_OUTPUT_1', 'REQUESTED_OUTPUT_2'], # inputs=[], - # request_id="1", correlation_id=4, model_version=1, flags=0, timeout=5) + # request_id="1", correlation_id=4, model_version=1, flags=0, timeout=5, + # preferred_memory=pb_utils.PreferredMemory( + # pb_utils.TRITONSERVER_MEMORY_GPU, # or pb_utils.TRITONSERVER_MEMORY_CPU + # 0)) # Execute the inference_request and wait for the response. 
Here we are # running a BLS request on a decoupled model, hence setting the parameter diff --git a/src/infer_payload.cc b/src/infer_payload.cc index 33feba2b..a61335a7 100644 --- a/src/infer_payload.cc +++ b/src/infer_payload.cc @@ -75,4 +75,18 @@ InferPayload::Callback(std::unique_ptr infer_response) return callback_(std::move(infer_response)); } +void +InferPayload::SetResponseAllocUserp( + const ResponseAllocatorUserp& response_alloc_userp) +{ + response_alloc_userp_ = + std::make_shared(response_alloc_userp); +} + +std::shared_ptr +InferPayload::ResponseAllocUserp() +{ + return response_alloc_userp_; +} + }}} // namespace triton::backend::python diff --git a/src/infer_payload.h b/src/infer_payload.h index eb27931e..5c0458a5 100644 --- a/src/infer_payload.h +++ b/src/infer_payload.h @@ -29,9 +29,20 @@ #include #include #include "infer_response.h" +#include "pb_preferred_memory.h" namespace triton { namespace backend { namespace python { +struct ResponseAllocatorUserp { + ResponseAllocatorUserp( + void* shm_pool, const PreferredMemory& preferred_memory) + : shm_pool(shm_pool), preferred_memory(preferred_memory) + { + } + void* shm_pool; + PreferredMemory preferred_memory; +}; + class InferPayload { public: InferPayload( @@ -44,12 +55,16 @@ class InferPayload { bool IsDecoupled(); bool IsPromiseSet(); void Callback(std::unique_ptr infer_response); + void SetResponseAllocUserp( + const ResponseAllocatorUserp& response_alloc_userp); + std::shared_ptr ResponseAllocUserp(); private: std::unique_ptr>> prev_promise_; bool is_decoupled_; bool is_promise_set_; std::function)> callback_; + std::shared_ptr response_alloc_userp_; }; }}} // namespace triton::backend::python diff --git a/src/infer_request.cc b/src/infer_request.cc index 5a71ee33..2a9799db 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -42,12 +42,13 @@ InferRequest::InferRequest( const std::set& requested_output_names, const std::string& model_name, const int64_t model_version, const std::string& parameters, const uint32_t flags, const int32_t timeout, - const intptr_t response_factory_address, const intptr_t request_address) + const intptr_t response_factory_address, const intptr_t request_address, + const PreferredMemory& preferred_memory) : request_id_(request_id), correlation_id_(correlation_id), inputs_(inputs), requested_output_names_(requested_output_names), model_name_(model_name), model_version_(model_version), parameters_(parameters), flags_(flags), timeout_(timeout), response_factory_address_(response_factory_address), - request_address_(request_address) + request_address_(request_address), preferred_memory_(preferred_memory) { for (auto& input : inputs) { if (!input) { @@ -158,6 +159,12 @@ InferRequest::IsDecoupled() return is_decoupled_; } +PreferredMemory& +InferRequest::GetPreferredMemory() +{ + return preferred_memory_; +} + void InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) { @@ -182,6 +189,7 @@ InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) infer_request_shm_ptr_->response_factory_address = response_factory_address_; infer_request_shm_ptr_->is_decoupled = is_decoupled_; infer_request_shm_ptr_->timeout = timeout_; + infer_request_shm_ptr_->preferred_memory = preferred_memory_; output_names_handle_shm_ptr_ = reinterpret_cast( @@ -358,6 +366,7 @@ InferRequest::InferRequest( response_factory_address_ = infer_request_shm_ptr_->response_factory_address; is_decoupled_ = infer_request_shm_ptr_->is_decoupled; timeout_ = infer_request_shm_ptr_->timeout; + preferred_memory_ = 
infer_request_shm_ptr_->preferred_memory; #ifdef TRITON_PB_STUB response_sender_ = std::make_shared( diff --git a/src/infer_request.h b/src/infer_request.h index 6792e54a..96b65dc0 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -29,6 +29,7 @@ #include #include #include "infer_response.h" +#include "pb_preferred_memory.h" #include "pb_tensor.h" #ifdef TRITON_PB_STUB @@ -52,6 +53,7 @@ struct InferRequestShm { intptr_t response_factory_address; bool is_decoupled; int32_t timeout; + PreferredMemory preferred_memory; }; class InferRequest { @@ -63,7 +65,9 @@ class InferRequest { const std::string& model_name, const int64_t model_version, const std::string& parameters, const uint32_t flags = 0, const int32_t timeout = 0, const intptr_t response_factory_address = 0, - const intptr_t request_address = 0); + const intptr_t request_address = 0, + const PreferredMemory& preferred_memory = + PreferredMemory(PreferredMemory::DEFAULT, 0)); const std::vector>& Inputs(); const std::string& RequestId(); @@ -78,6 +82,7 @@ class InferRequest { int32_t Timeout(); bool IsDecoupled(); void SetIsDecoupled(const bool is_decoupled); + PreferredMemory& GetPreferredMemory(); #ifdef TRITON_PB_STUB std::shared_ptr Exec(const bool is_decoupled); @@ -132,6 +137,7 @@ class InferRequest { intptr_t response_factory_address_; intptr_t request_address_; bool is_decoupled_; + PreferredMemory preferred_memory_; // Shared Memory Data Structures AllocatedSharedMemory infer_request_shm_; diff --git a/src/pb_preferred_memory.h b/src/pb_preferred_memory.h new file mode 100644 index 00000000..55f4db89 --- /dev/null +++ b/src/pb_preferred_memory.h @@ -0,0 +1,57 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
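For reference, the Python-facing constructor for this class is registered in
`pb_stub.cc` later in this patch. A minimal usage sketch based on those
bindings (the variable names here are illustrative and not part of the diff):

```python
import triton_python_backend_utils as pb_utils

# CPU-resident output tensors on device 0 (the device id argument defaults to 0).
cpu_memory = pb_utils.PreferredMemory(pb_utils.TRITONSERVER_MEMORY_CPU)

# GPU-resident output tensors on GPU 1.
gpu_memory = pb_utils.PreferredMemory(pb_utils.TRITONSERVER_MEMORY_GPU, 1)

# Omitting `preferred_memory` on pb_utils.InferenceRequest keeps the
# MemoryType::DEFAULT behavior declared above, i.e. Triton's own preference
# for the output location is used.
```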
+ +#pragma once + +namespace triton { namespace backend { namespace python { + +class PreferredMemory { + public: + enum MemoryType { GPU, CPU, DEFAULT }; + + PreferredMemory() + : preferred_memory_type_(MemoryType::DEFAULT), preferred_device_id_(0) + { + } + + PreferredMemory( + const MemoryType& preferred_memory_type, + const int64_t& preferred_device_id) + : preferred_memory_type_(preferred_memory_type), + preferred_device_id_(preferred_device_id) + { + } + + MemoryType PreferredMemoryType() { return preferred_memory_type_; } + + int64_t PreferredDeviceId() { return preferred_device_id_; } + + private: + MemoryType preferred_memory_type_; + int64_t preferred_device_id_; +}; + +}}} // namespace triton::backend::python diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 5fa48f39..f2bc4def 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -46,6 +46,7 @@ #include "infer_response.h" #include "pb_error.h" #include "pb_map.h" +#include "pb_preferred_memory.h" #include "pb_response_iterator.h" #include "pb_string.h" #include "pb_utils.h" @@ -418,6 +419,15 @@ Stub::StubSetup() c_python_backend_utils.attr("InferenceResponse")); py::setattr( python_backend_utils, "Logger", c_python_backend_utils.attr("Logger")); + py::setattr( + python_backend_utils, "PreferredMemory", + c_python_backend_utils.attr("PreferredMemory")); + py::setattr( + python_backend_utils, "TRITONSERVER_MEMORY_GPU", + c_python_backend_utils.attr("TRITONSERVER_MEMORY_GPU")); + py::setattr( + python_backend_utils, "TRITONSERVER_MEMORY_CPU", + c_python_backend_utils.attr("TRITONSERVER_MEMORY_CPU")); c_python_backend_utils.attr("shared_memory") = py::cast(shm_pool_.get()); @@ -1286,6 +1296,18 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) .def(py::init()) .def("message", &PbError::Message); + py::class_>( + module, "PreferredMemory") + .def( + py::init(), + py::arg("preferred_memory_type").none(false), + py::arg("preferred_device_id").none(false) = 0); + + py::enum_(module, "MemoryType") + .value("TRITONSERVER_MEMORY_GPU", PreferredMemory::MemoryType::GPU) + .value("TRITONSERVER_MEMORY_CPU", PreferredMemory::MemoryType::CPU) + .export_values(); + py::class_>( module, "InferenceRequest") .def( @@ -1294,7 +1316,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) const std::vector& requested_output_names, const std::string& model_name, const int64_t model_version, const uint32_t flags, - const int32_t timeout) { + const int32_t timeout, + const PreferredMemory& preferred_memory) { std::set requested_outputs; for (auto& requested_output_name : requested_output_names) { requested_outputs.emplace(requested_output_name); @@ -1302,7 +1325,9 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) // FIXME: InferenceRequest parameters are not supported in BLS now. 
return std::make_shared( request_id, correlation_id, inputs, requested_outputs, - model_name, model_version, "" /*parameters*/, flags, timeout); + model_name, model_version, "" /*parameters*/, flags, timeout, + 0 /*response_factory_address*/, 0 /*request_address*/, + preferred_memory); }), py::arg("request_id").none(false) = "", py::arg("correlation_id").none(false) = 0, @@ -1310,7 +1335,9 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::arg("requested_output_names").none(false), py::arg("model_name").none(false), py::arg("model_version").none(false) = -1, - py::arg("flags").none(false) = 0, py::arg("timeout").none(false) = 0) + py::arg("flags").none(false) = 0, py::arg("timeout").none(false) = 0, + py::arg("preferred_memory").none(false) = + PreferredMemory(PreferredMemory::DEFAULT, 0)) .def( "inputs", &InferRequest::Inputs, py::return_value_policy::reference_internal) diff --git a/src/request_executor.cc b/src/request_executor.cc index ba183cab..43556e70 100644 --- a/src/request_executor.cc +++ b/src/request_executor.cc @@ -41,6 +41,27 @@ CreateTritonErrorFromException(const PythonBackendException& pb_exception) TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); } +TRITONSERVER_Error* +MemoryTypeToTritonMemoryType( + TRITONSERVER_MemoryType* triton_memory_type, + const PreferredMemory::MemoryType& memory_type) +{ + switch (memory_type) { + case PreferredMemory::MemoryType::CPU: + *triton_memory_type = TRITONSERVER_MEMORY_CPU; + break; + case PreferredMemory::MemoryType::GPU: + *triton_memory_type = TRITONSERVER_MEMORY_GPU; + break; + + default: + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, "Unknown memory type"); + } + + return nullptr; +} + void InferRequestComplete( TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp) @@ -173,12 +194,24 @@ ResponseAlloc( void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type, int64_t* actual_memory_type_id) { + auto p = reinterpret_cast(userp); std::unique_ptr shm_pool( - reinterpret_cast(userp)); + reinterpret_cast(p->shm_pool)); ScopedDefer _([&shm_pool] { shm_pool.release(); }); - *actual_memory_type = preferred_memory_type; - *actual_memory_type_id = preferred_memory_type_id; + + if (p->preferred_memory.PreferredMemoryType() == + PreferredMemory::MemoryType::DEFAULT) { + *actual_memory_type = preferred_memory_type; + *actual_memory_type_id = preferred_memory_type_id; + } else { + TRITONSERVER_MemoryType user_preferred_memory_type; + RETURN_IF_ERROR(MemoryTypeToTritonMemoryType( + &user_preferred_memory_type, + p->preferred_memory.PreferredMemoryType())); + *actual_memory_type = user_preferred_memory_type; + *actual_memory_type_id = p->preferred_memory.PreferredDeviceId(); + } // If 'byte_size' is zero just return 'buffer' == nullptr, we don't // need to do any other book-keeping. 
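The allocator change above is what lets a user-specified `preferred_memory`
take effect end to end. As a minimal sketch (assuming a hypothetical composing
model that calls a downstream model named `my_model` with an output `OUTPUT0`;
this is not part of the diff), a BLS caller can request CPU-resident outputs
and then check where the returned tensor actually lives:

```python
import triton_python_backend_utils as pb_utils

# Inside a model.py `execute` implementation: request that output tensors
# of the BLS call be allocated in CPU memory on device 0.
infer_request = pb_utils.InferenceRequest(
    model_name="my_model",
    requested_output_names=["OUTPUT0"],
    inputs=[],  # pass a list of pb_utils.Tensor objects here
    preferred_memory=pb_utils.PreferredMemory(
        pb_utils.TRITONSERVER_MEMORY_CPU, 0))

infer_response = infer_request.exec()
if infer_response.has_error():
    raise pb_utils.TritonModelException(infer_response.error().message())

output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0")
# With TRITONSERVER_MEMORY_CPU requested, ResponseAlloc honors the user
# preference instead of Triton's own preferred memory type.
pb_utils.Logger.log_info(
    "OUTPUT0 is in CPU memory: {}".format(output0.is_cpu()))
```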
@@ -349,9 +382,14 @@ RequestExecutor::Infer( { infer_payload->SetFuture(response_future); + ResponseAllocatorUserp response_allocator_userp( + shm_pool_.get(), infer_request->GetPreferredMemory()); + infer_payload->SetResponseAllocUserp(response_allocator_userp); + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetResponseCallback( - irequest, response_allocator_, shm_pool_.get(), InferResponseComplete, - reinterpret_cast(infer_payload.get()))); + irequest, response_allocator_, + reinterpret_cast(infer_payload->ResponseAllocUserp().get()), + InferResponseComplete, reinterpret_cast(infer_payload.get()))); THROW_IF_TRITON_ERROR(TRITONSERVER_ServerInferAsync( server_, irequest, nullptr /* trace */)); From 9ca21ee8c52af716ccdc2f4b0521d3137fe5cdfb Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Thu, 6 Apr 2023 15:46:49 -0400 Subject: [PATCH 092/216] Clarify the request.parameters() return type (#228) --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0841dd0d..5a6f4952 100644 --- a/README.md +++ b/README.md @@ -565,8 +565,9 @@ models You can retrieve the parameters associated with an inference request using the `inference_request.parameters()` function. This function -returns a JSON object where the keys are the keys of the parameters -object and the values are the values for the parameters field. +returns a JSON string where the keys are the keys of the parameters +object and the values are the values for the parameters field. Note that +you need to parse this string using `json.loads` to convert it to a dictionary. You can read more about the inference request parameters in the [parameters extension](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_parameters.md) From fb1731923aa2a2d9a5255afb763b5c416d95147a Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 14 Apr 2023 16:17:15 -0700 Subject: [PATCH 093/216] Add PyTorch section with notes on determinism, naming, and PyTorch 2.0 (#232) --- README.md | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/README.md b/README.md index 5a6f4952..18fc2ce6 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,8 @@ any C++ code. - [`pb_utils.Tensor.from_dlpack() -> Tensor`](#pb_utilstensorfrom_dlpack---tensor) - [`pb_utils.Tensor.is_cpu() -> bool`](#pb_utilstensoris_cpu---bool) - [Input Tensor Device Placement](#input-tensor-device-placement) +- [Frameworks](#frameworks) + - [PyTorch](#pytorch) - [Examples](#examples) - [AddSub in NumPy](#addsub-in-numpy) - [AddSubNet in PyTorch](#addsubnet-in-pytorch) @@ -1244,6 +1246,54 @@ CPU and GPU memory. To enable this setting, you need to add this setting to the parameters: { key: "FORCE_CPU_ONLY_INPUT_TENSORS" value: {string_value:"no"}} ``` +# Frameworks + +Since Python Backend models can support most python packages, it is a common +workflow for users to use Deep Learning Frameworks like PyTorch in their +`model.py` implementation. This section will document some notes and FAQ about +this workflow. + +> **Note** +> +> Using a deep learning framework/package in a Python Backend model is +> not necessarily the same as using the corresponding Triton Backend +> implementation. For example, the +> [PyTorch Backend](https://github.com/triton-inference-server/pytorch_backend) +> is different from using a Python Backend model that uses `import torch`. 
+> If you are seeing significantly different results from a model executed by +> the framework (ex: PyTorch) compared to the Python Backend model running the +> same framework, some of the first things you should check is that the +> framework versions being used and the input/output preparation are the same. + +## PyTorch + +For a simple example of using PyTorch in a Python Backend model, see the +[AddSubNet PyTorch example](#addsubnet-in-pytorch). + +### Determinism and Reproducibility + +When running PyTorch code, you may notice slight differences in output values +across runs or across servers depending on hardware, system load, driver, etc. +For most intents and purposes, these differences aren't large enough to affect +a model's final prediction. However, to understand where these differences come +from, see this [doc](https://pytorch.org/docs/stable/notes/randomness.html). + +On Ampere devices and later, there is an optimization related to +FP32 operations called TensorFlow32 (TF32). Typically this optimization will +improve overall performance at the cost of minor precision loss, but similarly +this precision loss is acceptable for most model predictions. For more info on +TF32 in PyTorch and how to enable/disable it as needed, see +[here](https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices). + +### PyTorch 2.0 + +Currently, the +[PyTorch Backend](https://github.com/triton-inference-server/pytorch_backend) +relies on LibTorch/TorchScript (C++) which has been deprecated from +[PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/). +So, users interested in new features introduced in PyTorch 2.0 should try the +Python backend route instead. + # Examples For using the Triton Python client in these examples you need to install From 00be0b9923051fc976f9d9e7fdc94cf9c91b355f Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 18 Apr 2023 14:04:12 -0700 Subject: [PATCH 094/216] Add TF determinism section, fix TF32 typo, remove PyTorch 2.0 note (#233) --- README.md | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 18fc2ce6..fd08d75a 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ any C++ code. - [Input Tensor Device Placement](#input-tensor-device-placement) - [Frameworks](#frameworks) - [PyTorch](#pytorch) + - [TensorFlow](#tensorflow) - [Examples](#examples) - [AddSub in NumPy](#addsub-in-numpy) - [AddSubNet in PyTorch](#addsubnet-in-pytorch) @@ -1270,29 +1271,36 @@ this workflow. For a simple example of using PyTorch in a Python Backend model, see the [AddSubNet PyTorch example](#addsubnet-in-pytorch). -### Determinism and Reproducibility +### PyTorch Determinism When running PyTorch code, you may notice slight differences in output values -across runs or across servers depending on hardware, system load, driver, etc. +across runs or across servers depending on hardware, system load, driver, or even +batch size. These differences are generally related to the selection of CUDA +kernels used to execute the operations, based on the factors mentioned. + For most intents and purposes, these differences aren't large enough to affect a model's final prediction. However, to understand where these differences come from, see this [doc](https://pytorch.org/docs/stable/notes/randomness.html). On Ampere devices and later, there is an optimization related to -FP32 operations called TensorFlow32 (TF32). 
Typically this optimization will -improve overall performance at the cost of minor precision loss, but similarly -this precision loss is acceptable for most model predictions. For more info on -TF32 in PyTorch and how to enable/disable it as needed, see +FP32 operations called +[TensorFloat32 (TF32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/). +Typically this optimization will improve overall performance at the cost of +minor precision loss, but similarly this precision loss is acceptable for most +model predictions. For more info on TF32 in PyTorch and how to enable/disable +it as needed, see [here](https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices). -### PyTorch 2.0 +## TensorFlow + +### TensorFlow Determinism -Currently, the -[PyTorch Backend](https://github.com/triton-inference-server/pytorch_backend) -relies on LibTorch/TorchScript (C++) which has been deprecated from -[PyTorch 2.0](https://pytorch.org/get-started/pytorch-2.0/). -So, users interested in new features introduced in PyTorch 2.0 should try the -Python backend route instead. +Similar to the PyTorch determinism section above, TensorFlow can have slight +differences in outputs based on various factors like hardware, system +configurations, or batch sizes due to the library's internal CUDA kernel +selection process. For more information on improving the determinism of outputs +in TensorFlow, see +[here](https://www.tensorflow.org/api_docs/python/tf/config/experimental/enable_op_determinism). # Examples From ad88027900392f7f481b1c9960fc386a32677f08 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Wed, 19 Apr 2023 14:41:31 -0700 Subject: [PATCH 095/216] fix shm-size out_of_range error (#234) --- src/pb_stub.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pb_stub.cc b/src/pb_stub.cc index f2bc4def..48a7d017 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -1483,7 +1483,7 @@ main(int argc, char** argv) // Path to model.py std::string model_path = argv[1]; std::string shm_region_name = argv[2]; - int64_t shm_default_size = std::stoi(argv[3]); + int64_t shm_default_size = std::stol(argv[3]); std::vector model_path_tokens; @@ -1505,7 +1505,7 @@ main(int argc, char** argv) exit(1); } std::string model_version = model_path_tokens[model_path_tokens.size() - 2]; - int64_t shm_growth_size = std::stoi(argv[4]); + int64_t shm_growth_size = std::stol(argv[4]); std::string triton_install_path = argv[6]; std::string name = argv[8]; From cf9e32d72bfd3bfe98c9f3f707674e4e6239f92f Mon Sep 17 00:00:00 2001 From: GuanLuo <41310872+GuanLuo@users.noreply.github.com> Date: Wed, 26 Apr 2023 01:16:51 -0500 Subject: [PATCH 096/216] Update inferentia 1 setup (#236) * updated inferentia build * Format * Update Inferentia doc. Update copyright --- inferentia/README.md | 10 +++---- inferentia/scripts/setup-pre-container.sh | 6 ++-- inferentia/scripts/setup.sh | 36 +++++++---------------- 3 files changed, 17 insertions(+), 35 deletions(-) diff --git a/inferentia/README.md b/inferentia/README.md index 50d443e0..db04f180 100644 --- a/inferentia/README.md +++ b/inferentia/README.md @@ -1,5 +1,5 @@ + +# Custom Metrics Example + +In this section we demonstrate an end-to-end example for +[Custom Metrics API](../../README.md#custom-metrics) in Python backend. The +[model repository](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md) +should contain [custom_metrics](./model.py) model. 
The +[custom_metrics](./model.py) model uses +[Custom Metrics API](../../README.md#custom-metrics) to register and collect +custom metrics. + +## Deploying the Custom Metrics Models + +1. Create the model repository: + +```console +mkdir -p models/custom_metrics/1/ + +# Copy the Python models +cp examples/custom_metrics/model.py models/custom_metrics/1/model.py +cp examples/custom_metrics/config.pbtxt models/custom_metrics/config.pbtxt +``` + +2. Start the tritonserver: + +``` +tritonserver --model-repository `pwd`/models +``` + +3. Send inference requests to server: + +``` +python3 examples/custom_metrics/client.py +``` + +You should see an output similar to the output below in the client terminal: + +``` +custom_metrics example: found pattern '# HELP requests_process_latency_ns Cumulative time spent processing requests' in metrics +custom_metrics example: found pattern '# TYPE requests_process_latency_ns counter' in metrics +custom_metrics example: found pattern 'requests_process_latency_ns{model="custom_metrics",version="1"}' in metrics +PASS: custom_metrics +``` + +In the terminal that runs Triton Server, you should see an output similar to +the output below: +``` +Cumulative requests processing latency: 223406.0 +``` + +The [model.py](./model.py) model file is heavily commented with +explanations about each of the function calls. + +### Explanation of the Client Output + +The [client.py](./client.py) sends a HTTP request with url +`http://localhost:8002/metrics` to fetch the metrics from Triton server. The +client then verifies if the custom metrics added in the model file are +correctly reported. diff --git a/examples/custom_metrics/client.py b/examples/custom_metrics/client.py new file mode 100644 index 00000000..48b2e610 --- /dev/null +++ b/examples/custom_metrics/client.py @@ -0,0 +1,90 @@ +#Copyright 2023, NVIDIA CORPORATION& AFFILIATES.All rights reserved. +# +#Redistribution and use in source and binary forms, with or without +#modification, are permitted provided that the following conditions +#are met: +#* Redistributions of source code must retain the above copyright +#notice, this list of conditions and the following disclaimer. +#* Redistributions in binary form must reproduce the above copyright +#notice, this list of conditions and the following disclaimer in the +#documentation and / or other materials provided with the distribution. +#* Neither the name of NVIDIA CORPORATION nor the names of its +#contributors may be used to endorse or promote products derived +#from this software without specific prior written permission. +# +#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +#EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +#IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +#PURPOSE ARE DISCLAIMED.IN NO EVENT SHALL THE COPYRIGHT OWNER OR +#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +#EXEMPLARY, OR CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO, +#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +#OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +#OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
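The client below only checks that the expected metric lines are present in the
`/metrics` text. If you also want the numeric value of the custom counter (for
example, to compare it with the cumulative latency the server logs), a small
helper along these lines can pull it out of the Prometheus text format; this is
a sketch, not part of the shipped client:

```python
import re

def get_custom_metric_value(metrics_text):
    """Extract the value of the custom counter reported by the example model.

    `metrics_text` is the response body of GET http://localhost:8002/metrics.
    Returns the value as a float, or None if the metric has not been
    reported yet.
    """
    pattern = (r'^requests_process_latency_ns\{'
               r'model="custom_metrics",version="1"\} (\S+)$')
    match = re.search(pattern, metrics_text, re.MULTILINE)
    return float(match.group(1)) if match else None
```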
+ +from tritonclient.utils import * +import tritonclient.http as httpclient +import requests +import sys + +import numpy as np + +model_name = "custom_metrics" +shape = [4] + +def get_metrics(): + metrics_url = "http://localhost:8002/metrics" + r = requests.get(metrics_url) + r.raise_for_status() + return r.text + +with httpclient.InferenceServerClient("localhost:8000") as client: + input0_data = np.random.rand(*shape).astype(np.float32) + input1_data = np.random.rand(*shape).astype(np.float32) + inputs = [ + httpclient.InferInput("INPUT0", input0_data.shape, + np_to_triton_dtype(input0_data.dtype)), + httpclient.InferInput("INPUT1", input1_data.shape, + np_to_triton_dtype(input1_data.dtype)), + ] + + inputs[0].set_data_from_numpy(input0_data) + inputs[1].set_data_from_numpy(input1_data) + + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0"), + httpclient.InferRequestedOutput("OUTPUT1"), + ] + + response = client.infer(model_name, + inputs, + request_id=str(1), + outputs=outputs) + + output0_data = response.as_numpy("OUTPUT0") + output1_data = response.as_numpy("OUTPUT1") + + if not np.allclose(input0_data + input1_data, output0_data): + print("custom_metrics example error: incorrect sum") + sys.exit(1) + + if not np.allclose(input0_data - input1_data, output1_data): + print("custom_metrics example error: incorrect difference") + sys.exit(1) + + metrics = get_metrics() + patterns = [ + '# HELP requests_process_latency_ns Cumulative time spent processing requests', + '# TYPE requests_process_latency_ns counter', + 'requests_process_latency_ns{model="custom_metrics",version="1"}'] + for pattern in patterns: + if pattern not in metrics: + print("custom_metrics example error: missing pattern '{}' in metrics".format(pattern)) + sys.exit(1) + else: + print("custom_metrics example: found pattern '{}' in metrics".format(pattern)) + + print('PASS: custom_metrics') + sys.exit(0) diff --git a/examples/custom_metrics/config.pbtxt b/examples/custom_metrics/config.pbtxt new file mode 100644 index 00000000..a364058f --- /dev/null +++ b/examples/custom_metrics/config.pbtxt @@ -0,0 +1,65 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "custom_metrics" +backend: "python" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] + +instance_group [ + { + count: 3 + kind: KIND_CPU + } +] + diff --git a/examples/custom_metrics/model.py b/examples/custom_metrics/model.py new file mode 100644 index 00000000..c5a0a55b --- /dev/null +++ b/examples/custom_metrics/model.py @@ -0,0 +1,177 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import time + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to intialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + # Parse model_config and extract OUTPUT0 and OUTPUT1 configuration + self.model_config = model_config = json.loads(args['model_config']) + output0_config = pb_utils.get_output_config_by_name( + model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name( + model_config, "OUTPUT1") + + # Convert Triton types to numpy types + self.out0_dtype = pb_utils.triton_string_to_numpy( + output0_config['data_type']) + self.out1_dtype = pb_utils.triton_string_to_numpy( + output1_config['data_type']) + + # Create a MetricFamily object to report the latency of the model + # execution. The 'kind' parameter must be either 'COUNTER' or + # 'GAUGE'. + # If duplicate name is used, both MetricFamily objects + # will reference to the same underlying MetricFamily. If there are two + # MetricFamily objects with the same name and same kind but different + # description, the original description will be used. Note that + # Duplicate name with different kind is not allowed. + self.metric_family = pb_utils.MetricFamily( + name="requests_process_latency_ns", + description="Cumulative time spent processing requests", + kind=pb_utils.MetricFamily.COUNTER # or pb_utils.MetricFamily.GAUGE + ) + + # Create a Metric object under the MetricFamily object. The 'labels' + # is a dictionary of key-value pairs. You can create multiple Metric + # objects under the same MetricFamily object with unique labels. Empty + # labels is allowed. The 'labels' parameter is optional. If you don't + # specify the 'labels' parameter, empty labels will be used. + self.metric = self.metric_family.Metric(labels={ + "model": "custom_metrics", + "version": "1" + }) + + def execute(self, requests): + """`execute` MUST be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + # Record the start time of processing the requests + start_ns = time.time_ns() + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for request in requests: + # Get INPUT0 + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + # Get INPUT1 + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy()) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. 
+ out_tensor_0 = pb_utils.Tensor("OUTPUT0", + out_0.astype(self.out0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", + out_1.astype(self.out1_dtype)) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occured")) + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1]) + responses.append(inference_response) + + # Record the end time of processing the requests + end_ns = time.time_ns() + + # Update metric to track cumulative requests processing latency. + # There are three operations you can do with the Metric object: + # - Metric.increment(value): Increment the value of the metric by + # the given value. The type of the value is double. The 'COUNTER' + # kind does not support negative value. + # - Metric.set(value): Set the value of the metric to the given + # value. This operation is only supported in 'GAUGE' kind. The + # type of the value is double. + # - Metric.value(): Get the current value of the metric. + self.metric.increment(end_ns - start_ns) + logger = pb_utils.Logger + logger.log_info("Cumulative requests processing latency: {}".format( + self.metric.value())) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. + """ + print('Cleaning up...') diff --git a/src/ipc_message.h b/src/ipc_message.h index 4ec15290..0d843d47 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -51,7 +51,14 @@ typedef enum PYTHONSTUB_commandtype_enum { PYTHONSTUB_AutoCompleteRequest, PYTHONSTUB_AutoCompleteResponse, PYTHONSTUB_LogRequest, - PYTHONSTUB_CleanupRequest + PYTHONSTUB_CleanupRequest, + PYTHONSTUB_MetricFamilyRequestNew, + PYTHONSTUB_MetricFamilyRequestDelete, + PYTHONSTUB_MetricRequestNew, + PYTHONSTUB_MetricRequestDelete, + PYTHONSTUB_MetricRequestValue, + PYTHONSTUB_MetricRequestIncrement, + PYTHONSTUB_MetricRequestSet } PYTHONSTUB_CommandType; /// diff --git a/src/metric.cc b/src/metric.cc new file mode 100644 index 00000000..a6266dbb --- /dev/null +++ b/src/metric.cc @@ -0,0 +1,285 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "metric.h" + +#ifdef TRITON_PB_STUB +#include "pb_stub.h" +#endif + +namespace triton { namespace backend { namespace python { + +Metric::Metric(const std::string& labels, void* metric_family_address) + : labels_(labels), operation_value_(0), metric_address_(nullptr), + metric_family_address_(metric_family_address), is_cleared_(false) +{ +#ifdef TRITON_PB_STUB + SendCreateMetricRequest(); +#endif +} + +Metric::~Metric() +{ +#ifdef TRITON_PB_STUB + Clear(); +#endif +} + +void +Metric::SaveToSharedMemory(std::unique_ptr& shm_pool) +{ + AllocatedSharedMemory custom_metric_shm = + shm_pool->Construct(); + custom_metric_shm_ptr_ = custom_metric_shm.data_.get(); + + std::unique_ptr labels_shm = PbString::Create(shm_pool, labels_); + + custom_metric_shm_ptr_->operation_value = operation_value_; + custom_metric_shm_ptr_->labels_shm_handle = labels_shm->ShmHandle(); + custom_metric_shm_ptr_->metric_family_address = metric_family_address_; + custom_metric_shm_ptr_->metric_address = metric_address_; + + // Save the references to shared memory. + custom_metric_shm_ = std::move(custom_metric_shm); + labels_shm_ = std::move(labels_shm); + shm_handle_ = custom_metric_shm.handle_; +} + +std::unique_ptr +Metric::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory custom_metric_shm = + shm_pool->Load(handle); + MetricShm* custom_metric_shm_ptr = custom_metric_shm.data_.get(); + + std::unique_ptr labels_shm = PbString::LoadFromSharedMemory( + shm_pool, custom_metric_shm_ptr->labels_shm_handle); + + return std::unique_ptr(new Metric(custom_metric_shm, labels_shm)); +} + +Metric::Metric( + AllocatedSharedMemory& custom_metric_shm, + std::unique_ptr& labels_shm) + : custom_metric_shm_(std::move(custom_metric_shm)), + labels_shm_(std::move(labels_shm)) +{ + custom_metric_shm_ptr_ = custom_metric_shm_.data_.get(); + labels_ = labels_shm_->String(); + operation_value_ = custom_metric_shm_ptr_->operation_value; + metric_family_address_ = custom_metric_shm_ptr_->metric_family_address; + metric_address_ = custom_metric_shm_ptr_->metric_address; +} + +void* +Metric::MetricAddress() +{ + return metric_address_; +} + +#ifdef TRITON_PB_STUB +void +Metric::SendCreateMetricRequest() +{ + // Send the request to create the Metric to the parent process + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + CustomMetricsMessage* custom_metrics_msg = nullptr; + try { + stub->SendCustomMetricsMessage( + &custom_metrics_msg, PYTHONSTUB_MetricRequestNew, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Error when creating Metric: " + std::string(pb_exception.what())); + } + metric_address_ = custom_metrics_msg->address; +} + +void +Metric::SendIncrementRequest(const double& value) +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + operation_value_ = value; + SaveToSharedMemory(stub->ShmPool()); + 
CustomMetricsMessage* custom_metrics_msg = nullptr; + try { + stub->SendCustomMetricsMessage( + &custom_metrics_msg, PYTHONSTUB_MetricRequestIncrement, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to increment metric value: " + + std::string(pb_exception.what())); + } +} + +void +Metric::SendSetValueRequest(const double& value) +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + operation_value_ = value; + SaveToSharedMemory(stub->ShmPool()); + CustomMetricsMessage* custom_metrics_msg = nullptr; + try { + stub->SendCustomMetricsMessage( + &custom_metrics_msg, PYTHONSTUB_MetricRequestSet, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to set metric value: " + std::string(pb_exception.what())); + } +} + +double +Metric::SendGetValueRequest() +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + CustomMetricsMessage* custom_metrics_msg = nullptr; + try { + stub->SendCustomMetricsMessage( + &custom_metrics_msg, PYTHONSTUB_MetricRequestValue, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to get metric value: " + std::string(pb_exception.what())); + } + + return custom_metrics_msg->value; +} + +void +Metric::Clear() +{ + // Need to check if the metric has been cleared before as the Clear()' + // function can be called from two different locations: when the metric family + // clears the 'metric_map_' and when the 'Metric' object goes out of + // scope/being deleted. + if (!is_cleared_) { + is_cleared_ = true; + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + CustomMetricsMessage* custom_metrics_msg = nullptr; + try { + stub->SendCustomMetricsMessage( + &custom_metrics_msg, PYTHONSTUB_MetricRequestDelete, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + std::cerr << "Error when deleting Metric: " << pb_exception.what() + << "\n"; + } + } +} + +#else +void* +Metric::InitializeTritonMetric() +{ + std::vector labels_params; + ParseLabels(labels_params, labels_); + TRITONSERVER_Metric* triton_metric = nullptr; + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricNew( + &triton_metric, + reinterpret_cast(metric_family_address_), + labels_params.data(), labels_params.size())); + for (const auto label : labels_params) { + TRITONSERVER_ParameterDelete(const_cast(label)); + } + return reinterpret_cast(triton_metric); +} + +void +Metric::ParseLabels( + std::vector& labels_params, + const std::string& labels) +{ + triton::common::TritonJson::Value labels_json; + THROW_IF_TRITON_ERROR(labels_json.Parse(labels)); + + std::vector members; + labels_json.Members(&members); + for (const auto& member : members) { + std::string value; + THROW_IF_TRITON_ERROR(labels_json.MemberAsString(member.c_str(), &value)); + labels_params.emplace_back(TRITONSERVER_ParameterNew( + member.c_str(), TRITONSERVER_PARAMETER_STRING, value.c_str())); + } +} + +void +Metric::HandleMetricOperation( + CustomMetricsMessage* metrics_message_ptr, + const PYTHONSTUB_CommandType& command_type) +{ + if (command_type == PYTHONSTUB_MetricRequestValue) { + metrics_message_ptr->value = GetValue(); + } else if (command_type == PYTHONSTUB_MetricRequestIncrement) { + Increment(operation_value_); + } else if (command_type == PYTHONSTUB_MetricRequestSet) { + SetValue(operation_value_); + } else { + throw PythonBackendException("Unknown 
metric operation"); + } +} + +void +Metric::Increment(const double& value) +{ + auto triton_metric = reinterpret_cast(metric_address_); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricIncrement(triton_metric, value)); +} + +void +Metric::SetValue(const double& value) +{ + auto triton_metric = reinterpret_cast(metric_address_); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricSet(triton_metric, value)); +} + +double +Metric::GetValue() +{ + double value; + auto triton_metric = reinterpret_cast(metric_address_); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricValue(triton_metric, &value)); + return value; +} + +void +Metric::ClearTritonMetric() +{ + auto triton_metric = reinterpret_cast(metric_address_); + if (triton_metric != nullptr) { + LOG_IF_ERROR(TRITONSERVER_MetricDelete(triton_metric), "deleting metric"); + } +} + +#endif + +}}} // namespace triton::backend::python diff --git a/src/metric.h b/src/metric.h new file mode 100644 index 00000000..0c9da6db --- /dev/null +++ b/src/metric.h @@ -0,0 +1,167 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include "ipc_message.h" +#include "pb_string.h" +#include "pb_utils.h" + +#ifdef TRITON_PB_STUB +#include +namespace py = pybind11; +#else +#include "triton/core/tritonserver.h" +#endif + +namespace triton { namespace backend { namespace python { + +// The 'MetricShm' struct is utilized by the 'Metric' class for saving the +// essential data to shared memory and for loading the data from shared memory +// in order to reconstruct the 'Metric' object. +struct MetricShm { + // The shared memory handle of the labels in PbString format. + bi::managed_external_buffer::handle_t labels_shm_handle; + // The value used for incrementing or setting the metric. + double operation_value; + // The address of the TRITONSERVER_Metric object. + void* metric_address; + // The address corresponds to the TRITONSERVER_MetricFamily object that this + // metric belongs to. 
+ void* metric_family_address; +}; + +class Metric { + public: + Metric(const std::string& labels, void* metric_family_address); + + ~Metric(); + + /// Save Custom Metric object to shared memory. + /// \param shm_pool Shared memory pool to save the custom metric object. + void SaveToSharedMemory(std::unique_ptr& shm_pool); + + /// Create a Custom Metric object from shared memory. + /// \param shm_pool Shared memory pool + /// \param handle Shared memory handle of the custom metric. + /// \return Returns the custom metrics in the specified request_handle + /// location. + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); + + /// Get the address of the TRITONSERVER_Metric object. + /// \return Returns the address of the TRITONSERVER_Metric object. + void* MetricAddress(); + + /// Send the request to the parent process to delete the Metric object. + void Clear(); + +#ifdef TRITON_PB_STUB + /// Send a request to register a new 'TRITONSERVER_Metric' object to the + /// parent process. + void SendCreateMetricRequest(); + + /// Send the request to the parent process to increment the metric by the + /// specified value. + /// \param value The value to increment the metric by. + void SendIncrementRequest(const double& value); + + /// Send the request to the parent process to set the metric to the specified + /// value. + /// \param value The value to set the metric to. + void SendSetValueRequest(const double& value); + + /// Send the request to the parent process to get the value of the metric. + /// \return Returns the value of the metric. + double SendGetValueRequest(); +#else + // Initialize the TRITONSERVER_Metric object. + /// \return Returns the address of the TRITONSERVER_Metric object. + void* InitializeTritonMetric(); + + /// Parse the labels string into a vector of TRITONSERVER_Parameter. + /// \param labels_params The vector of TRITONSERVER_Parameter to store the + /// parsed labels. + /// \param labels The labels string to parse. + void ParseLabels( + std::vector& labels_params, + const std::string& labels); + + /// Handle the metric operation. + /// \param metrics_message_ptr The pointer to the CustomMetricsMessage object. + void HandleMetricOperation( + CustomMetricsMessage* metrics_message_ptr, + const PYTHONSTUB_CommandType& command_type); + + /// Use Triton C API to increment the value of the metric by the given value. + /// \param value The value to increment the metric by. + void Increment(const double& value); + + /// Use Triton C API to set the value of the metric to the given value. + /// \param value The value to set the metric to. + void SetValue(const double& value); + + /// Use Triton C API to get the value of the metric. + double GetValue(); + + /// Clear the TRITONSERVER_Metric object. + void ClearTritonMetric(); +#endif + + /// Disallow copying the custom metric object. + DISALLOW_COPY_AND_ASSIGN(Metric); + + private: + // The private constructor for creating a Metric object from shared memory. + Metric( + AllocatedSharedMemory& custom_metric_shm, + std::unique_ptr& labels_shm); + + // The labels of the metric, which is the identifier of the metric. + std::string labels_; + // The value used for incrementing or setting the metric. + double operation_value_; + // The address of the TRITONSERVER_Metric object. + void* metric_address_; + // The address corresponds to the TRITONSERVER_MetricFamily object that this + // metric belongs to. 
+ void* metric_family_address_; + // Indicates whether the metric has been cleared. It is needed as the Clear()' + // function can be called from two different locations: when the metric family + // clears the 'metric_map_' and when the 'Metric' object goes out of + // scope/being deleted. + bool is_cleared_; + + // Shared Memory Data Structures + AllocatedSharedMemory custom_metric_shm_; + MetricShm* custom_metric_shm_ptr_; + bi::managed_external_buffer::handle_t shm_handle_; + std::unique_ptr labels_shm_; +}; + +}}}; // namespace triton::backend::python diff --git a/src/metric_family.cc b/src/metric_family.cc new file mode 100644 index 00000000..195e9828 --- /dev/null +++ b/src/metric_family.cc @@ -0,0 +1,215 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
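The `Metric` and `MetricFamily` classes introduced above back the Python-facing custom metrics API that is registered through pybind11 later in this patch (see the `pb_stub.cc` changes: a `MetricFamily` constructor, a `Metric(labels=...)` factory, and `increment`/`set`/`value` methods). A minimal sketch of how a model might exercise that API is included here for reference; the family name, labels, and the empty responses are illustrative assumptions rather than part of this patch.

```python
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def initialize(self, args):
        # Create one counter family per model and a single labeled metric.
        self.family = pb_utils.MetricFamily(
            name="example_requests_processed",
            description="Number of requests processed by this model",
            kind=pb_utils.MetricFamily.COUNTER,
        )
        self.counter = self.family.Metric(labels={"model": "example"})

    def execute(self, requests):
        # Count every request; a real model would also fill output tensors.
        self.counter.increment(len(requests))
        return [pb_utils.InferenceResponse(output_tensors=[]) for _ in requests]
```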
+ +#include "metric_family.h" + +#ifdef TRITON_PB_STUB +#include "pb_stub.h" +#endif + +namespace triton { namespace backend { namespace python { + +MetricFamily::MetricFamily( + const std::string& name, const std::string& description, + const MetricKind& kind) + : name_(name), description_(description), kind_(kind), + metric_family_address_(nullptr) +{ +#ifdef TRITON_PB_STUB + SendCreateMetricFamilyRequest(); +#endif +} + +MetricFamily::~MetricFamily() +{ +#ifdef TRITON_PB_STUB + // Clear all the metrics first + { + std::lock_guard lock(metric_map_mu_); + for (auto& m : metric_map_) { + m.second->Clear(); + } + } + + // Send the request to delete the MetricFamily to the parent process + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + CustomMetricsMessage* custom_metrics_msg = nullptr; + try { + stub->SendCustomMetricsMessage( + &custom_metrics_msg, PYTHONSTUB_MetricFamilyRequestDelete, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + std::cerr << "Error when deleting MetricFamily: " << pb_exception.what() + << "\n"; + } +#endif +}; + +void +MetricFamily::SaveToSharedMemory(std::unique_ptr& shm_pool) +{ + AllocatedSharedMemory custom_metric_family_shm = + shm_pool->Construct(); + + custom_metric_family_shm_ptr_ = custom_metric_family_shm.data_.get(); + std::unique_ptr name_shm = PbString::Create(shm_pool, name_); + std::unique_ptr description_shm = + PbString::Create(shm_pool, description_); + + custom_metric_family_shm_ptr_->kind = kind_; + custom_metric_family_shm_ptr_->name_shm_handle = name_shm->ShmHandle(); + custom_metric_family_shm_ptr_->description_shm_handle = + description_shm->ShmHandle(); + custom_metric_family_shm_ptr_->metric_family_address = metric_family_address_; + + // Save the references to shared memory. 
+ custom_metric_family_shm_ = std::move(custom_metric_family_shm); + name_shm_ = std::move(name_shm); + description_shm_ = std::move(description_shm); + shm_handle_ = custom_metric_family_shm.handle_; +} + +std::unique_ptr +MetricFamily::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory custom_metric_family_shm = + shm_pool->Load(handle); + MetricFamilyShm* custom_metric_family_shm_ptr = + custom_metric_family_shm.data_.get(); + std::unique_ptr name_shm = PbString::LoadFromSharedMemory( + shm_pool, custom_metric_family_shm_ptr->name_shm_handle); + std::unique_ptr description_shm = PbString::LoadFromSharedMemory( + shm_pool, custom_metric_family_shm_ptr->description_shm_handle); + + return std::unique_ptr( + new MetricFamily(custom_metric_family_shm, name_shm, description_shm)); +} + +MetricFamily::MetricFamily( + AllocatedSharedMemory& custom_metric_family_shm, + std::unique_ptr& name_shm, + std::unique_ptr& description_shm) + : custom_metric_family_shm_(std::move(custom_metric_family_shm)), + name_shm_(std::move(name_shm)), + description_shm_(std::move(description_shm)) +{ + custom_metric_family_shm_ptr_ = custom_metric_family_shm_.data_.get(); + name_ = name_shm_->String(); + description_ = description_shm_->String(); + kind_ = custom_metric_family_shm_ptr_->kind; + metric_family_address_ = custom_metric_family_shm_ptr_->metric_family_address; +} + +void* +MetricFamily::MetricFamilyAddress() +{ + return metric_family_address_; +} + +#ifdef TRITON_PB_STUB +std::shared_ptr +MetricFamily::CreateMetricFamily( + const std::string& name, const std::string& description, + const MetricKind& kind) +{ + std::shared_ptr metric_family = + std::make_shared(name, description, kind); + metric_family->SendCreateMetricFamilyRequest(); + return metric_family; +} + +void +MetricFamily::SendCreateMetricFamilyRequest() +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + CustomMetricsMessage* custom_metrics_msg = nullptr; + try { + stub->SendCustomMetricsMessage( + &custom_metrics_msg, PYTHONSTUB_MetricFamilyRequestNew, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Error when creating MetricFamily: " + + std::string(pb_exception.what())); + } + metric_family_address_ = custom_metrics_msg->address; +} + +std::shared_ptr +MetricFamily::CreateMetric(py::dict labels) +{ + py::module json = py::module_::import("json"); + std::string labels_str = std::string(py::str(json.attr("dumps")(labels))); + auto metric = std::make_shared(labels_str, metric_family_address_); + { + std::lock_guard lock(metric_map_mu_); + metric_map_.insert({metric->MetricAddress(), metric}); + } + + return metric; +} +#else +void* +MetricFamily::InitializeTritonMetricFamily() +{ + TRITONSERVER_MetricKind triton_kind = ToTritonServerMetricKind(kind_); + TRITONSERVER_MetricFamily* triton_metric_family = nullptr; + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricFamilyNew( + &triton_metric_family, triton_kind, name_.c_str(), description_.c_str())); + return reinterpret_cast(triton_metric_family); +} + +TRITONSERVER_MetricKind +MetricFamily::ToTritonServerMetricKind(const MetricKind& kind) +{ + switch (kind) { + case COUNTER: + return TRITONSERVER_METRIC_KIND_COUNTER; + case GAUGE: + return TRITONSERVER_METRIC_KIND_GAUGE; + default: + throw PythonBackendException("Unknown metric kind"); + } +} + +void +MetricFamily::ClearTritonMetricFamily() +{ + auto metric_family = + 
reinterpret_cast(metric_family_address_); + if (metric_family != nullptr) { + LOG_IF_ERROR( + TRITONSERVER_MetricFamilyDelete(metric_family), + "deleting metric family"); + } +} +#endif + +}}} // namespace triton::backend::python diff --git a/src/metric_family.h b/src/metric_family.h new file mode 100644 index 00000000..bab71076 --- /dev/null +++ b/src/metric_family.h @@ -0,0 +1,150 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include "ipc_message.h" +#include "metric.h" +#include "pb_string.h" +#include "pb_utils.h" + +#ifdef TRITON_PB_STUB +#include +namespace py = pybind11; +#else +#include "triton/core/tritonserver.h" +#endif + +namespace triton { namespace backend { namespace python { + +// The 'MetricFamilyShm' struct is utilized by the 'MetricFamily' class for +// saving the essential data to shared memory and for loading the data from +// shared memory in order to reconstruct the 'MetricFamily' object. +struct MetricFamilyShm { + // The shared memory handle of the name in PbString format. + bi::managed_external_buffer::handle_t name_shm_handle; + // The shared memory handle of the description in PbString format. + bi::managed_external_buffer::handle_t description_shm_handle; + // The metric kind of the 'MetricFamily'. + MetricKind kind; + // The address of the 'TRITONSERVER_MetricFamily' object. + void* metric_family_address; +}; + +class MetricFamily { + public: + MetricFamily( + const std::string& name, const std::string& description, + const MetricKind& kind); + + ~MetricFamily(); + + /// Save a custom metric family to shared memory. + /// \param shm_pool Shared memory pool to save the custom metric family. + void SaveToSharedMemory(std::unique_ptr& shm_pool); + + /// Create a Custom Metric Family object from shared memory. + /// \param shm_pool Shared memory pool + /// \param handle Shared memory handle of the custom metric family. + /// \return Returns the custom metric family in the specified handle + /// location. 
+ static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); + + /// Get the address of the TRITONSERVER_MetricFamily object. + /// \return Returns the address of the TRITONSERVER_MetricFamily object. + void* MetricFamilyAddress(); + +#ifdef TRITON_PB_STUB + /// Create a metric family object and returned as a shared pointer. + /// \param name The name of the metric family. + /// \param description The description of the metric family. + /// \param kind The metric kind of the metric family. + /// \return Returns the shared pointer to the created metric family. + static std::shared_ptr CreateMetricFamily( + const std::string& name, const std::string& description, + const MetricKind& kind); + + /// Send a request to register a new 'TRITONSERVER_MetricFamily' object to the + /// parent process. + void SendCreateMetricFamilyRequest(); + + /// Create a metric from the metric family and store it in the metric map. + /// \param labels The labels of the metric. + /// \return Returns the shared pointer to the created metric. + std::shared_ptr CreateMetric(py::dict labels); +#else + /// Initialize the TRITONSERVER_MetricFamily object. + /// \return Returns the address of the TRITONSERVER_MetricFamily object. + void* InitializeTritonMetricFamily(); + + /// Helper function to convert the MetricKind enum to TRITONSERVER_MetricKind + /// \param kind The MetricKind enum to be converted. + /// \return Returns the TRITONSERVER_MetricKind enum. + TRITONSERVER_MetricKind ToTritonServerMetricKind(const MetricKind& kind); + + /// Clear the TRITONSERVER_MetricFamily object. + void ClearTritonMetricFamily(); +#endif + + /// Disallow copying the metric family object. + DISALLOW_COPY_AND_ASSIGN(MetricFamily); + + private: + // The private constructor for creating a MetricFamily object from shared + // memory. + MetricFamily( + AllocatedSharedMemory& custom_metric_family_shm, + std::unique_ptr& name_shm, + std::unique_ptr& description_shm); + + // The name of the metric family. + std::string name_; + // The description of the metric family. + std::string description_; + // The metric kind of the metric family. Currently only supports GAUGE and + // COUNTER. + MetricKind kind_; + // The address of the TRITONSERVER_MetricFamily object. + void* metric_family_address_; + + // The mutex to protect the 'metric_map_'. + std::mutex metric_map_mu_; + // Need to keep track of the metrics associated with the metric family to make + // sure the metrics are cleaned up before the metric family is deleted. 
+ std::unordered_map> metric_map_; + + // Shared Memory Data Structures + AllocatedSharedMemory custom_metric_family_shm_; + MetricFamilyShm* custom_metric_family_shm_ptr_; + bi::managed_external_buffer::handle_t shm_handle_; + std::unique_ptr name_shm_; + std::unique_ptr description_shm_; +}; + +}}}; // namespace triton::backend::python diff --git a/src/pb_stub.cc b/src/pb_stub.cc index f3a1b04e..e6cff15f 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -41,7 +41,6 @@ #include #include #include -#include "infer_response.h" #include "pb_error.h" #include "pb_map.h" #include "pb_preferred_memory.h" @@ -425,6 +424,9 @@ Stub::StubSetup() py::setattr( python_backend_utils, "TRITONSERVER_MEMORY_CPU", c_python_backend_utils.attr("TRITONSERVER_MEMORY_CPU")); + py::setattr( + python_backend_utils, "MetricFamily", + c_python_backend_utils.attr("MetricFamily")); c_python_backend_utils.attr("shared_memory") = py::cast(shm_pool_.get()); @@ -559,8 +561,10 @@ Stub::LoadGPUBuffers(std::unique_ptr& ipc_message) dst_buffers.emplace_back(std::move(dst_buffer)); } - ScopedDefer load_gpu_buffer_response( - [this] { parent_message_queue_->Push(DUMMY_MESSAGE); }); + ScopedDefer load_gpu_buffer_response([this] { + // Push a dummy message to signal the thread to terminate. + parent_message_queue_->Push(DUMMY_MESSAGE); + }); for (size_t i = 0; i < gpu_tensors_.size(); i++) { std::shared_ptr& src_buffer = gpu_tensors_[i]; @@ -879,8 +883,8 @@ Stub::TerminateStubToParentQueueMonitor() Logger::GetOrCreateInstance()->SetBackendLoggingActive(false); { std::lock_guard guard{stub_to_parent_message_mu_}; - log_request_buffer_.push(DUMMY_MESSAGE); - bls_response_cleanup_buffer_.push(DUMMY_MESSAGE); + // Push a dummy message to signal the thread to terminate. + stub_to_parent_buffer_.push(DUMMY_MESSAGE); } stub_to_parent_message_cv_.notify_one(); stub_to_parent_queue_monitor_.join(); @@ -889,11 +893,10 @@ Stub::TerminateStubToParentQueueMonitor() void Stub::EnqueueLogRequest(std::unique_ptr& log_ptr) { - { - std::lock_guard guard{stub_to_parent_message_mu_}; - log_request_buffer_.push(std::move(log_ptr)); - } - stub_to_parent_message_cv_.notify_one(); + std::unique_ptr utils_msg_payload = + std::make_unique( + PYTHONSTUB_LogRequest, reinterpret_cast(log_ptr.release())); + EnqueueUtilsMessage(std::move(utils_msg_payload)); } void @@ -901,43 +904,36 @@ Stub::ServiceStubToParentRequests() { while (stub_to_parent_thread_) { std::unique_lock guard{stub_to_parent_message_mu_}; - while (log_request_buffer_.empty() && - bls_response_cleanup_buffer_.empty()) { + while (stub_to_parent_buffer_.empty()) { stub_to_parent_message_cv_.wait(guard); } - if (!log_request_buffer_.empty()) { - // On exit, will send messages until - // DUMMY_MESSAGE is reached - std::unique_ptr log_request = - std::move(log_request_buffer_.front()); - if (log_request == DUMMY_MESSAGE) { - log_request_buffer_.pop(); - break; - } else { - log_request_buffer_.pop(); - SendLogMessage(log_request); - } - } - if (!bls_response_cleanup_buffer_.empty()) { - void* id = std::move(bls_response_cleanup_buffer_.front()); - if (id == DUMMY_MESSAGE) { - bls_response_cleanup_buffer_.pop(); - break; + // On exit, will send messages to the parent process until + // DUMMY_MESSAGE is reached + std::unique_ptr utils_msg_payload = + std::move(stub_to_parent_buffer_.front()); + if (utils_msg_payload == DUMMY_MESSAGE) { + stub_to_parent_buffer_.pop(); + break; + } else { + stub_to_parent_buffer_.pop(); + if (utils_msg_payload->command_type == PYTHONSTUB_LogRequest) { + 
SendLogMessage(utils_msg_payload); + } else if (utils_msg_payload->command_type == PYTHONSTUB_CleanupRequest) { + SendCleanupId(utils_msg_payload); } else { - bls_response_cleanup_buffer_.pop(); - { - std::lock_guard lock(response_iterator_map_mu_); - response_iterator_map_.erase(id); - } - SendCleanupId(id); + std::cerr << "Error when sending message via stub_to_parent message " + "buffer - unknown command\n"; } } } } void -Stub::SendLogMessage(std::unique_ptr& log_send_message) +Stub::SendLogMessage(std::unique_ptr& utils_msg_payload) { + std::unique_ptr log_send_message = std::unique_ptr( + reinterpret_cast(utils_msg_payload->utils_message_ptr)); + std::unique_ptr log_request_shm = PbLogShm::Create( shm_pool_, log_send_message->Filename(), log_send_message->Line(), log_send_message->Message(), log_send_message->Level()); @@ -966,8 +962,14 @@ Stub::SendLogMessage(std::unique_ptr& log_send_message) } void -Stub::SendCleanupId(void* id) +Stub::SendCleanupId(std::unique_ptr& utils_msg_payload) { + void* id = utils_msg_payload->utils_message_ptr; + { + std::lock_guard lock(response_iterator_map_mu_); + response_iterator_map_.erase(id); + } + std::unique_ptr ipc_message = IPCMessage::Create(shm_pool_, true /* inline_response */); ipc_message->Command() = PYTHONSTUB_CleanupRequest; @@ -995,11 +997,9 @@ void Stub::EnqueueCleanupId(void* id) { if (id != nullptr) { - { - std::lock_guard guard{stub_to_parent_message_mu_}; - bls_response_cleanup_buffer_.push(id); - } - stub_to_parent_message_cv_.notify_one(); + std::unique_ptr utils_msg_payload = + std::make_unique(PYTHONSTUB_CleanupRequest, id); + EnqueueUtilsMessage(std::move(utils_msg_payload)); } } @@ -1022,6 +1022,7 @@ Stub::TerminateParentToStubQueueMonitor() { if (parent_to_stub_thread_) { parent_to_stub_thread_ = false; + // Push a dummy message to signal the thread to terminate. 
parent_to_stub_mq_->Push(DUMMY_MESSAGE); parent_to_stub_queue_monitor_.join(); } @@ -1165,6 +1166,85 @@ Stub::IsFinalizing() return finalizing_; } +void +Stub::EnqueueUtilsMessage( + std::unique_ptr utils_msg_payload) +{ + { + std::lock_guard guard{stub_to_parent_message_mu_}; + stub_to_parent_buffer_.push(std::move(utils_msg_payload)); + } + stub_to_parent_message_cv_.notify_one(); +} + +void +Stub::PrepareCustomMetricsMessage( + AllocatedSharedMemory& custom_metrics_msg_shm, + CustomMetricsMessage** custom_metrics_msg) +{ + custom_metrics_msg_shm = shm_pool_->Construct(); + *custom_metrics_msg = custom_metrics_msg_shm.data_.get(); + new (&((*custom_metrics_msg)->mu)) bi::interprocess_mutex; + new (&((*custom_metrics_msg)->cv)) bi::interprocess_condition; + (*custom_metrics_msg)->waiting_on_stub = false; + (*custom_metrics_msg)->is_error_set = false; + (*custom_metrics_msg)->has_error = false; +} + +void +Stub::SendCustomMetricsMessage( + CustomMetricsMessage** custom_metrics_msg, + PYTHONSTUB_CommandType command_type, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory custom_metrics_msg_shm; + PrepareCustomMetricsMessage(custom_metrics_msg_shm, custom_metrics_msg); + + (*custom_metrics_msg)->message = handle; + + std::unique_ptr ipc_message = + IPCMessage::Create(shm_pool_, false /* inline_response */); + ipc_message->Command() = command_type; + ipc_message->Args() = custom_metrics_msg_shm.handle_; + + std::unique_lock guard{stub_to_parent_message_mu_}; + { + ScopedDefer _([&ipc_message, custom_metrics_msg] { + { + bi::scoped_lock guard{ + (*custom_metrics_msg)->mu}; + (*custom_metrics_msg)->waiting_on_stub = false; + (*custom_metrics_msg)->cv.notify_all(); + } + }); + + { + bi::scoped_lock guard{(*custom_metrics_msg)->mu}; + SendIPCUtilsMessage(ipc_message); + while (!(*custom_metrics_msg)->waiting_on_stub) { + (*custom_metrics_msg)->cv.wait(guard); + } + } + } + if ((*custom_metrics_msg)->has_error) { + if ((*custom_metrics_msg)->is_error_set) { + std::unique_ptr pb_string = PbString::LoadFromSharedMemory( + shm_pool_, (*custom_metrics_msg)->error); + std::string err_message = + std::string( + "Failed to process the custom metrics request for model '" + + name_ + "', message: ") + + pb_string->String(); + throw PythonBackendException(err_message); + } else { + std::string err_message = std::string( + "Failed to process the custom metrics request for model '" + name_ + + "'."); + throw PythonBackendException(err_message); + } + } +} + std::unique_ptr Logger::log_instance_; std::unique_ptr& @@ -1451,6 +1531,28 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) logger.def_static("log_error", &Logger::LogError, py::arg("message")); logger.def_static("log_verbose", &Logger::LogVerbose, py::arg("message")); + py::class_>(module, "Metric") + .def("increment", &Metric::SendIncrementRequest) + .def("set", &Metric::SendSetValueRequest) + .def("value", &Metric::SendGetValueRequest); + + py::enum_(module, "MetricKind") + .value("COUNTER", MetricKind::COUNTER) + .value("GAUGE", MetricKind::GAUGE) + .export_values(); + + py::class_>( + module, "MetricFamily") + .def( + py::init(&MetricFamily::CreateMetricFamily), + py::arg("name").none(false), py::arg("description").none(false), + py::arg("kind").none(false)) + .def( + "Metric", &MetricFamily::CreateMetric, + py::arg("labels").none(false) = py::dict()); + module.attr("MetricFamily").attr("COUNTER") = MetricKind::COUNTER; + module.attr("MetricFamily").attr("GAUGE") = MetricKind::GAUGE; + // This class is not part of 
the public API for Python backend. This is only // used for internal testing purposes. py::class_(module, "SharedMemory") diff --git a/src/pb_stub.h b/src/pb_stub.h index 24d94eb6..7e4f2c56 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -40,10 +40,13 @@ #include #include #include +#include #include "infer_request.h" #include "infer_response.h" #include "ipc_message.h" #include "message_queue.h" +#include "metric.h" +#include "metric_family.h" #include "pb_log.h" #include "pb_response_iterator.h" #include "pb_utils.h" @@ -152,6 +155,19 @@ class LogMessage { #define LOG_FL(FN, LN, LVL) LogMessage((char*)(FN), LN, LVL).stream() +// The payload for the stub_to_parent message queue. This struct serves as a +// wrapper for different types of messages so that they can be sent through the +// same buffer. +struct UtilsMessagePayload { + UtilsMessagePayload( + const PYTHONSTUB_CommandType& command_type, void* utils_message_ptr) + : command_type(command_type), utils_message_ptr(utils_message_ptr) + { + } + PYTHONSTUB_CommandType command_type; + void* utils_message_ptr; +}; + class Stub { public: Stub() : stub_to_parent_thread_(false), parent_to_stub_thread_(false){}; @@ -211,6 +227,9 @@ class Stub { /// Get the memory manager message queue std::unique_ptr>& MemoryManagerQueue(); + /// Get the shared memory pool + std::unique_ptr& ShmPool() { return shm_pool_; } + void ProcessResponse(InferResponse* response); void LoadGPUBuffers(std::unique_ptr& ipc_message); bool IsDecoupled(); @@ -229,7 +248,7 @@ class Stub { void ServiceStubToParentRequests(); /// Send client log to the python backend - void SendLogMessage(std::unique_ptr& log_send_message); + void SendLogMessage(std::unique_ptr& utils_msg_payload); /// Check if stub to parent message handler is running bool StubToParentServiceActive(); @@ -251,7 +270,7 @@ class Stub { std::shared_ptr infer_response); /// Send the id to the python backend for object cleanup - void SendCleanupId(void* id); + void SendCleanupId(std::unique_ptr& utils_msg_payload); /// Add cleanup id to queue void EnqueueCleanupId(void* id); @@ -262,6 +281,21 @@ class Stub { /// Is the stub in the finalize stage bool IsFinalizing(); + /// Helper function to enqueue a utils message to the stub to parent message + /// buffer + void EnqueueUtilsMessage( + std::unique_ptr utils_msg_payload); + + /// Send the custom metrics message to the python backend + void SendCustomMetricsMessage( + CustomMetricsMessage** custom_metrics_msg, + PYTHONSTUB_CommandType command_type, + bi::managed_external_buffer::handle_t handle); + + /// Helper function to prepare the custom metrics message + void PrepareCustomMetricsMessage( + AllocatedSharedMemory& custom_metrics_msg_shm, + CustomMetricsMessage** custom_metrics_msg); private: bi::interprocess_mutex* stub_mutex_; @@ -291,8 +325,7 @@ class Stub { bool finalizing_; static std::unique_ptr stub_instance_; std::vector> gpu_tensors_; - std::queue> log_request_buffer_; - std::queue bls_response_cleanup_buffer_; + std::queue> stub_to_parent_buffer_; std::thread stub_to_parent_queue_monitor_; bool stub_to_parent_thread_; std::mutex stub_to_parent_message_mu_; diff --git a/src/pb_utils.h b/src/pb_utils.h index 20f17795..36d7e3c7 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -71,12 +71,14 @@ namespace bi = boost::interprocess; } \ while (false) -#define THROW_IF_TRITON_ERROR(X) \ - do { \ - TRITONSERVER_Error* tie_err__ = (X); \ - if (tie_err__ != nullptr) { \ - throw PythonBackendException(TRITONSERVER_ErrorMessage(tie_err__)); \ - } \ +#define 
THROW_IF_TRITON_ERROR(X) \ + do { \ + TRITONSERVER_Error* tie_err__ = (X); \ + if (tie_err__ != nullptr) { \ + auto error_message = std::string(TRITONSERVER_ErrorMessage(tie_err__)); \ + TRITONSERVER_ErrorDelete(tie_err__); \ + throw PythonBackendException(error_message); \ + } \ } while (false) #define THROW_IF_CUDA_ERROR(X) \ @@ -165,6 +167,8 @@ struct ResponseBatch : SendMessageBase { enum LogLevel { INFO = 0, WARNING, ERROR, VERBOSE }; +enum MetricKind { COUNTER, GAUGE }; + struct LogSendMessage : SendMessageBase { bi::managed_external_buffer::handle_t filename; int32_t line; @@ -172,11 +176,28 @@ struct LogSendMessage : SendMessageBase { LogLevel level; }; - struct CleanupMessage : SendMessageBase { void* id; }; +struct CustomMetricsMessage : SendMessageBase { + bi::managed_external_buffer::handle_t message; + bool has_error; + bool is_error_set; + bi::managed_external_buffer::handle_t error; + // This field is specifically utilized when making the + // 'PYTHONSTUB_MetricRequestValue' request. It is used to hold the metric + // value after the Python backend calls the Triton C API to retrieve the + // metric value and pass it back to the stub process. + double value; + // This field is specifically utilized when making the + // 'PYTHONSTUB_MetricFamilyRequestNew' or 'PYTHONSTUB_MetricRequestNew' + // requests. It is used to hold the memory address of + // TRITONSERVER_MetricFamily' or 'TRITONSERVER_Metric' objects created in the + // Python backend and pass back to the stub process. + void* address; +}; + struct ResponseSenderBase { bi::interprocess_mutex mu; bi::interprocess_condition cv; diff --git a/src/python_be.cc b/src/python_be.cc index 99e8471d..5c815485 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -804,15 +804,37 @@ ModelInstanceState::StubToParentMQMonitor() std::unique_ptr message = IPCMessage::LoadFromSharedMemory(Stub()->ShmPool(), handle); - if (message->Command() == PYTHONSTUB_LogRequest) { - ProcessLogRequest(message); - } else if (message->Command() == PYTHONSTUB_CleanupRequest) { - ProcessBLSCleanupRequest(message); + switch (message->Command()) { + case PYTHONSTUB_LogRequest: { + ProcessLogRequest(message); + break; + } + case PYTHONSTUB_CleanupRequest: { + ProcessBLSCleanupRequest(message); + break; + } + case PYTHONSTUB_MetricFamilyRequestNew: + case PYTHONSTUB_MetricFamilyRequestDelete: { + ProcessMetricFamilyRequest(message); + break; + } + case PYTHONSTUB_MetricRequestNew: + case PYTHONSTUB_MetricRequestDelete: + case PYTHONSTUB_MetricRequestValue: + case PYTHONSTUB_MetricRequestIncrement: + case PYTHONSTUB_MetricRequestSet: { + ProcessMetricRequest(message); + break; + } + default: { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, "Unexpected message type received."); + break; + } } } } - void ModelInstanceState::ProcessLogRequest( const std::unique_ptr& message) @@ -886,6 +908,112 @@ ModelInstanceState::ProcessBLSCleanupRequest( } } +template +void +ModelInstanceState::ProcessCustomMetricsRequest( + const std::unique_ptr& message, + std::function&, CustomMetricsMessage*)> + request_handler) +{ + AllocatedSharedMemory metrics_message = + Stub()->ShmPool()->Load(message->Args()); + CustomMetricsMessage* metrics_message_ptr = + reinterpret_cast(metrics_message.data_.get()); + std::unique_ptr pb_error_message; + PythonBackendException pb_exception(std::string{}); + std::unique_ptr metrics_object = + T::LoadFromSharedMemory(Stub()->ShmPool(), metrics_message_ptr->message); + + ScopedDefer _([metrics_message_ptr] { + { + bi::scoped_lock 
guard{metrics_message_ptr->mu}; + metrics_message_ptr->waiting_on_stub = true; + metrics_message_ptr->cv.notify_all(); + while (metrics_message_ptr->waiting_on_stub) { + metrics_message_ptr->cv.wait(guard); + } + } + }); + + try { + request_handler(metrics_object, metrics_message_ptr); + } + catch (const PythonBackendException& exception) { + pb_exception = exception; + } + + if (pb_exception.what() != std::string{}) { + metrics_message_ptr->has_error = true; + LOG_IF_EXCEPTION( + pb_error_message = + PbString::Create(Stub()->ShmPool(), pb_exception.what())); + metrics_message_ptr->error = pb_error_message->ShmHandle(); + metrics_message_ptr->is_error_set = true; + } +} + +void +ModelInstanceState::ProcessMetricFamilyRequest( + const std::unique_ptr& message) +{ + auto command = message->Command(); + ProcessCustomMetricsRequest( + message, [this, command]( + std::unique_ptr& metric_family, + CustomMetricsMessage* metrics_message_ptr) { + switch (command) { + case PYTHONSTUB_MetricFamilyRequestNew: { + metrics_message_ptr->address = + metric_family->InitializeTritonMetricFamily(); + break; + } + case PYTHONSTUB_MetricFamilyRequestDelete: { + metric_family->ClearTritonMetricFamily(); + break; + } + default: { + throw PythonBackendException("Unknown metric family request kind"); + } + } + }); +} + +void +ModelInstanceState::ProcessMetricRequest( + const std::unique_ptr& message) +{ + auto command = message->Command(); + ProcessCustomMetricsRequest( + message, [this, command]( + std::unique_ptr& metric, + CustomMetricsMessage* metrics_message_ptr) { + try { + switch (command) { + case PYTHONSTUB_MetricRequestNew: { + metrics_message_ptr->address = metric->InitializeTritonMetric(); + break; + } + case PYTHONSTUB_MetricRequestIncrement: + case PYTHONSTUB_MetricRequestSet: + case PYTHONSTUB_MetricRequestValue: { + metric->HandleMetricOperation(metrics_message_ptr, command); + break; + } + case PYTHONSTUB_MetricRequestDelete: { + metric->ClearTritonMetric(); + break; + } + default: { + throw PythonBackendException("Unknown metric request kind"); + } + } + } + catch (const PythonBackendException& exception) { + throw exception; + } + }); +} + void ModelInstanceState::StartMonitor() { @@ -899,6 +1027,7 @@ ModelInstanceState::TerminateMonitor() { if (stub_to_parent_thread_) { stub_to_parent_thread_ = false; + // Push a dummy message to signal the thread to terminate. Stub()->StubToParentMessageQueue()->Push(DUMMY_MESSAGE); stub_to_parent_queue_monitor_.join(); } @@ -1063,7 +1192,10 @@ ModelInstanceState::ProcessRequestsDecoupled( ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_ExecuteRequest; ipc_message->Args() = request_batch.handle_; received_message_ = nullptr; - ScopedDefer _([this] { Stub()->StubMessageQueue()->Push(DUMMY_MESSAGE); }); + ScopedDefer _([this] { + // Push a dummy message to signal the thread to terminate. + Stub()->StubMessageQueue()->Push(DUMMY_MESSAGE); + }); { std::unique_lock guard{mu_}; @@ -1184,6 +1316,7 @@ ModelInstanceState::ProcessRequests( // the object stored in shared memory. NVTX_RANGE(nvtx_, "RequestExecuteFinalize " + Name()); if (!restart) + // Push a dummy message to signal the thread to terminate. Stub()->StubMessageQueue()->Push(DUMMY_MESSAGE); }); if (restart) { @@ -1532,6 +1665,7 @@ ModelInstanceState::~ModelInstanceState() if (Stub()->IsHealthy()) { if (model_state->IsDecoupled()) { futures_.clear(); + // Push a dummy message to signal the thread to terminate. 
Stub()->ParentMessageQueue()->Push(DUMMY_MESSAGE); decoupled_monitor_.join(); } diff --git a/src/python_be.h b/src/python_be.h index bc9fb187..ebcedba3 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -62,6 +62,8 @@ #include "ipc_message.h" #include "memory_manager.h" #include "message_queue.h" +#include "metric.h" +#include "metric_family.h" #include "pb_env.h" #include "pb_map.h" #include "pb_metric_reporter.h" @@ -385,5 +387,19 @@ class ModelInstanceState : public BackendModelInstance { // Process the bls decoupled cleanup request void ProcessBLSCleanupRequest(const std::unique_ptr& message); + + // Process a custom metrics request. The function 'request_handler' is invoked + // to handle the request. T should be either 'MetricFamily' or 'Metric'. + template + void ProcessCustomMetricsRequest( + const std::unique_ptr& message, + std::function&, CustomMetricsMessage*)> + request_handler); + + // Process a metric family request + void ProcessMetricFamilyRequest(const std::unique_ptr& message); + + // Process a metric request + void ProcessMetricRequest(const std::unique_ptr& message); }; }}} // namespace triton::backend::python From 6c4b81711368dd798bf66e5d1d563d563f813026 Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Mon, 8 May 2023 16:20:40 -0700 Subject: [PATCH 102/216] Update dlpack implementation for PbTensor (#223) * Update dlpack implementation for PbTensor: handle new API + bools --- CMakeLists.txt | 2 +- README.md | 9 +++ src/pb_stub.cc | 4 +- src/pb_stub_utils.cc | 14 +++-- src/pb_tensor.cc | 140 +++++++++++++++++++++++++++++++++++++++---- src/pb_tensor.h | 25 +++++++- 6 files changed, 171 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 888b7465..213b1927 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,7 +89,7 @@ FetchContent_MakeAvailable(pybind11) FetchContent_Declare( dlpack GIT_REPOSITORY "https://github.com/dmlc/dlpack" - GIT_TAG "v0.7" + GIT_TAG "v0.8" GIT_SHALLOW ON ) FetchContent_MakeAvailable(dlpack) diff --git a/README.md b/README.md index c105665c..d3a44505 100644 --- a/README.md +++ b/README.md @@ -1226,6 +1226,15 @@ class TritonPythonModel: # tensor. input0 = pb_utils.Tensor.from_dlpack("INPUT0", to_dlpack(pytorch_tensor)) ``` +Python backend allows tensors implementing +[`__dlpack__`](https://data-apis.org/array-api/2022.12/API_specification/generated/array_api.array.__dlpack__.html) +and [`__dlpack_device__`](https://data-apis.org/array-api/2022.12/API_specification/generated/array_api.array.__dlpack_device__.html) +[interface](https://dmlc.github.io/dlpack/latest/python_spec.html) +to be converted to Python backend tensors. For instance: + +```python +input0 = pb_utils.Tensor.from_dlpack("INPUT0", pytorch_tensor) +``` This method only supports contiguous Tensors that are in C-order. If the tensor is not C-order contiguous an exception will be raised. 
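As a concrete illustration of the `__dlpack__`/`__dlpack_device__` path described in the README change above, the sketch below shows a zero-copy round trip as it might appear inside a model's `execute` function (it only runs inside the Python backend stub, where `triton_python_backend_utils` is importable). PyTorch and `torch.from_dlpack` are used as an assumed DLPack-capable peer; any producer or consumer implementing the protocol should behave the same way. Note that with this change Triton BOOL tensors map to DLPack's 8-bit `kDLBool`, so boolean data can also round-trip when the peer library's DLPack version supports it.

```python
import torch
import triton_python_backend_utils as pb_utils

# Producer side: any object implementing __dlpack__/__dlpack_device__ can back
# a Python backend tensor, so no explicit to_dlpack() capsule is required.
pytorch_tensor = torch.tensor([1.0, 2.0, 3.0])
input0 = pb_utils.Tensor.from_dlpack("INPUT0", pytorch_tensor)

# Consumer side: pb_utils.Tensor itself now exposes __dlpack__ and
# __dlpack_device__, so a DLPack-aware framework can wrap it without a copy.
roundtrip = torch.from_dlpack(input0)
```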
diff --git a/src/pb_stub.cc b/src/pb_stub.cc index e6cff15f..77f75af2 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -1488,7 +1488,9 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) .def("to_dlpack", &PbTensor::ToDLPack) .def("is_cpu", &PbTensor::IsCPU) .def("shape", &PbTensor::Dims) - .def("from_dlpack", &PbTensor::FromDLPack); + .def("from_dlpack", &PbTensor::FromDLPack) + .def("__dlpack__", &PbTensor::DLPack, py::arg("stream") = py::none()) + .def("__dlpack_device__", &PbTensor::DLPackDevice); py::class_>( module, "InferenceResponse") diff --git a/src/pb_stub_utils.cc b/src/pb_stub_utils.cc index 315f74a4..70557f64 100644 --- a/src/pb_stub_utils.cc +++ b/src/pb_stub_utils.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -189,8 +189,8 @@ triton_to_dlpack_type(TRITONSERVER_DataType triton_dtype) dl_dtype.lanes = 1; switch (triton_dtype) { case TRITONSERVER_TYPE_BOOL: - dl_code = DLDataTypeCode::kDLInt; - dt_size = 1; + dl_code = DLDataTypeCode::kDLBool; + dt_size = 8; break; case TRITONSERVER_TYPE_UINT8: dl_code = DLDataTypeCode::kDLUInt; @@ -279,8 +279,6 @@ dlpack_to_triton_type(const DLDataType& data_type) return TRITONSERVER_TYPE_INT32; } else if (data_type.bits == 64) { return TRITONSERVER_TYPE_INT64; - } else if (data_type.bits == 1) { - return TRITONSERVER_TYPE_BOOL; } } @@ -296,6 +294,12 @@ dlpack_to_triton_type(const DLDataType& data_type) } } + if (data_type.code == DLDataTypeCode::kDLBool) { + if (data_type.bits == 8) { + return TRITONSERVER_TYPE_BOOL; + } + } + return TRITONSERVER_TYPE_INVALID; } }}} // namespace triton::backend::python diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc index c4b08b7f..f3cb1de3 100644 --- a/src/pb_tensor.cc +++ b/src/pb_tensor.cc @@ -231,6 +231,36 @@ PbTensor::FromNumpy(const std::string& name, py::array& numpy_array) return std::make_shared(name, numpy_array); } +DLDeviceType +PbTensor::DeviceType() +{ + DLDeviceType device_type{}; + + switch (memory_type_) { + case TRITONSERVER_MEMORY_GPU: + device_type = DLDeviceType::kDLCUDA; + break; + case TRITONSERVER_MEMORY_CPU: + device_type = DLDeviceType::kDLCPU; + break; + case TRITONSERVER_MEMORY_CPU_PINNED: + device_type = DLDeviceType::kDLCUDAHost; + break; + } + + return device_type; +} + +py::capsule +PbTensor::DLPack(const py::object& stream) +{ + // Here external tensor requests PbTensor's `__dlpack__` method to provide + // a PyCapsule. By the design of PbTensor, in a GPU case no pending work + // is scheduled to work with PbTensor's data and we can simply pass + // the capsule without a synchronization. 
+ return this->ToDLPack(); +} + py::capsule PbTensor::ToDLPack() { @@ -269,23 +299,19 @@ PbTensor::ToDLPack() tensor_handle.inc_ref(); dlpack_tensor->dl_tensor.device.device_id = memory_type_id_; + dlpack_tensor->dl_tensor.device.device_type = this->DeviceType(); dlpack_tensor->dl_tensor.dtype = triton_to_dlpack_type(dtype_); - switch (memory_type_) { - case TRITONSERVER_MEMORY_GPU: - dlpack_tensor->dl_tensor.device.device_type = DLDeviceType::kDLCUDA; - break; - case TRITONSERVER_MEMORY_CPU: - dlpack_tensor->dl_tensor.device.device_type = DLDeviceType::kDLCPU; - break; - case TRITONSERVER_MEMORY_CPU_PINNED: - dlpack_tensor->dl_tensor.device.device_type = DLDeviceType::kDLCUDAHost; - break; - } - return py::capsule( static_cast(dlpack_tensor), "dltensor", &delete_unused_dltensor); } + +std::pair +PbTensor::DLPackDevice() +{ + return std::pair(this->DeviceType(), memory_type_id_); +} + #endif // TRITON_PB_STUB void @@ -305,12 +331,100 @@ PbTensor::Memory() #ifdef TRITON_PB_STUB std::shared_ptr -PbTensor::FromDLPack(const std::string& name, const py::capsule& dlpack_tensor) +PbTensor::FromDLPack(const std::string& name, const py::object& tensor) { if (name == "") { throw PythonBackendException("Tensor name cannot be an empty string."); } + if (py::isinstance(tensor)) { + return FromDLPackCapsule(name, tensor); + } + + if (!py::hasattr(tensor, "__dlpack__") || + !py::hasattr(tensor, "__dlpack_device__")) { + throw PythonBackendException( + "Provided tensor is not supported. Tensor must be a DLPack capsule \ + or have `__dlpack__` and `__dlpack_device__` attributes"); + } + + auto capsule_device_info = + tensor.attr("__dlpack_device__")().cast>(); + if (capsule_device_info.first == DLDeviceType::kDLCUDA) { +#ifdef TRITON_ENABLE_GPU + int current_device; + cudaError_t err = cudaGetDevice(¤t_device); + if (err != cudaSuccess) { + throw PythonBackendException("Failed to get current CUDA device id."); + } + + bool overridden = (current_device != capsule_device_info.second); + err = overridden ? cudaSetDevice(capsule_device_info.second) : cudaSuccess; + if (err != cudaSuccess) { + throw PythonBackendException( + "Failed to set CUDA device to device with id " + + std::to_string(capsule_device_info.second)); + } + // In case there is a pending job on the data, where this capsule + // is pointing to, we need to wait for it before consuming. + // This is important for when data is located on different + // context (GPU) and work is done on the default stream. + // For this scenario, __dlpack__ implementation may skip + // syncronization (since the work is on the default stream) + // and we will return pointer to the data on different GPU too early + // (i.e. before pending work is done). Thus we sync on the default stream + // only in the case we switched to a different context. + err = overridden ? cudaStreamSynchronize(0) : cudaSuccess; + if (err != cudaSuccess) { + throw PythonBackendException( + "Failed to synchronize CUDA device with id " + + std::to_string( + overridden ? capsule_device_info.second : current_device)); + } + + // Array API requirements for the stream argument: + // stream = 1 the legacy default stream (in this case should + // synchronize on CUDA stream 0) + // For CPU, `stream=None` is the only accepted argument + // according to array API. For GPU, when `stream=None` producer + // must assume the legacy default stream. 
Reference: + // https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html + auto ptr_to_tensor = FromDLPackCapsule( + name, tensor.attr("__dlpack__")(py::arg("stream") = py::int_(1))); + + err = overridden ? cudaSetDevice(current_device) : cudaSuccess; + if (err != cudaSuccess) { + throw PythonBackendException( + "Failed to set CUDA device back to initial compute device " + "with id " + + std::to_string(current_device)); + } + return ptr_to_tensor; +#else + throw PythonBackendException( + "DLPack capsule passed pointer to memory allocated on GPU device, \ + when GPU is not available"); +#endif + } else if ( + capsule_device_info.first != DLDeviceType::kDLCPU && + capsule_device_info.first != DLDeviceType::kDLCUDAHost) { + throw PythonBackendException( + "DLDevice type " + std::to_string(capsule_device_info.first) + + " is not support by Python backend."); + } + + // If data is located on CPU, `stream=None` is the only accepted argument + // according to array API. For GPU, when `stream=None` producer must + // assume the legacy default stream. + // Reference: + // https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html + return FromDLPackCapsule( + name, tensor.attr("__dlpack__")(py::arg("stream") = py::none())); +} +std::shared_ptr +PbTensor::FromDLPackCapsule( + const std::string& name, const py::capsule& dlpack_tensor) +{ DLManagedTensor* dl_managed_tensor = static_cast(dlpack_tensor.get_pointer()); diff --git a/src/pb_tensor.h b/src/pb_tensor.h index 912b50a4..79adf500 100644 --- a/src/pb_tensor.h +++ b/src/pb_tensor.h @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -112,11 +112,16 @@ class PbTensor { DISALLOW_COPY_AND_ASSIGN(PbTensor); #ifdef TRITON_PB_STUB - /// Construct a Python backend tensor using a DLPack - /// capsule. + /// Construct a Python backend tensor from an + /// external tensor. /// \param dlpack source dlpack tensor /// \param name name of the tensor static std::shared_ptr FromDLPack( + const std::string& name, const py::object& dlpack); + + /// Construct a Python backend tensor using a DLPack + /// capsule. + static std::shared_ptr FromDLPackCapsule( const std::string& name, const py::capsule& dlpack); /// Construct a Python backend tensor using a NumPy object. @@ -125,9 +130,23 @@ class PbTensor { static std::shared_ptr FromNumpy( const std::string& name, py::array& numpy_array); + /// Get device type in DLPack format. + DLDeviceType DeviceType(); + + /// Exports tensor for consumption by `from_dlpack()` as a DLPack capsule. + /// \param stream a Python integer representing a pointer to a stream, + /// on devices that support streams + /// \return Capsule object containing pointer to a DLPack object. + py::capsule DLPack(const py::object& stream); + /// Get a PyCapsule object containing the DLPack representation of the tensor. /// \return Capsule object containing pointer to a DLPack object. py::capsule ToDLPack(); + + /// Returns device type and device ID. + /// Meant for use within `from_dlpack()`. + /// \return a pair (device_type, device_id). 
+ std::pair DLPackDevice(); #endif /// Get the name of the tensor From 6fa88ce8c376e974a16132d34fe76826a86cb326 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Wed, 17 May 2023 13:54:19 -0700 Subject: [PATCH 103/216] Fix the lifetime of InferPayload (#241) * Add mutex for InferPayload to make sure it's thread-safe during callback * Remove reset for the promise * Address comment * Remove destructor * Fix lifetime of infer payload * Make sure the mutex is unlocked before promise.set_value * Revert "Make sure the mutex is unlocked before promise.set_value" This reverts commit 2eb5c325d1176fd2962a2fda92dda132a8a94aaa. * fix leak * Serialize all the responses in decoupled BLS * use enable_shared_from_this * Add a warning about using "GetPtr" * Remove the callback from mutex lock --------- Co-authored-by: Iman Tabrizian --- src/infer_payload.cc | 26 ++++++++++++++------------ src/infer_payload.h | 12 ++++++++---- src/request_executor.cc | 20 ++++++++------------ 3 files changed, 30 insertions(+), 28 deletions(-) diff --git a/src/infer_payload.cc b/src/infer_payload.cc index a61335a7..762201e8 100644 --- a/src/infer_payload.cc +++ b/src/infer_payload.cc @@ -33,28 +33,30 @@ InferPayload::InferPayload( std::function)> callback) : is_decoupled_(is_decoupled), is_promise_set_(false), callback_(callback) { - prev_promise_.reset(new std::promise>()); -} - -InferPayload::~InferPayload() -{ - prev_promise_.reset(); + promise_.reset(new std::promise>()); } void -InferPayload::SetValueForPrevPromise( - std::unique_ptr infer_response) +InferPayload::SetValue(std::unique_ptr infer_response) { - prev_promise_->set_value(std::move(infer_response)); - prev_promise_.reset(); - is_promise_set_ = true; + { + // Only set value to the promise with the first response. Call the callback + // function to send decoupled response to the stub. + std::lock_guard lock(mutex_); + if (!is_promise_set_) { + is_promise_set_ = true; + promise_->set_value(std::move(infer_response)); + return; + } + } + Callback(std::move(infer_response)); } void InferPayload::SetFuture( std::future>& response_future) { - response_future = prev_promise_->get_future(); + response_future = promise_->get_future(); } bool diff --git a/src/infer_payload.h b/src/infer_payload.h index 5c0458a5..2002d0be 100644 --- a/src/infer_payload.h +++ b/src/infer_payload.h @@ -43,14 +43,17 @@ struct ResponseAllocatorUserp { PreferredMemory preferred_memory; }; -class InferPayload { +class InferPayload : public std::enable_shared_from_this { public: InferPayload( const bool is_decouple, std::function)> callback); - ~InferPayload(); - void SetValueForPrevPromise(std::unique_ptr infer_response); + /// GetPtr should be only called when the InferPayload object is constructed + /// using a shared pointer. Calling this function in any other circumstance + /// is undefined behaviour until C++17. 
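+  ///
+  /// Illustrative sketch only (not part of this change): callers are expected
+  /// to create the payload through std::make_shared so that the Triton
+  /// completion callback can safely extend its lifetime via GetPtr().
+  ///
+  ///   auto payload = std::make_shared<InferPayload>(is_decoupled, callback);
+  ///   payload->GetPtr();   // OK: shares ownership with 'payload'
+  ///
+  ///   InferPayload local(is_decoupled, callback);
+  ///   local.GetPtr();      // undefined behaviour before C++17; throws
+  ///                        // std::bad_weak_ptr from C++17 onward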
+ std::shared_ptr GetPtr() { return shared_from_this(); } + void SetValue(std::unique_ptr infer_response); void SetFuture(std::future>& response_future); bool IsDecoupled(); bool IsPromiseSet(); @@ -60,8 +63,9 @@ class InferPayload { std::shared_ptr ResponseAllocUserp(); private: - std::unique_ptr>> prev_promise_; + std::unique_ptr>> promise_; bool is_decoupled_; + std::mutex mutex_; bool is_promise_set_; std::function)> callback_; std::shared_ptr response_alloc_userp_; diff --git a/src/request_executor.cc b/src/request_executor.cc index 43556e70..025c0deb 100644 --- a/src/request_executor.cc +++ b/src/request_executor.cc @@ -77,7 +77,8 @@ void InferResponseComplete( TRITONSERVER_InferenceResponse* response, const uint32_t flags, void* userp) { - auto p = reinterpret_cast(userp); + auto linfer_payload = reinterpret_cast(userp); + std::shared_ptr infer_payload = linfer_payload->GetPtr(); std::unique_ptr infer_response; std::vector> output_tensors; std::shared_ptr pb_error; @@ -146,7 +147,7 @@ InferResponseComplete( output_tensors.clear(); } - if (!p->IsDecoupled()) { + if (!infer_payload->IsDecoupled()) { infer_response = std::make_unique( output_tensors, pb_error, true /* is_last_response */); } else { @@ -167,7 +168,8 @@ InferResponseComplete( TRITONSERVER_InferenceResponseDelete(response), "Failed to release BLS inference response."); } else if ( - p->IsDecoupled() && (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) { + (infer_payload)->IsDecoupled() && + (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) { // An empty response may be the last reponse for decoupled models. infer_response = std::make_unique( output_tensors, pb_error, true /* is_last_response */, userp /* id */); @@ -177,13 +179,7 @@ InferResponseComplete( output_tensors, pb_error, true /* is_last_response */, userp /* id */); } - // Only set value to the promise with the first response. Call the callback - // function to send decoupled response to the stub. - if (p->IsPromiseSet()) { - p->Callback(std::move(infer_response)); - } else { - p->SetValueForPrevPromise(std::move(infer_response)); - } + infer_payload->SetValue(std::move(infer_response)); } TRITONSERVER_Error* @@ -339,8 +335,8 @@ RequestExecutor::Infer( std::string("Model ") + model_name + " is using the decoupled. The current BLS request call doesn't " "support models using the decoupled transaction policy. 
Please use " - "stream API 'stream_exec()' or 'async_stream_exec() for decoupled " - "models.'"); + "'decoupled=True' argument to the 'exec' or 'async_exec' calls for " + "decoupled models.'"); } // Inference From 091970d7246211d4bd2a761f857ddedb5915cb22 Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Fri, 19 May 2023 13:53:46 -0700 Subject: [PATCH 104/216] adding proxy CUDA streams for dlpack synchronization (#243) * adjusting cudaStreamSynchronize(0) for all cases * Adding proxy dlpack cuda streams * Cleaned up comments, moved stream destruction to Finalize * Corrected according to feedback --- src/pb_stub.cc | 41 +++++++++++++++++++++++++++++++++++++++++ src/pb_stub.h | 10 ++++++++++ src/pb_tensor.cc | 42 ++++++++++++++++++++++-------------------- 3 files changed, 73 insertions(+), 20 deletions(-) diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 77f75af2..9539a250 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -59,6 +59,10 @@ namespace py = pybind11; using namespace pybind11::literals; namespace bi = boost::interprocess; +#ifndef TRITON_ENABLE_GPU +using cudaStream_t = void*; +#endif + namespace triton { namespace backend { namespace python { std::atomic non_graceful_exit = {false}; @@ -823,6 +827,20 @@ Stub::Finalize() LOG_INFO << e.what(); } } +#ifdef TRITON_ENABLE_GPU + // We also need to destroy created proxy CUDA streams for dlpack, if any + std::lock_guard lock(dlpack_proxy_stream_pool_mu_); + for (auto& entry : dlpack_proxy_stream_pool_) { + // We don't need to switch device to destroy a stream + // https://stackoverflow.com/questions/64663943/how-to-destroy-a-stream-that-was-created-on-a-specific-device + cudaError_t err = cudaStreamDestroy(entry.second); + if (err != cudaSuccess) { + LOG_ERROR + << "Failed to destroy dlpack CUDA proxy stream on device with id " + + std::to_string(entry.first); + } + } +#endif } void @@ -1245,6 +1263,29 @@ Stub::SendCustomMetricsMessage( } } +cudaStream_t +Stub::GetProxyStream(const int& device_id) +{ +#ifdef TRITON_ENABLE_GPU + std::lock_guard lock(dlpack_proxy_stream_pool_mu_); + if (dlpack_proxy_stream_pool_.find(device_id) == + dlpack_proxy_stream_pool_.end()) { + cudaStream_t new_proxy_stream; + cudaError_t err = cudaStreamCreate(&new_proxy_stream); + if (err == cudaSuccess) { + dlpack_proxy_stream_pool_.emplace(device_id, new_proxy_stream); + return new_proxy_stream; + } else { + throw PythonBackendException( + "Failed to create a CUDA stream for a DLPack call."); + } + } + return dlpack_proxy_stream_pool_[device_id]; +#else + return nullptr; +#endif +} + std::unique_ptr Logger::log_instance_; std::unique_ptr& diff --git a/src/pb_stub.h b/src/pb_stub.h index 7e4f2c56..c9188631 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -56,6 +56,10 @@ namespace bi = boost::interprocess; namespace py = pybind11; using namespace pybind11::literals; +#ifndef TRITON_ENABLE_GPU +using cudaStream_t = void*; +#endif + namespace triton { namespace backend { namespace python { #define LOG_IF_EXCEPTION(X) \ @@ -297,6 +301,10 @@ class Stub { AllocatedSharedMemory& custom_metrics_msg_shm, CustomMetricsMessage** custom_metrics_msg); + /// Helper function to retrieve a proxy stream for dlpack synchronization + /// for provided device + cudaStream_t GetProxyStream(const int& device_id); + private: bi::interprocess_mutex* stub_mutex_; bi::interprocess_condition* stub_cond_; @@ -335,5 +343,7 @@ class Stub { std::mutex response_iterator_map_mu_; std::unordered_map> response_iterator_map_; + std::mutex 
dlpack_proxy_stream_pool_mu_; + std::unordered_map dlpack_proxy_stream_pool_; }; }}} // namespace triton::backend::python diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc index f3cb1de3..20d5302f 100644 --- a/src/pb_tensor.cc +++ b/src/pb_tensor.cc @@ -29,6 +29,7 @@ #endif // TRITON_ENABLE_GPU #ifdef TRITON_PB_STUB +#include "pb_stub.h" #include "pb_stub_utils.h" namespace py = pybind11; #endif @@ -353,6 +354,7 @@ PbTensor::FromDLPack(const std::string& name, const py::object& tensor) #ifdef TRITON_ENABLE_GPU int current_device; cudaError_t err = cudaGetDevice(¤t_device); + std::unique_ptr& stub = Stub::GetOrCreateInstance(); if (err != cudaSuccess) { throw PythonBackendException("Failed to get current CUDA device id."); } @@ -364,22 +366,8 @@ PbTensor::FromDLPack(const std::string& name, const py::object& tensor) "Failed to set CUDA device to device with id " + std::to_string(capsule_device_info.second)); } - // In case there is a pending job on the data, where this capsule - // is pointing to, we need to wait for it before consuming. - // This is important for when data is located on different - // context (GPU) and work is done on the default stream. - // For this scenario, __dlpack__ implementation may skip - // syncronization (since the work is on the default stream) - // and we will return pointer to the data on different GPU too early - // (i.e. before pending work is done). Thus we sync on the default stream - // only in the case we switched to a different context. - err = overridden ? cudaStreamSynchronize(0) : cudaSuccess; - if (err != cudaSuccess) { - throw PythonBackendException( - "Failed to synchronize CUDA device with id " + - std::to_string( - overridden ? capsule_device_info.second : current_device)); - } + + cudaStream_t proxy_stream = stub->GetProxyStream(current_device); // Array API requirements for the stream argument: // stream = 1 the legacy default stream (in this case should @@ -389,7 +377,22 @@ PbTensor::FromDLPack(const std::string& name, const py::object& tensor) // must assume the legacy default stream. Reference: // https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html auto ptr_to_tensor = FromDLPackCapsule( - name, tensor.attr("__dlpack__")(py::arg("stream") = py::int_(1))); + name, tensor.attr("__dlpack__")( + py::arg("stream") = + py::int_(reinterpret_cast(proxy_stream)))); + + // In case there is a pending job on the data, where this capsule + // is pointing to, we need to wait for it to finish before returning + // capsule. + // We synchronize on the proxy stream explicitly since that what we + // pass to external tensor's `__dlpack__` method. + err = cudaStreamSynchronize(proxy_stream); + if (err != cudaSuccess) { + throw PythonBackendException( + "Failed to synchronize CUDA device with id " + + std::to_string( + overridden ? capsule_device_info.second : current_device)); + } err = overridden ? cudaSetDevice(current_device) : cudaSuccess; if (err != cudaSuccess) { @@ -412,9 +415,8 @@ PbTensor::FromDLPack(const std::string& name, const py::object& tensor) " is not support by Python backend."); } - // If data is located on CPU, `stream=None` is the only accepted argument - // according to array API. For GPU, when `stream=None` producer must - // assume the legacy default stream. + // If data is located on a CPU, `stream=None` is the only accepted argument + // according to array API. 
// Reference: // https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html return FromDLPackCapsule( From a654b4f0242435605eaf70851ac1aef5c4927e55 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Sat, 20 May 2023 05:35:34 -0400 Subject: [PATCH 105/216] Fix cuda synchronization (#245) * Fix cuda synchronization * address comment * Address comment --- src/pb_memory.cc | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/pb_memory.cc b/src/pb_memory.cc index 2354391f..beecb3d9 100644 --- a/src/pb_memory.cc +++ b/src/pb_memory.cc @@ -141,8 +141,16 @@ PbMemory::CopyBuffer( kind = cudaMemcpyDeviceToDevice; } - cudaError_t err = - cudaMemcpy(dst->DataPtr(), src->DataPtr(), src->ByteSize(), kind); + cudaError_t err; + if ((kind == cudaMemcpyDeviceToDevice) && + (src->MemoryTypeId() != dst->MemoryTypeId())) { + err = cudaMemcpyPeer( + dst->DataPtr(), dst->MemoryTypeId(), src->DataPtr(), + src->MemoryTypeId(), src->ByteSize()); + + } else { + err = cudaMemcpy(dst->DataPtr(), src->DataPtr(), src->ByteSize(), kind); + } if (err != cudaSuccess) { throw PythonBackendException( @@ -150,6 +158,19 @@ PbMemory::CopyBuffer( "failed to copy data: " + std::string(cudaGetErrorString(err))) .c_str()); } + + if (kind == cudaMemcpyDeviceToDevice) { + // Synchronize the default stream for d2d copies. + // https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html#api-sync-behavior__memcpy-sync + err = cudaStreamSynchronize(0); + if (err != cudaSuccess) { + throw PythonBackendException( + std::string( + "failed to synchronize the default CUDA stream. error: " + + std::string(cudaGetErrorString(err))) + .c_str()); + } + } #endif } From 78c6fbf5e50a9f254ede583f7837917e203f5892 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Tue, 23 May 2023 15:38:45 -0700 Subject: [PATCH 106/216] Improve the documentation for BLS (#248) * Improve the documentation for BLS * Address comment * Address comment Co-authored-by: Iman Tabrizian --------- Co-authored-by: Iman Tabrizian --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d3a44505..786f5227 100644 --- a/README.md +++ b/README.md @@ -877,8 +877,10 @@ Scripting (BLS)*. Starting from 21.08, you can implement BLS in your Python model. A new set of utility functions allows you to execute inference requests on other models -being served by Triton as a part of executing your Python model. Example below -shows how to use this feature: +being served by Triton as a part of executing your Python model. Note that BLS +should only be used inside the `execute` function and is not supported +in the `initialize` or `finalize` methods. 
Example below shows how to use this +feature: ```python import triton_python_backend_utils as pb_utils From bca5a5ac2a591d31e7c7a6241f1b09fd6f463aed Mon Sep 17 00:00:00 2001 From: Misha Chornyi Date: Sun, 2 Apr 2023 17:25:58 -0700 Subject: [PATCH 107/216] Set CMAKE to pull latest version --- inferentia/scripts/setup.sh | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/inferentia/scripts/setup.sh b/inferentia/scripts/setup.sh index 8321f988..fd537867 100644 --- a/inferentia/scripts/setup.sh +++ b/inferentia/scripts/setup.sh @@ -115,14 +115,16 @@ apt-get update && \ libarchive-dev \ rapidjson-dev -# CMake -wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | \ - gpg --dearmor - | \ - tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null && \ -apt-add-repository 'deb https://apt.kitware.com/ubuntu/ focal main' && \ -apt-get update && \ -apt-get install -y --no-install-recommends \ -cmake-data=3.21.1-0kitware1ubuntu20.04.1 cmake=3.21.1-0kitware1ubuntu20.04.1 && \ +# Using CMAKE installation instruction from:: https://apt.kitware.com/ +apt install -y gpg wget && \ + wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | \ + gpg --dearmor - | \ + tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null && \ + . /etc/os-release && \ + echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | \ + tee /etc/apt/sources.list.d/kitware.list >/dev/null && \ + apt-get update && \ + apt-get install -y --no-install-recommends cmake cmake-data cmake --version # First compile correct python stub From a156eba0cc03739a5bdcf59da545d00c531e4ee3 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Fri, 2 Jun 2023 16:12:42 -0400 Subject: [PATCH 108/216] Update mentions of Python 3.8 -> 3.10 (#253) --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 786f5227..5b2c9edd 100644 --- a/README.md +++ b/README.md @@ -582,13 +582,13 @@ documentation. ## Managing Python Runtime and Libraries Python backend shipped in the [NVIDIA GPU Cloud](https://ngc.nvidia.com/) -containers uses Python 3.8. Python backend is able to use the libaries +containers uses Python 3.10. Python backend is able to use the libaries that exist in the current Python environment. These libraries can be installed in a virtualenv, conda environment, or the global system Python. These libraries will only be used if the Python version matches the Python version of the Python backend's stub executable. For example, if you install a set of libraries in a Python 3.9 environment and your -Python backend stub is compiled with Python 3.8 these libraries will NOT +Python backend stub is compiled with Python 3.10 these libraries will NOT be available in your Python model served using Triton. You would need to compile the stub executble with Python 3.9 using the instructions in [Building Custom Python Backend Stub](#building-custom-python-backend-stub) @@ -597,7 +597,7 @@ section. 
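A quick way to confirm which interpreter the stub actually embeds is to log it from a model. The snippet below is a minimal, hypothetical `model.py` used only for this check (it is not part of the backend); the version it prints is the one your libraries must match:

```python
import sys

import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def initialize(self, args):
        # The interpreter reported here is the one embedded by the stub
        # executable, which may differ from the `python3` on your PATH.
        print("python_backend stub Python version: " + sys.version, flush=True)

    def execute(self, requests):
        # This diagnostic model does no work; it returns an empty response
        # for every request.
        return [pb_utils.InferenceResponse(output_tensors=[]) for _ in requests]
```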
### Building Custom Python Backend Stub **Important Note: You only need to compile a custom Python backend stub if the -Python version is different from Python 3.8 which is shipped by +Python version is different from Python 3.10 which is shipped by default in the Triton containers.** Python backend uses a *stub* process to connect your `model.py` file to the @@ -759,7 +759,7 @@ In this case you only need to pack your environment using `conda-pack` and provide the path to tar file in the model config. However, the previous note still applies here and the version of the Python interpreter inside the conda environment must match the Python version of stub used by Python backend. The -default version of the stub is Python 3.8. +default version of the stub is Python 3.10. 3. You can share a single execution environment across multiple models. You need to provide the path to the tar file in the `EXECUTION_ENV_PATH` in the From 637c7e3f5420e95afd8db701d8cd64eab5d2e484 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 5 Jun 2023 17:14:14 -0400 Subject: [PATCH 109/216] Remove the requirement to specify output_tensors when setting an error (#251) * Remove the requirement to specify output_tensors when setting an error response * Review edit --- README.md | 10 +++++++--- src/pb_stub.cc | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5b2c9edd..324de7b4 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ any C++ code. - [`initialize`](#initialize) - [`execute`](#execute) - [Default Mode](#default-mode) + - [Error Handling](#error-handling) - [Decoupled mode](#decoupled-mode) - [Use Cases](#use-cases) - [Known Issues](#known-issues) @@ -55,7 +56,7 @@ any C++ code. - [Building Custom Python Backend Stub](#building-custom-python-backend-stub) - [Creating Custom Execution Environments](#creating-custom-execution-environments) - [Important Notes](#important-notes) - - [Error Handling](#error-handling) + - [Error Handling](#error-handling-1) - [Managing Shared Memory](#managing-shared-memory) - [Multiple Model Instance Support](#multiple-model-instance-support) - [Running Multiple Instances of Triton Server](#running-multiple-instances-of-triton-server) @@ -450,6 +451,8 @@ Upon return from the execute function all tensor data associated with the InferenceRequest objects passed to the function are deleted, and so InferenceRequest objects should not be retained by the Python model. +#### Error Handling + In case one of the requests has an error, you can use the `TritonError` object to set the error message for that specific request. Below is an example of setting errors for an `InferenceResponse` object: @@ -466,9 +469,10 @@ class TritonPythonModel: for request in requests: if an_error_occurred: - # If there is an error, the output_tensors are ignored + # If there is an error, there is no need to pass the + # "output_tensors" to the InferenceResponse. The "output_tensors" + # that are passed in this case will be ignored. 
responses.append(pb_utils.InferenceResponse( - output_tensors=[], error=pb_utils.TritonError("An Error Occurred"))) return responses diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 9539a250..234cfd9f 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -1539,7 +1539,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::init< const std::vector>&, std::shared_ptr>(), - py::arg("output_tensors").none(false), + py::arg("output_tensors") = py::list(), py::arg("error") = static_cast>(nullptr)) .def( "output_tensors", &InferResponse::OutputTensors, From 0a54e59380436ac6f2c476025e18af10331aca59 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Tue, 6 Jun 2023 14:00:06 -0400 Subject: [PATCH 110/216] Fix error handling for GPU tensors (#249) * Fix error handling for GPU tensors * Fix GPU buffer handling * Review edit * Fix for dynamically batched responses with GPU tensor * Review edits * Fix unused i variable for GPU=OFF * Review comments * Review edit --- CMakeLists.txt | 2 + src/gpu_buffers.cc | 88 ++++++++++++++++++++++++++++++ src/gpu_buffers.h | 67 +++++++++++++++++++++++ src/infer_request.cc | 11 +++- src/infer_response.cc | 54 +++++++----------- src/infer_response.h | 13 +++-- src/pb_stub.cc | 60 +++++++++++--------- src/pb_utils.h | 10 +--- src/python_be.cc | 121 +++++++++++++---------------------------- src/response_sender.cc | 26 +++++---- 10 files changed, 280 insertions(+), 172 deletions(-) create mode 100644 src/gpu_buffers.cc create mode 100644 src/gpu_buffers.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 213b1927..3659c7bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,6 +163,8 @@ set( src/metric.cc src/metric_family.h src/metric_family.cc + src/gpu_buffers.cc + src/gpu_buffers.h ) set( diff --git a/src/gpu_buffers.cc b/src/gpu_buffers.cc new file mode 100644 index 00000000..6b370ea1 --- /dev/null +++ b/src/gpu_buffers.cc @@ -0,0 +1,88 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
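+
+// GPUBuffersHelper gathers the shared-memory handles of GPU tensor buffers
+// and publishes them, or an error message, as a single GPUBuffersShm block
+// that is exchanged between the parent process and the stub.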
+ +#include "gpu_buffers.h" +#include "pb_string.h" + +namespace triton { namespace backend { namespace python { +GPUBuffersHelper::GPUBuffersHelper() +{ + completed_ = false; +} + +void +GPUBuffersHelper::AddBuffer(const bi::managed_external_buffer::handle_t& handle) +{ + if (completed_) { + throw PythonBackendException( + "It is not possible to add buffers after 'Complete' has been called on " + "a GPUBuffersHelper."); + } + + buffers_.emplace_back(handle); +} + +void +GPUBuffersHelper::SetError( + std::unique_ptr& shm_pool, const std::string& error) +{ + error_shm_ = PbString::Create(shm_pool, error); +} + +void +GPUBuffersHelper::Complete(std::unique_ptr& shm_pool) +{ + if (completed_) { + throw PythonBackendException( + "Complete has already been called. Complete should only be called " + "once."); + } + gpu_buffers_shm_ = shm_pool->Construct(); + if (!error_shm_) { + buffers_handle_shm_ = + shm_pool->Construct( + buffers_.size()); + gpu_buffers_shm_.data_->buffer_count = buffers_.size(); + gpu_buffers_shm_.data_->success = true; + gpu_buffers_shm_.data_->buffers = buffers_handle_shm_.handle_; + for (size_t i = 0; i < buffers_.size(); ++i) { + buffers_handle_shm_.data_.get()[i] = buffers_[i]; + } + } else { + gpu_buffers_shm_.data_->success = false; + gpu_buffers_shm_.data_->error = error_shm_->ShmHandle(); + } + completed_ = true; +} + + +bi::managed_external_buffer::handle_t +GPUBuffersHelper::ShmHandle() +{ + return gpu_buffers_shm_.handle_; +} + +}}} // namespace triton::backend::python diff --git a/src/gpu_buffers.h b/src/gpu_buffers.h new file mode 100644 index 00000000..fd683ba7 --- /dev/null +++ b/src/gpu_buffers.h @@ -0,0 +1,67 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include "pb_string.h" +#include "pb_utils.h" +#include "scoped_defer.h" + +namespace triton { namespace backend { namespace python { + +/// \param success indicating whether the process of fetching the GPU buffers +/// was successful. 
+/// \param error if success is equal to false, the error object will be set. +/// \param buffers list of buffers elements. +/// \param buffer_count the number of buffers. +struct GPUBuffersShm { + bool success; + bi::managed_external_buffer::handle_t error; + bi::managed_external_buffer::handle_t buffers; + uint32_t buffer_count; +}; + +/// Helper class to facilitate transfer of metadata associated +/// the GPU buffers in shared memory. +class GPUBuffersHelper { + public: + GPUBuffersHelper(); + void AddBuffer(const bi::managed_external_buffer::handle_t& handle); + void Complete(std::unique_ptr& shm_pool); + void SetError( + std::unique_ptr& shm_pool, const std::string& error); + bi::managed_external_buffer::handle_t ShmHandle(); + + private: + AllocatedSharedMemory gpu_buffers_shm_; + std::vector buffers_; + AllocatedSharedMemory + buffers_handle_shm_; + std::unique_ptr error_shm_; + bool completed_; +}; + +}}}; // namespace triton::backend::python diff --git a/src/infer_request.cc b/src/infer_request.cc index 2a9799db..3ecde9e8 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -28,6 +28,7 @@ #include +#include "gpu_buffers.h" #include "pb_utils.h" #include "scoped_defer.h" #ifdef TRITON_PB_STUB @@ -481,11 +482,19 @@ InferRequest::Exec(const bool is_decoupled) // Additional round trip required for asking the stub process // to fill in the GPU tensor buffers if (has_gpu_tensor) { + AllocatedSharedMemory gpu_buffers_shm = + shm_pool->Load( + request_batch_shm_ptr->gpu_buffers_handle); AllocatedSharedMemory gpu_buffers_handle = shm_pool->Load( - request_batch_shm_ptr->gpu_buffers_handle); + gpu_buffers_shm.data_->buffers); try { + if (!gpu_buffers_shm.data_->success) { + std::unique_ptr error = PbString::LoadFromSharedMemory( + shm_pool, gpu_buffers_shm.data_->error); + throw PythonBackendException(error->String()); + } #ifdef TRITON_ENABLE_GPU size_t i = 0; for (auto& input_tensor : this->Inputs()) { diff --git a/src/infer_response.cc b/src/infer_response.cc index 4defd74b..668a03d1 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -201,64 +201,50 @@ InferResponse::IsLastResponse() } #ifndef TRITON_PB_STUB -std::shared_ptr +void InferResponse::Send( - TRITONBACKEND_ResponseFactory* response_factory, void* cuda_stream, + TRITONBACKEND_Response* response, void* cuda_stream, bool& requires_deferred_callback, const uint32_t flags, std::unique_ptr& shm_pool, + GPUBuffersHelper& gpu_buffer_helper, std::vector, void*>>& output_buffers, - const std::set& requested_output_names, - TRITONBACKEND_Response* response) + const std::set& requested_output_names) { std::shared_ptr response_error = WrapTritonErrorInSharedPtr(nullptr); std::unique_ptr response_error_handling; requires_deferred_callback = false; - // Should only destruct the response factory whenever a response factory is - // being created. - bool destruct_response_factor = (response == nullptr); - - if (response == nullptr) { - SET_ERROR_AND_RETURN( - response_error, - TRITONBACKEND_ResponseNewFromFactory(&response, response_factory)); - } - // This lambda expression will be called when this function exits, if the // inference response doesn't have any GPU tensors. Otherwise, it will be // called when the object is destructed or DeferredSendCallback is called. 
- response_error_handling = std::make_unique( - [response, response_error, flags, response_factory, - destruct_response_factor] { + response_error_handling = + std::make_unique([response, response_error, flags] { if (response != nullptr) { LOG_IF_ERROR( TRITONBACKEND_ResponseSend(response, flags, *response_error), "failed to send the response."); - if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL && - destruct_response_factor) { - std::unique_ptr< - TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> - response_factory_ptr( - reinterpret_cast( - response_factory)); - } } }); // Moves the response sending callback so that it is not called until the stub // process fills in the GPU buffers. - ScopedDefer deferred_task( - [this, &requires_deferred_callback, &response_error_handling] { - if (requires_deferred_callback) { - deferred_send_callback_ = std::move(response_error_handling); - } - }); + ScopedDefer deferred_task([this, &requires_deferred_callback, + &response_error_handling, &gpu_buffer_helper, + response_error, &shm_pool] { + if (*response_error != nullptr) { + gpu_buffer_helper.SetError( + shm_pool, TRITONSERVER_ErrorMessage(*response_error)); + } + if (requires_deferred_callback) { + deferred_send_callback_ = std::move(response_error_handling); + } + }); if (HasError()) { *response_error = TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, Error()->Message().c_str()); - return nullptr; + return; } bool cuda_copy = false; @@ -322,6 +308,7 @@ InferResponse::Send( output_tensor->ByteSize(), reinterpret_cast(buffer), true /* copy_gpu */)); } + gpu_buffer_helper.AddBuffer(output_buffer->ShmHandle()); output_buffers.push_back({std::move(output_buffer), buffer}); #endif } @@ -336,6 +323,7 @@ InferResponse::Send( shm_pool, actual_memory_type, actual_memory_type_id, output_tensor->ByteSize(), nullptr /* data ptr */)); + gpu_buffer_helper.AddBuffer(output_buffer->ShmHandle()); output_buffers.push_back({std::move(output_buffer), buffer}); } @@ -357,8 +345,6 @@ InferResponse::Send( cudaStreamSynchronize(reinterpret_cast(cuda_stream)); } #endif // TRITON_ENABLE_GPU - - return response_error; } #endif diff --git a/src/infer_response.h b/src/infer_response.h index 9197df4e..330354a1 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -27,6 +27,7 @@ #pragma once #include +#include "gpu_buffers.h" #include "pb_error.h" #include "pb_tensor.h" #include "pb_utils.h" @@ -49,7 +50,7 @@ struct ResponseShm { TRITONSERVER_Error* raasnie_err__ = (X); \ if (raasnie_err__ != nullptr) { \ *E = raasnie_err__; \ - return E; \ + return; \ } \ } while (false) @@ -62,7 +63,7 @@ struct ResponseShm { TRITONSERVER_Error* rarie_err__ = TRITONSERVER_ErrorNew( \ TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); \ *E = rarie_err__; \ - return E; \ + return; \ } \ } while (false) @@ -96,13 +97,13 @@ class InferResponse { /// response needs to be done in two step. The boolean /// 'requires_deferred_callback' indicates whether DeferredSendCallback method /// should be called or not. 
- std::shared_ptr Send( - TRITONBACKEND_ResponseFactory* response_factory, void* cuda_stream, + void Send( + TRITONBACKEND_Response* response, void* cuda_stream, bool& requires_deferred_callback, const uint32_t flags, std::unique_ptr& shm_pool, + GPUBuffersHelper& gpu_buffer_helper, std::vector, void*>>& output_buffers, - const std::set& requested_output_names = {}, - TRITONBACKEND_Response* response = nullptr); + const std::set& requested_output_names = {}); void DeferredSendCallback(); #endif diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 234cfd9f..22ecd7e9 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -356,9 +356,10 @@ Stub::RunCommand() LoadGPUBuffers(ipc_message); } catch (const PythonBackendException& pb_exception) { - LOG_INFO << "An error occurred while trying to load GPU buffers in the " - "Python backend stub: " - << pb_exception.what() << std::endl; + LOG_ERROR + << "An error occurred while trying to load GPU buffers in the " + "Python backend stub: " + << pb_exception.what() << std::endl; } break; @@ -539,43 +540,48 @@ Stub::ProcessResponse(InferResponse* response) void Stub::LoadGPUBuffers(std::unique_ptr& ipc_message) { - AllocatedSharedMemory gpu_buffers_handle = - shm_pool_->Load(ipc_message->Args()); + ScopedDefer load_gpu_buffer_response([this] { + // LoadGPUBuffers must let the parent process know when loading the + // buffers have been finished. + parent_message_queue_->Push(DUMMY_MESSAGE); + gpu_tensors_.clear(); + }); - uint64_t* gpu_buffer_count = - reinterpret_cast(gpu_buffers_handle.data_.get()); - bi::managed_external_buffer::handle_t* gpu_buffers_handle_shm = - reinterpret_cast( - gpu_buffers_handle.data_.get() + sizeof(uint64_t)); - - if (gpu_tensors_.size() != *gpu_buffer_count) { - LOG_INFO - << (std::string( - "GPU buffers size does not match the provided buffers: ") + - std::to_string(gpu_tensors_.size()) + - " != " + std::to_string(*gpu_buffer_count)); - return; + AllocatedSharedMemory gpu_buffers_handle = + shm_pool_->Load(ipc_message->Args()); + + if (!gpu_buffers_handle.data_->success) { + std::unique_ptr error = PbString::LoadFromSharedMemory( + shm_pool_, gpu_buffers_handle.data_->error); + throw PythonBackendException( + "Failed to load GPU buffers: " + error->String()); } - std::vector> dst_buffers; + uint64_t gpu_buffer_count = gpu_buffers_handle.data_->buffer_count; + AllocatedSharedMemory + gpu_buffers_handle_shm = + shm_pool_->Load( + gpu_buffers_handle.data_->buffers); + if (gpu_tensors_.size() != gpu_buffer_count) { + throw PythonBackendException( + std::string("GPU buffers size does not match the provided buffers: ") + + std::to_string(gpu_tensors_.size()) + + " != " + std::to_string(gpu_buffer_count)); + } + + std::vector> dst_buffers; for (size_t i = 0; i < gpu_tensors_.size(); i++) { std::unique_ptr dst_buffer = PbMemory::LoadFromSharedMemory( - shm_pool_, gpu_buffers_handle_shm[i], true /* open_cuda_handle */); + shm_pool_, gpu_buffers_handle_shm.data_.get()[i], + true /* open_cuda_handle */); dst_buffers.emplace_back(std::move(dst_buffer)); } - ScopedDefer load_gpu_buffer_response([this] { - // Push a dummy message to signal the thread to terminate. 
- parent_message_queue_->Push(DUMMY_MESSAGE); - }); - for (size_t i = 0; i < gpu_tensors_.size(); i++) { std::shared_ptr& src_buffer = gpu_tensors_[i]; PbMemory::CopyBuffer(dst_buffers[i], src_buffer->Memory()); } - - gpu_tensors_.clear(); } py::list diff --git a/src/pb_utils.h b/src/pb_utils.h index 36d7e3c7..c05e8411 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -212,23 +212,17 @@ struct ResponseSenderBase { struct ResponseSendMessage : ResponseSenderBase { bi::managed_external_buffer::handle_t response; - // GPU Buffers handle + // A shm handle to a GPUBuffersShm object. bi::managed_external_buffer::handle_t gpu_buffers_handle; - // GPU buffers count - uint32_t gpu_buffers_count; - uint32_t flags; }; struct RequestBatch { uint32_t batch_size; - // GPU Buffers handle + // A shm handle to a GPUBuffersShm object. bi::managed_external_buffer::handle_t gpu_buffers_handle; - - // GPU buffers count - uint32_t gpu_buffers_count; }; #ifdef TRITON_ENABLE_GPU diff --git a/src/python_be.cc b/src/python_be.cc index 5c815485..87375348 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -25,6 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "python_be.h" +#include "gpu_buffers.h" #include "infer_payload.h" #include "pb_log.h" @@ -623,10 +624,9 @@ ModelInstanceState::ExecuteBLSRequest( is_response_batch_set = true; bool has_gpu_tensor = false; + GPUBuffersHelper gpu_buffer_helper; PythonBackendException pb_exception(std::string{}); - - uint32_t gpu_buffers_count = 0; if (request_batch_shm_ptr->batch_size == 1) { std::shared_ptr infer_request; bi::managed_external_buffer::handle_t* request_handle = @@ -643,7 +643,6 @@ ModelInstanceState::ExecuteBLSRequest( for (auto& input_tensor : infer_request->Inputs()) { if (!input_tensor->IsCPU()) { #ifdef TRITON_ENABLE_GPU - gpu_buffers_count++; BackendMemory* backend_memory; std::unique_ptr lbackend_memory; has_gpu_tensor = true; @@ -661,38 +660,25 @@ ModelInstanceState::ExecuteBLSRequest( lbackend_memory.reset(backend_memory); input_tensor->SetMemory(std::move(PbMemory::Create( Stub()->ShmPool(), std::move(lbackend_memory)))); + gpu_buffer_helper.AddBuffer( + input_tensor->Memory()->ShmHandle()); #endif // TRITON_ENABLE_GPU } } } catch (const PythonBackendException& exception) { + gpu_buffer_helper.SetError(Stub()->ShmPool(), exception.what()); pb_exception = exception; } - AllocatedSharedMemory gpu_handles; // Wait for the extra round trip to complete. The stub process will fill // in the data for the GPU tensors. If there is an error, the extra round // trip must be still completed, otherwise the stub process will always be // waiting for a message from the parent process. 
if (has_gpu_tensor) { - try { - gpu_handles = Stub() - ->ShmPool() - ->Construct( - gpu_buffers_count); - request_batch_shm_ptr->gpu_buffers_count = gpu_buffers_count; - request_batch_shm_ptr->gpu_buffers_handle = gpu_handles.handle_; - size_t i = 0; - for (auto& input_tensor : infer_request->Inputs()) { - if (!input_tensor->IsCPU()) { - gpu_handles.data_.get()[i] = input_tensor->Memory()->ShmHandle(); - ++i; - } - } - } - catch (const PythonBackendException& exception) { - pb_exception = exception; - } + gpu_buffer_helper.Complete(Stub()->ShmPool()); + request_batch_shm_ptr->gpu_buffers_handle = + gpu_buffer_helper.ShmHandle(); bi::scoped_lock lock{ *(ipc_message->ResponseMutex())}; @@ -700,7 +686,7 @@ ModelInstanceState::ExecuteBLSRequest( ipc_message->ResponseCondition()->wait(lock); } - if (pb_exception.what() != nullptr) { + if (pb_exception.what() == std::string{""}) { auto callback = std::bind( &ModelInstanceState::SendBLSDecoupledResponse, this, std::placeholders::_1); @@ -1071,32 +1057,31 @@ ModelInstanceState::ResponseSendDecoupled( false /* open cuda ipc handle */); bool requires_deferred_callback = false; + TRITONBACKEND_Response* response; + SetErrorForResponseSendMessage( + send_message_payload, + WrapTritonErrorInSharedPtr( + TRITONBACKEND_ResponseNewFromFactory(&response, response_factory)), + error_message); + std::vector, void*>> gpu_output_buffers; - std::shared_ptr error = infer_response->Send( - response_factory, CudaStream(), requires_deferred_callback, - send_message_payload->flags, Stub()->ShmPool(), gpu_output_buffers); - SetErrorForResponseSendMessage(send_message_payload, error, error_message); + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + response_factory_ptr; + GPUBuffersHelper gpu_buffer_helper; + if (send_message_payload->flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { + response_factory_ptr.reset( + reinterpret_cast(response_factory)); + } + infer_response->Send( + response, CudaStream(), requires_deferred_callback, + send_message_payload->flags, Stub()->ShmPool(), gpu_buffer_helper, + gpu_output_buffers); if (requires_deferred_callback) { - AllocatedSharedMemory gpu_buffers_handle = - Stub()->ShmPool()->Construct( - sizeof(uint64_t) + - gpu_output_buffers.size() * - sizeof(bi::managed_external_buffer::handle_t)); - uint64_t* gpu_buffer_count = - reinterpret_cast(gpu_buffers_handle.data_.get()); - *gpu_buffer_count = gpu_output_buffers.size(); - bi::managed_external_buffer::handle_t* gpu_buffers_handle_shm = - reinterpret_cast( - gpu_buffers_handle.data_.get() + sizeof(uint64_t)); - send_message_payload->gpu_buffers_handle = gpu_buffers_handle.handle_; - - size_t index = 0; - for (auto& output_buffer_pair : gpu_output_buffers) { - std::unique_ptr& pb_memory = output_buffer_pair.first; - gpu_buffers_handle_shm[index] = pb_memory->ShmHandle(); - ++index; - } + gpu_buffer_helper.Complete(Stub()->ShmPool()); + send_message_payload->gpu_buffers_handle = + gpu_buffer_helper.ShmHandle(); // Additional round trip so that the stub can fill the GPU output buffers. 
{ @@ -1109,7 +1094,6 @@ ModelInstanceState::ResponseSendDecoupled( } } - index = 0; bool cuda_copy = false; for (auto& output_buffer_pair : gpu_output_buffers) { auto& pb_memory = output_buffer_pair.first; @@ -1125,8 +1109,6 @@ ModelInstanceState::ResponseSendDecoupled( CudaStream(), &cuda_used); cuda_copy |= cuda_used; } - gpu_buffers_handle_shm[index] = pb_memory->ShmHandle(); - ++index; #ifdef TRITON_ENABLE_GPU if (cuda_copy) { cudaStreamSynchronize(stream_); @@ -1407,6 +1389,7 @@ ModelInstanceState::ProcessRequests( std::vector> shm_responses; std::vector, void*>>> gpu_output_buffers(request_count); + GPUBuffersHelper gpu_buffer_helper; for (uint32_t r = 0; r < request_count; ++r) { NVTX_RANGE(nvtx_, "LoadingResponse " + Name()); @@ -1466,17 +1449,13 @@ ModelInstanceState::ProcessRequests( gpu_output_buffers[r] = std::vector, void*>>{}; - std::shared_ptr error = infer_response->Send( - nullptr, CudaStream(), require_deferred_callback, + infer_response->Send( + response, CudaStream(), require_deferred_callback, TRITONSERVER_RESPONSE_COMPLETE_FINAL, Stub()->ShmPool(), - gpu_output_buffers[r], requested_output_names, response); - GUARDED_RESPOND_IF_ERROR(responses, r, *error); + gpu_buffer_helper, gpu_output_buffers[r], requested_output_names); requires_deferred_callback[r] = require_deferred_callback; - // Error object will be deleted by the GUARDED_RESPOND macro - *error = nullptr; - error.reset(); if (requires_deferred_callback[r]) { has_gpu_output = true; } @@ -1488,39 +1467,15 @@ ModelInstanceState::ProcessRequests( // If the output tensor is in GPU, there will be a second round trip // required for filling the GPU buffers provided by the main process. if (has_gpu_output) { - size_t total_gpu_buffers_count = 0; - for (auto& gpu_output_buffer : gpu_output_buffers) { - total_gpu_buffers_count += gpu_output_buffer.size(); - } - AllocatedSharedMemory gpu_buffers_handle = - Stub()->ShmPool()->Construct( - sizeof(uint64_t) + - total_gpu_buffers_count * - sizeof(bi::managed_external_buffer::handle_t)); - uint64_t* gpu_buffer_count = - reinterpret_cast(gpu_buffers_handle.data_.get()); - *gpu_buffer_count = total_gpu_buffers_count; - bi::managed_external_buffer::handle_t* gpu_buffers_handle_shm = - reinterpret_cast( - gpu_buffers_handle.data_.get() + sizeof(uint64_t)); - - size_t index = 0; - for (auto& gpu_output_buffer : gpu_output_buffers) { - for (auto& buffer_memory_pair : gpu_output_buffer) { - gpu_buffers_handle_shm[index] = buffer_memory_pair.first->ShmHandle(); - ++index; - } - } - ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_LoadGPUBuffers; - ipc_message->Args() = gpu_buffers_handle.handle_; + gpu_buffer_helper.Complete(Stub()->ShmPool()); + ipc_message->Args() = gpu_buffer_helper.ShmHandle(); SendMessageAndReceiveResponse( ipc_message->ShmHandle(), response_message, restart, responses, requests, 0); bool cuda_copy = false; - index = 0; uint32_t response_index = 0; for (auto& gpu_output_buffer : gpu_output_buffers) { for (auto& buffer_memory_pair : gpu_output_buffer) { @@ -1538,8 +1493,6 @@ ModelInstanceState::ProcessRequests( CudaStream(), &cuda_used)); cuda_copy |= cuda_used; } - gpu_buffers_handle_shm[index] = pb_memory->ShmHandle(); - ++index; } response_index++; #ifdef TRITON_ENABLE_GPU diff --git a/src/response_sender.cc b/src/response_sender.cc index e8394df9..31e1be5b 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -130,20 +130,21 @@ ResponseSender::Send( } if (has_gpu_output) { - AllocatedSharedMemory gpu_buffers_handle = - 
shm_pool_->Load(send_message_payload->gpu_buffers_handle); - - bi::managed_external_buffer::handle_t* gpu_buffers_handle_shm = - reinterpret_cast( - gpu_buffers_handle.data_.get() + sizeof(uint64_t)); - uint64_t* gpu_buffer_count = - reinterpret_cast(gpu_buffers_handle.data_.get()); - if (gpu_tensors.size() != *gpu_buffer_count) { - LOG_INFO + AllocatedSharedMemory gpu_buffers_handle = + shm_pool_->Load( + send_message_payload->gpu_buffers_handle); + + AllocatedSharedMemory + gpu_buffers_handle_shm = + shm_pool_->Load( + gpu_buffers_handle.data_->buffers); + uint64_t gpu_buffer_count = gpu_buffers_handle.data_->buffer_count; + if (gpu_tensors.size() != gpu_buffer_count) { + LOG_ERROR << (std::string( "GPU buffers size does not match the provided buffers: ") + std::to_string(gpu_tensors.size()) + - " != " + std::to_string(*gpu_buffer_count)); + " != " + std::to_string(gpu_buffer_count)); return; } @@ -151,7 +152,8 @@ ResponseSender::Send( for (size_t i = 0; i < gpu_tensors.size(); i++) { std::unique_ptr dst_buffer = PbMemory::LoadFromSharedMemory( - shm_pool_, gpu_buffers_handle_shm[i], true /* open_cuda_handle */); + shm_pool_, gpu_buffers_handle_shm.data_.get()[i], + true /* open_cuda_handle */); dst_buffers.emplace_back(std::move(dst_buffer)); std::shared_ptr& src_buffer = gpu_tensors[i]; PbMemory::CopyBuffer(dst_buffers[i], src_buffer->Memory()); From 19d0334dc70c2e09776e58acc070316f2829924a Mon Sep 17 00:00:00 2001 From: Stephen Mugisha Date: Mon, 12 Jun 2023 23:21:05 +0300 Subject: [PATCH 111/216] Add missing word in README (#256) I think the word end is the logical missing word here --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 324de7b4..cbebf4a7 100644 --- a/README.md +++ b/README.md @@ -1152,7 +1152,7 @@ keyword and `asyncio.run` being introduced in Python 3.7. [Stateful models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#stateful-models) require setting additional flags in the inference request to indicate the -start and of a sequence. The `flags` argument in the `pb_utils.InferenceRequest` +start and end of a sequence. The `flags` argument in the `pb_utils.InferenceRequest` object can be used to indicate whether the request is the first or last request in the sequence. An example indicating that the request is starting the sequence: From 22558bc9147830b9397dfe0f1744b5ca445d678a Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Mon, 12 Jun 2023 14:33:45 -0700 Subject: [PATCH 112/216] Add error checking for the case where MetricFamily is deleted before deleting Metric. Update documentation. (#254) --- README.md | 5 ++++- src/metric.cc | 34 ++++++++++++++++++++++++---------- src/metric.h | 5 +++++ 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index cbebf4a7..961c6c3a 100644 --- a/README.md +++ b/README.md @@ -1326,7 +1326,10 @@ Starting from 23.05, you can utlize Custom Metrics API to register and collect custom metrics in the `initialize`, `execute`, and `finalize` functions of your Python model. The Custom Metrics API is the Python equivalent of the [TRITON C API custom metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md#custom-metrics) -support. +support. You will need to take the ownership of the custom metrics created +through the APIs and must manage their lifetime. 
Note that a `MetricFamily` +object should be deleted only after all the `Metric` objects under it are +deleted if you'd like to explicitly delete the custom metrics objects. Example below shows how to use this feature: diff --git a/src/metric.cc b/src/metric.cc index a6266dbb..cabf8352 100644 --- a/src/metric.cc +++ b/src/metric.cc @@ -124,11 +124,12 @@ Metric::SendCreateMetricRequest() void Metric::SendIncrementRequest(const double& value) { - std::unique_ptr& stub = Stub::GetOrCreateInstance(); - operation_value_ = value; - SaveToSharedMemory(stub->ShmPool()); - CustomMetricsMessage* custom_metrics_msg = nullptr; try { + CheckIfCleared(); + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + operation_value_ = value; + SaveToSharedMemory(stub->ShmPool()); + CustomMetricsMessage* custom_metrics_msg = nullptr; stub->SendCustomMetricsMessage( &custom_metrics_msg, PYTHONSTUB_MetricRequestIncrement, shm_handle_); } @@ -142,11 +143,12 @@ Metric::SendIncrementRequest(const double& value) void Metric::SendSetValueRequest(const double& value) { - std::unique_ptr& stub = Stub::GetOrCreateInstance(); - operation_value_ = value; - SaveToSharedMemory(stub->ShmPool()); - CustomMetricsMessage* custom_metrics_msg = nullptr; try { + CheckIfCleared(); + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + operation_value_ = value; + SaveToSharedMemory(stub->ShmPool()); + CustomMetricsMessage* custom_metrics_msg = nullptr; stub->SendCustomMetricsMessage( &custom_metrics_msg, PYTHONSTUB_MetricRequestSet, shm_handle_); } @@ -159,10 +161,11 @@ Metric::SendSetValueRequest(const double& value) double Metric::SendGetValueRequest() { - std::unique_ptr& stub = Stub::GetOrCreateInstance(); - SaveToSharedMemory(stub->ShmPool()); CustomMetricsMessage* custom_metrics_msg = nullptr; try { + CheckIfCleared(); + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); stub->SendCustomMetricsMessage( &custom_metrics_msg, PYTHONSTUB_MetricRequestValue, shm_handle_); } @@ -197,6 +200,17 @@ Metric::Clear() } } +void +Metric::CheckIfCleared() +{ + if (is_cleared_) { + throw PythonBackendException( + "Invalid metric operation as the corresponding 'MetricFamily' has been " + "deleted. The 'MetricFamily' object should be deleted AFTER its " + "corresponding 'Metric' objects have been deleted."); + } +} + #else void* Metric::InitializeTritonMetric() diff --git a/src/metric.h b/src/metric.h index 0c9da6db..882b0d5c 100644 --- a/src/metric.h +++ b/src/metric.h @@ -99,6 +99,11 @@ class Metric { /// Send the request to the parent process to get the value of the metric. /// \return Returns the value of the metric. double SendGetValueRequest(); + + /// Throws an exception if the metric has been cleared. This check is to avoid + /// the user error where the corresponding metric family has been deleted + /// before the metric is deleted. + void CheckIfCleared(); #else // Initialize the TRITONSERVER_Metric object. /// \return Returns the address of the TRITONSERVER_Metric object. From 995a8fa663924229de86444f1d74b09233b15358 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Mon, 12 Jun 2023 16:34:13 -0700 Subject: [PATCH 113/216] Fix ubuntu version in the doc (#257) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 961c6c3a..410643d7 100644 --- a/README.md +++ b/README.md @@ -782,7 +782,7 @@ file system paths are currently supported. The behavior of using cloud paths is compile it in the official Triton NGC containers. 
Otherwise, your compiled stub may use dependencies that are not available in the Triton container that you are using for deployment. For example, compiling the Python backend stub on an OS -other than Ubuntu 20.04 can lead to unexpected errors. +other than Ubuntu 22.04 can lead to unexpected errors. ## Error Handling From 85b23a60118ee095d63cd3ba060f3f8576c962d4 Mon Sep 17 00:00:00 2001 From: Nikhil Kulkarni Date: Tue, 13 Jun 2023 11:29:17 -0700 Subject: [PATCH 114/216] Make changes to python backend to run on inf2 and trn1 devices (#231) * Make changes to python backend to run on inf2 and trn1 devices * Add cmdline option to specify installation for inf1 and inf2 * Remove old script * README update * Fix bugs * fix indentation * store true * Store true * torch.neuron not required in inf2 * Fix model args * torch_neuronx fix * Make compiled model path change relative for inf2 only * Make inf1 default as NV ci does not support inf2 yet * Fix indentation * Address comment and add readme * Unpin protobuf * Address comments to remove tensorboard --- inferentia/README.md | 90 +++++++++++++-- inferentia/scripts/gen_triton_model.py | 63 ++++++++--- inferentia/scripts/setup-pre-container.sh | 100 +++++++++++++++-- inferentia/scripts/setup.sh | 128 ++++++++++++---------- 4 files changed, 291 insertions(+), 90 deletions(-) mode change 100644 => 100755 inferentia/scripts/setup-pre-container.sh diff --git a/inferentia/README.md b/inferentia/README.md index db04f180..132d302b 100644 --- a/inferentia/README.md +++ b/inferentia/README.md @@ -26,7 +26,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --> -# Using Triton with Inferentia +# Using Triton with Inferentia 1 Starting from 21.11 release, Triton supports [AWS Inferentia](https://aws.amazon.com/machine-learning/inferentia/) @@ -59,8 +59,6 @@ After logging into the inf1* instance, you will need to clone or simply clone with https. Clone this repo with Github to home repo `/home/ubuntu`. -Ensure that the neuron runtime 1.0 demon (neuron-rtd) is not running and set up -and install neuron 2.X runtime builds with ``` $chmod 777 /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh $sudo /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh @@ -87,16 +85,10 @@ After starting the Triton container, go into the `python_backend` folder and run ``` This script will: 1. Install necessary dependencies -2. Create a [Custom Python Execution Environment](https://github.com/triton-inference-server/python_backend#creating-custom-execution-environments), - `python_backend_stub` to use for Inferentia -3. Install [neuron-cc](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-cc/index.html), the Neuron compiler. +2. Install [neuron-cc](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-cc/index.html), the Neuron compiler. +3. Install neuron framework packages as per your preference e.g., either pytorch, or tensorflow or both. There are user configurable options available for the script as well. -For example, to control the python version for the python environment to 3.6, -you can run: -``` - $source /home/ubuntu/python_backend/inferentia/scripts/setup.sh -v 3.6 -``` Please use the `-h` or `--help` options to learn about more configurable options. ## Setting up the Inferentia model @@ -276,3 +268,79 @@ you would need to run $export TRITON_SERVER_REPO_TAG= ``` before running the script. 
+
+# Using Triton with Inferentia 2, or Trn1
+## pytorch-neuronx and tensorflow-neuronx
+1. Similar to the steps for inf1, change the argument to the pre-container and on-container setup scripts to include the `-inf2` or `-trn1` flags, e.g.,
+```
+ $chmod 777 /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh
+ $sudo /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh -inf2
+```
+2. On the container, after the `docker run` command, you can pass a similar argument to the setup.sh script.
+For PyTorch:
+```
+source /home/ubuntu/python_backend/inferentia/scripts/setup.sh -inf2 -p
+```
+For TensorFlow:
+```
+source /home/ubuntu/python_backend/inferentia/scripts/setup.sh -inf2 -t
+```
+3. Following the above steps, pass the similar `--inf2` argument to the `gen_triton_model.py` script, e.g., for PyTorch:
+```
+python3 inferentia/scripts/gen_triton_model.py --inf2 --model_type pytorch --triton_input INPUT__0,INT64,4x384 INPUT__1,INT64,4x384 INPUT__2,INT64,4x384 --triton_output OUTPUT__0,INT64,4x384 OUTPUT__1,INT64,4x384 --compiled_model bert_large_mlperf_neuron_hack_bs1_dynamic.pt --neuron_core_range 0:3 --triton_model_dir bert-large-mlperf-bs1x4
+```
+4. **Note**: When using the `--inf2` option, the `--compiled_model` path should be provided relative to the Triton model directory. The `initialize()` function in model.py will derive the full path by concatenating the model path within the repository and the relative `--compiled_model` path.
+## transformers-neuronx
+To use inf2/trn1 instances with transformers-neuronx packages for serving models, generate a `pytorch` model as per the above instructions. transformers-neuronx currently supports the models listed [here](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/transformers-neuronx/readme.html#currently-supported-models).
+
+As described in the neuronx documentation, the load API differs per model but follows the same pattern.
+
+1. To serve transformers-neuronx models, first trace the model using the `save_pretrained_split()` API on an inf2 instance (inf2.24xl is recommended for Large Language Models). Following that, package the resulting folder as the `--compiled_model` when using the `gen_triton_model.py` script.
+2. The following tree shows a sample model structure for an OPT model:
+```
+opt/
+├── 1
+│   └── model.py
+├── opt-125m-model
+│   └── pytorch_model.bin
+└── opt-125m-tp12
+    ├── FullyUnrolled.1814.1
+    │   ├── penguin-sg0000
+    │   └── sg00
+    ├── FullyUnrolled.1814.2
+    │   ├── penguin-sg0000
+    │   └── sg00
+    ├── FullyUnrolled.1814.3
+    │   ├── penguin-sg0000
+    │   └── sg00
+    ├── FullyUnrolled.1814.4
+    │   ├── penguin-sg0000
+    │   └── sg00
+    └── FullyUnrolled.1814.5
+        ├── penguin-sg0000
+        └── sg00
+    ├── config.pbtxt
+```
+3. Add the following imports (e.g., for OPT model). The import will differ as per the model you're trying to run.
+```
+from transformers_neuronx.opt.model import OPTForSampling
+```
+4. Add the following lines in `initialize()` function. Set the `batch_size`, `tp_degree`, `n_positions`, `amp` and `unroll` args as per your requirement. `tp_degree` should typically match the number of neuron cores available on inf2 instance.
+``` +batch_size = 1 +tp_degree = 12 +n_positions = 2048 +amp = 'bf16' +unroll = None +self.model_neuron = OPTForSampling.from_pretrained(compiled_model, batch_size=batch_size, amp=amp, tp_degree=tp_degree, n_positions=n_positions, unroll=unroll) +self.model_neuron.to_neuron() + +self.model_neuron.num_workers = num_threads +``` +You may also chose to add the `batch_size` etc. arguments to config.pbtxt as parameters and read them in the `initialize()` function similar to `--compiled-model`. +5. Finally, in the `excute()` function, use the following API to run the inference: +``` +batched_results = self.model_neuron.sample(batched_tensor, 2048) +``` +Above, `2048` is a sufficiently-long output token. It may also be passed in as one of the inputs if you wanto specify it as part of the payload. +6. Proceed to load the model, and submit the inference payload similar to any other triton model. \ No newline at end of file diff --git a/inferentia/scripts/gen_triton_model.py b/inferentia/scripts/gen_triton_model.py index 0e8a4e10..e02f9862 100644 --- a/inferentia/scripts/gen_triton_model.py +++ b/inferentia/scripts/gen_triton_model.py @@ -231,6 +231,7 @@ def initialize(self, args): params = model_config['parameters'] compiled_model = params['COMPILED_MODEL']['string_value'] + nc_start_idx = int(params['NEURON_CORE_START_INDEX']['string_value']) nc_end_idx = int(params['NEURON_CORE_END_INDEX']['string_value']) if nc_end_idx < nc_start_idx: @@ -255,7 +256,7 @@ def initialize(self, args): return init_impl -def get_tensorflow_initialize_impl(): +def get_tensorflow_initialize_impl(is_inf2=False): init_impl = get_common_initialize_impl() init_impl += ''' self.input_list = [] @@ -270,20 +271,27 @@ def get_tensorflow_initialize_impl(): (config_output['name'], config_output['data_type'], config_output['dims'])) - # TODO: Validate input/output from the model - os.environ["NEURON_RT_NUM_CORES"] = str(cores_per_instance) - +''' + if is_inf2: + init_impl += ''' + compiled_model = os.path.join(args['model_repository'], compiled_model) + self.pred_list = [ + tf.keras.models.load_model(compiled_model) + for _ in range(cores_per_instance) + ] * threads_per_core +''' + else: + init_impl += ''' self.pred_list = [ tf.contrib.predictor.from_saved_model(compiled_model) for _ in range(cores_per_instance) ] * threads_per_core - ''' return init_impl -def get_pytorch_initialize_impl(): +def get_pytorch_initialize_impl(is_inf2=False): init_impl = ''' def _validate_and_get_index(self, name): parts = name.split('__') @@ -340,11 +348,19 @@ def _validate_output_dict(self, expected_count): os.environ["NEURON_RT_VISIBLE_CORES"] = cores_range consumed_cores_list = [i for i in range(cores_per_instance)] - +''' + if is_inf2: + init_impl += ''' + compiled_model = os.path.join(args['model_repository'], compiled_model) + self.model_neuron = torch.jit.load(compiled_model) +''' + else: + init_impl += ''' self.model_neuron = torch.neuron.DataParallel( - torch.jit.load(compiled_model), device_ids=consumed_cores_list) + torch.jit.load(compiled_model), device_ids=consumed_cores_list) +''' + init_impl += ''' self.model_neuron.num_workers = num_threads - ''' return init_impl @@ -590,7 +606,7 @@ def finalize(self): def get_triton_python_model_impl(using_tensorflow_model, - disable_batch_requests_to_neuron): + disable_batch_requests_to_neuron, is_inf2=False): triton_pmi = ''' class TritonPythonModel: """Your Python model must use the same class name. 
Every Python model @@ -599,11 +615,11 @@ class TritonPythonModel: ''' if using_tensorflow_model: - triton_pmi += get_tensorflow_initialize_impl() + triton_pmi += get_tensorflow_initialize_impl(is_inf2) triton_pmi += get_tensorflow_execute_impl( disable_batch_requests_to_neuron) else: - triton_pmi += get_pytorch_initialize_impl() + triton_pmi += get_pytorch_initialize_impl(is_inf2) triton_pmi += get_pytorch_execute_impl(disable_batch_requests_to_neuron) triton_pmi += get_finalize_impl() @@ -611,7 +627,7 @@ class TritonPythonModel: return triton_pmi -def create_model_file(using_tensorflow_model, disable_batch_requests_to_neuron): +def create_model_file(using_tensorflow_model, disable_batch_requests_to_neuron, is_inf2=False): triton_model = get_model_license() triton_model += ''' import json @@ -629,15 +645,28 @@ def create_model_file(using_tensorflow_model, disable_batch_requests_to_neuron): else: triton_model += ''' import torch -import torch.neuron ''' + if not is_inf2: + triton_model += ''' +import torch.neuron + ''' + else: + triton_model += ''' +import torch_neuronx +''' triton_model += get_triton_python_model_impl( - using_tensorflow_model, disable_batch_requests_to_neuron) + using_tensorflow_model, disable_batch_requests_to_neuron, is_inf2) return triton_model if __name__ == '__main__': parser = argparse.ArgumentParser() + parser.add_argument('--inf2', + required=False, + default=False, + action='http://github.com/store_true', + help="Specify whether the model should be generate for inf2 or inf1, default is inf1" + ) parser.add_argument('--model_type', type=str, required=True, @@ -799,7 +828,9 @@ def create_model_file(using_tensorflow_model, disable_batch_requests_to_neuron): with open(FLAGS.triton_model_dir + "/config.pbtxt", "w") as config_file: config_file.write(mc) + is_inf2 = FLAGS.inf2 + mf = create_model_file(is_tensorflow_model, - FLAGS.disable_batch_requests_to_neuron) + FLAGS.disable_batch_requests_to_neuron, is_inf2) with open(FLAGS.triton_model_dir + "/1/model.py", "w") as model_file: model_file.write(mf) diff --git a/inferentia/scripts/setup-pre-container.sh b/inferentia/scripts/setup-pre-container.sh old mode 100644 new mode 100755 index 1109edc7..1d3e9a43 --- a/inferentia/scripts/setup-pre-container.sh +++ b/inferentia/scripts/setup-pre-container.sh @@ -24,11 +24,75 @@ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#! /bin/sh + +USAGE=" +usage: setup.sh [options] + +Sets up runtime and tools for execution on Inferentia chips. 
+-h|--help Shows usage +-inf1|--inf1-setup Installs runtime and tools for inf1/neuron, inf1 is default +-inf2|--inf2-setup Installs runtime and tools for inf2/neuronx +-trn1|--trn1-setup Installs runtime, tools for inf2, and installs EFA for trn1 +" + +# Get all options: +OPTS=$(getopt -o hb:v:i:tp --long help,python-backend-path:,python-version:,inferentia-path:,use-tensorflow,use-pytorch,tensorflow-version: -- "$@") + + +export INSTALL_INF2=0 +export INSTALL_INF1=1 +export INSTALL_TRN1=0 + +export CWD=`pwd` + cd /home/ubuntu +for OPTS; do + case "$OPTS" in + -h|--help) + printf "%s\\n" "$USAGE" + return 0 + ;; + -inf1|--inf1-setup) + INSTALL_INF1=1 + echo "Script will install runtime and tools for inf1/neuron" + shift 1 + ;; + -inf2|--inf2-setup) + INSTALL_INF2=1 + shift 1 + echo "Script will install runtime and tools for inf2/neruonx" + ;; + -trn1|--trn1-setup) + INSTALL_TRN1=1 + echo "Script will install runtime and tools for trn1" + shift 1 + ;; + esac +done + +if [ ${INSTALL_INF1} -ne 1 ] && [ ${INSTALL_INF2} -ne 1 ]; then + echo "Error: need to specify either -inf1, -inf2 or -trn1." + printf "source %s\\n" "$USAGE" + return 1 +fi + +if [ ${INSTALL_INF1} -eq 1 ] && [ ${INSTALL_INF2} -eq 1] +then + echo "Error: cannot install both inf1 and inf2 dependencies. Please select either -inf1 or -inf2." + return 1 +fi + +if [ ${INSTALL_INF1} -eq 1 ] && [ ${INSTALL_TRN1} -eq 1 ] +then + echo "Error: cannot install both inf1 and trn1 dependencies. Selecting -trn1 will install inf2 dependencies and EFA." +fi + # First stop and remove old neuron 1.X runtime -sudo systemctl stop neuron-rtd -sudo apt remove aws-neuron-runtime -y +sudo systemctl stop neuron-rtd || true +sudo apt remove aws-neuron-runtime -y || true # Then install new neuron libraries . /etc/os-release @@ -36,9 +100,31 @@ sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null </dev/null | \ - gpg --dearmor - | \ - tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null && \ - . /etc/os-release && \ - echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | \ - tee /etc/apt/sources.list.d/kitware.list >/dev/null && \ - apt-get update && \ - apt-get install -y --no-install-recommends cmake cmake-data -cmake --version - -# First compile correct python stub -cd ${PYTHON_BACKEND_PATH} -rm -rf build && mkdir build && cd build -cmake -DTRITON_ENABLE_GPU=ON -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install .. -make triton-python-backend-stub -j16 # Set Pip repository to point to the Neuron repository # since we need to use pip to update: # https://aws.amazon.com/blogs/developer/neuron-conda-packages-eol/ pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com +pip install --upgrade pip + +if [ ${INSTALL_INF2} -eq 1 ];then + # Install Neuron Runtime + # Then install new neuron libraries + . /etc/os-release + tee /etc/apt/sources.list.d/neuron.list > /dev/null < Date: Tue, 13 Jun 2023 17:36:51 -0700 Subject: [PATCH 115/216] Update neuron readme and setup (#259) * Update neuron setup script * Update readme typos * Set inf2 to 0 when installing inf1 for completeness --- inferentia/README.md | 6 +++++- inferentia/scripts/setup.sh | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/inferentia/README.md b/inferentia/README.md index 132d302b..77a00d2c 100644 --- a/inferentia/README.md +++ b/inferentia/README.md @@ -321,10 +321,12 @@ opt/ └── sg00 ├── config.pbtxt ``` + 3. Add the following imports (e.g., for OPT model). 
The import will differ as per the model you're trying to run. ``` from transformers_neuronx.opt.model import OPTForSampling ``` + 4. Add the following lines in `initialize()` function. Set the `batch_size`, `tp_degree`, `n_positions`, `amp` and `unroll` args as per your requirement. `tp_degree` should typically match the number of neuron cores available on inf2 instance. ``` batch_size = 1 @@ -338,9 +340,11 @@ self.model_neuron.to_neuron() self.model_neuron.num_workers = num_threads ``` You may also chose to add the `batch_size` etc. arguments to config.pbtxt as parameters and read them in the `initialize()` function similar to `--compiled-model`. -5. Finally, in the `excute()` function, use the following API to run the inference: + +5. Finally, in the `execute()` function, use the following API to run the inference: ``` batched_results = self.model_neuron.sample(batched_tensor, 2048) ``` Above, `2048` is a sufficiently-long output token. It may also be passed in as one of the inputs if you wanto specify it as part of the payload. + 6. Proceed to load the model, and submit the inference payload similar to any other triton model. \ No newline at end of file diff --git a/inferentia/scripts/setup.sh b/inferentia/scripts/setup.sh index 4ffb246d..550da0ce 100644 --- a/inferentia/scripts/setup.sh +++ b/inferentia/scripts/setup.sh @@ -91,16 +91,19 @@ for OPTS; do ;; -inf1|--inf1-setup) INSTALL_INF1=1 + INSTALL_INF2=0 echo "Installing framework and tools for inf1." shift 1 ;; -inf2|--inf2-setup) INSTALL_INF2=1 + INSTALL_INF1=0 echo "Installing framework and tools for inf2" shift 1 ;; -trn1|--trn1-setup) INSTALL_INF2=1 # same frameworks are used for inf2 and trn1 + INSTALL_INF1=0 echo "Installing framework and tools for trn1/inf2" shift 1 ;; From 5bd5048de8eee672545535ea075e0468e117c223 Mon Sep 17 00:00:00 2001 From: dyastremsky <58150256+dyastremsky@users.noreply.github.com> Date: Thu, 22 Jun 2023 11:32:28 -0700 Subject: [PATCH 116/216] Auto-format (#261) --- examples/bls_decoupled/async_client.py | 7 ++++--- examples/bls_decoupled/async_model.py | 3 +-- examples/bls_decoupled/sync_model.py | 4 ++-- examples/custom_metrics/client.py | 13 ++++++++++--- examples/custom_metrics/model.py | 2 +- examples/instance_kind/model.py | 4 ++-- inferentia/scripts/gen_triton_model.py | 21 +++++++++++++-------- src/gpu_buffers.cc | 1 + src/infer_payload.h | 1 + src/infer_request.h | 1 + src/infer_response.cc | 1 + src/infer_response.h | 1 + src/ipc_message.h | 1 + src/memory_manager.cc | 1 + src/memory_manager.h | 1 + src/message_queue.h | 1 + src/metric.h | 1 + src/metric_family.h | 1 + src/pb_env.cc | 2 ++ src/pb_error.h | 1 + src/pb_log.h | 4 ++-- src/pb_map.h | 1 + src/pb_metric_reporter.h | 1 + src/pb_response_iterator.cc | 6 ++++-- src/pb_response_iterator.h | 1 + src/pb_stub.cc | 2 ++ src/pb_stub.h | 2 ++ src/pb_stub_utils.cc | 1 + src/pb_stub_utils.h | 1 + src/pb_tensor.h | 1 + src/pb_utils.cc | 2 ++ src/pb_utils.h | 2 ++ src/python_be.cc | 10 ++++------ src/python_be.h | 2 ++ src/request_executor.cc | 1 + src/request_executor.h | 1 + src/response_sender.cc | 2 ++ src/shm_manager.cc | 4 ++-- src/shm_manager.h | 4 +++- src/shm_monitor/shm_monitor.cc | 1 + src/stub_launcher.cc | 1 + src/stub_launcher.h | 2 ++ 42 files changed, 86 insertions(+), 34 deletions(-) diff --git a/examples/bls_decoupled/async_client.py b/examples/bls_decoupled/async_client.py index aede17b4..4cf8364a 100644 --- a/examples/bls_decoupled/async_client.py +++ b/examples/bls_decoupled/async_client.py @@ -53,12 +53,13 @@ # output_data 
contains two times of the square value of the input value. output_data = response.as_numpy("SUM") print("==========model result==========") - print("Two times the square value of {} is {}\n".format(input_data, output_data)) + print("Two times the square value of {} is {}\n".format( + input_data, output_data)) - if not np.allclose((2*input_data*input_data), output_data): + if not np.allclose((2 * input_data * input_data), output_data): print( "BLS Decoupled Async example error: incorrect output value. Expected {}, got {}." - .format((2*input_data*input_data), output_data)) + .format((2 * input_data * input_data), output_data)) sys.exit(1) print('PASS: BLS Decoupled Async') diff --git a/examples/bls_decoupled/async_model.py b/examples/bls_decoupled/async_model.py index 4107f064..3b1d454c 100644 --- a/examples/bls_decoupled/async_model.py +++ b/examples/bls_decoupled/async_model.py @@ -126,8 +126,7 @@ async def execute(self, requests): # Wait for all the inference requests to finish. The execution # of the Python script will be blocked until all the awaitables # are resolved. - async_responses = await asyncio.gather( - *inference_response_awaits) + async_responses = await asyncio.gather(*inference_response_awaits) # The variable that will store the sum of the responses. response_sum = np.array([0]) diff --git a/examples/bls_decoupled/sync_model.py b/examples/bls_decoupled/sync_model.py index b68ffbf8..7ca397fd 100644 --- a/examples/bls_decoupled/sync_model.py +++ b/examples/bls_decoupled/sync_model.py @@ -126,8 +126,8 @@ def execute(self, requests): # Check for the last empty response. if len(infer_response.output_tensors()) > 0: - response_sum += pb_utils.get_output_tensor_by_name( - infer_response, "OUT").as_numpy() + response_sum += pb_utils.get_output_tensor_by_name( + infer_response, "OUT").as_numpy() response = [ pb_utils.InferenceResponse( diff --git a/examples/custom_metrics/client.py b/examples/custom_metrics/client.py index 48b2e610..e74c5bc0 100644 --- a/examples/custom_metrics/client.py +++ b/examples/custom_metrics/client.py @@ -34,12 +34,14 @@ model_name = "custom_metrics" shape = [4] + def get_metrics(): metrics_url = "http://localhost:8002/metrics" r = requests.get(metrics_url) r.raise_for_status() return r.text + with httpclient.InferenceServerClient("localhost:8000") as client: input0_data = np.random.rand(*shape).astype(np.float32) input1_data = np.random.rand(*shape).astype(np.float32) @@ -78,13 +80,18 @@ def get_metrics(): patterns = [ '# HELP requests_process_latency_ns Cumulative time spent processing requests', '# TYPE requests_process_latency_ns counter', - 'requests_process_latency_ns{model="custom_metrics",version="1"}'] + 'requests_process_latency_ns{model="custom_metrics",version="1"}' + ] for pattern in patterns: if pattern not in metrics: - print("custom_metrics example error: missing pattern '{}' in metrics".format(pattern)) + print( + "custom_metrics example error: missing pattern '{}' in metrics". 
+ format(pattern)) sys.exit(1) else: - print("custom_metrics example: found pattern '{}' in metrics".format(pattern)) + print( + "custom_metrics example: found pattern '{}' in metrics".format( + pattern)) print('PASS: custom_metrics') sys.exit(0) diff --git a/examples/custom_metrics/model.py b/examples/custom_metrics/model.py index c5a0a55b..02abcc13 100644 --- a/examples/custom_metrics/model.py +++ b/examples/custom_metrics/model.py @@ -80,7 +80,7 @@ def initialize(self, args): self.metric_family = pb_utils.MetricFamily( name="requests_process_latency_ns", description="Cumulative time spent processing requests", - kind=pb_utils.MetricFamily.COUNTER # or pb_utils.MetricFamily.GAUGE + kind=pb_utils.MetricFamily.COUNTER # or pb_utils.MetricFamily.GAUGE ) # Create a Metric object under the MetricFamily object. The 'labels' diff --git a/examples/instance_kind/model.py b/examples/instance_kind/model.py index 1db6e57b..24f51cfc 100644 --- a/examples/instance_kind/model.py +++ b/examples/instance_kind/model.py @@ -45,9 +45,9 @@ def initialize(self, args): the default device of the framework. """ self.device = 'cuda' if args["model_instance_kind"] == "GPU" else 'cpu' - # This example is configured to work with torch=1.13 + # This example is configured to work with torch=1.13 # and torchvision=0.14. Thus, we need to provide a proper tag `0.14.1` - # to make sure loaded Resnet50 is compatible with + # to make sure loaded Resnet50 is compatible with # installed `torchvision`. # Refer to README for installation instructions. self.model = torch.hub.load("pytorch/vision:v0.14.1", diff --git a/inferentia/scripts/gen_triton_model.py b/inferentia/scripts/gen_triton_model.py index e02f9862..75f0425b 100644 --- a/inferentia/scripts/gen_triton_model.py +++ b/inferentia/scripts/gen_triton_model.py @@ -606,7 +606,8 @@ def finalize(self): def get_triton_python_model_impl(using_tensorflow_model, - disable_batch_requests_to_neuron, is_inf2=False): + disable_batch_requests_to_neuron, + is_inf2=False): triton_pmi = ''' class TritonPythonModel: """Your Python model must use the same class name. Every Python model @@ -627,7 +628,9 @@ class TritonPythonModel: return triton_pmi -def create_model_file(using_tensorflow_model, disable_batch_requests_to_neuron, is_inf2=False): +def create_model_file(using_tensorflow_model, + disable_batch_requests_to_neuron, + is_inf2=False): triton_model = get_model_license() triton_model += ''' import json @@ -661,12 +664,14 @@ def create_model_file(using_tensorflow_model, disable_batch_requests_to_neuron, if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--inf2', - required=False, - default=False, - action='http://github.com/store_true', - help="Specify whether the model should be generate for inf2 or inf1, default is inf1" - ) + parser.add_argument( + '--inf2', + required=False, + default=False, + action='http://github.com/store_true', + help= + "Specify whether the model should be generate for inf2 or inf1, default is inf1" + ) parser.add_argument('--model_type', type=str, required=True, diff --git a/src/gpu_buffers.cc b/src/gpu_buffers.cc index 6b370ea1..4b1b0f9f 100644 --- a/src/gpu_buffers.cc +++ b/src/gpu_buffers.cc @@ -25,6 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
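As a rough consolidation of the inf2 path wired up above (the `--inf2` flag and the `torch_neuronx` import branch in `gen_triton_model.py`, plus the `OPTForSampling` steps added to the README earlier in this series), the sketch below shows one way those pieces could fit inside a hand-written `model.py`. It is illustrative only: the tensor names `INPUT_IDS`/`OUTPUT_IDS`, the `from_pretrained` keyword arguments, and the use of `args["model_repository"]` as the compiled-model location are assumptions to adapt, not part of this patch; check the transformers-neuronx documentation for the exact constructor signature.

```python
# Illustrative sketch only: consolidates the README's inf2/OPT steps (3-5) into
# a single model.py shape. Tensor names and from_pretrained arguments are
# assumptions, not the generated file.
import numpy as np
import torch
import triton_python_backend_utils as pb_utils
from transformers_neuronx.opt.model import OPTForSampling


class TritonPythonModel:
    def initialize(self, args):
        # Mirrors step 4 of the walkthrough; tune these for your instance.
        # tp_degree usually matches the number of available neuron cores.
        batch_size, tp_degree, n_positions, amp, unroll = 1, 2, 2048, "f16", None
        # Assumed constructor arguments; model location is taken from the
        # model repository here purely for illustration.
        self.model_neuron = OPTForSampling.from_pretrained(
            args["model_repository"],
            batch_size=batch_size,
            tp_degree=tp_degree,
            n_positions=n_positions,
            amp=amp,
            unroll=unroll,
        )
        self.model_neuron.to_neuron()

    def execute(self, requests):
        responses = []
        for request in requests:
            # "INPUT_IDS"/"OUTPUT_IDS" are placeholder tensor names.
            input_ids = pb_utils.get_input_tensor_by_name(request, "INPUT_IDS")
            batched_tensor = torch.as_tensor(input_ids.as_numpy())
            # Step 5 of the walkthrough: 2048 is the output sequence length.
            batched_results = self.model_neuron.sample(batched_tensor, 2048)
            # Conversion assumes sample() returns a CPU torch tensor of token ids.
            out = pb_utils.Tensor("OUTPUT_IDS", batched_results.numpy())
            responses.append(pb_utils.InferenceResponse(output_tensors=[out]))
        return responses
```

Step 6 of the walkthrough then applies unchanged: place the file under `1/model.py` in the repository layout shown earlier and query it like any other Triton model.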
#include "gpu_buffers.h" + #include "pb_string.h" namespace triton { namespace backend { namespace python { diff --git a/src/infer_payload.h b/src/infer_payload.h index 2002d0be..662e8922 100644 --- a/src/infer_payload.h +++ b/src/infer_payload.h @@ -28,6 +28,7 @@ #include #include + #include "infer_response.h" #include "pb_preferred_memory.h" diff --git a/src/infer_request.h b/src/infer_request.h index 96b65dc0..7eb2fd88 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -28,6 +28,7 @@ #include #include + #include "infer_response.h" #include "pb_preferred_memory.h" #include "pb_tensor.h" diff --git a/src/infer_response.cc b/src/infer_response.cc index 668a03d1..afadc324 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -31,6 +31,7 @@ namespace py = pybind11; #endif #include + #include "scoped_defer.h" diff --git a/src/infer_response.h b/src/infer_response.h index 330354a1..bdf31bb4 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -27,6 +27,7 @@ #pragma once #include + #include "gpu_buffers.h" #include "pb_error.h" #include "pb_tensor.h" diff --git a/src/ipc_message.h b/src/ipc_message.h index 0d843d47..04268d93 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -28,6 +28,7 @@ #include #include + #include "shm_manager.h" diff --git a/src/memory_manager.cc b/src/memory_manager.cc index 54bdfe39..23ac99be 100644 --- a/src/memory_manager.cc +++ b/src/memory_manager.cc @@ -25,6 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "memory_manager.h" + #include "pb_utils.h" diff --git a/src/memory_manager.h b/src/memory_manager.h index 7930d0e8..3ea6cc12 100644 --- a/src/memory_manager.h +++ b/src/memory_manager.h @@ -30,6 +30,7 @@ #include #include #include + #include "message_queue.h" #include "triton/backend/backend_common.h" #include "triton/core/tritonserver.h" diff --git a/src/message_queue.h b/src/message_queue.h index bb87e04a..045c56e9 100644 --- a/src/message_queue.h +++ b/src/message_queue.h @@ -31,6 +31,7 @@ #include #include #include + #include "shm_manager.h" namespace triton { namespace backend { namespace python { diff --git a/src/metric.h b/src/metric.h index 882b0d5c..197e8ce9 100644 --- a/src/metric.h +++ b/src/metric.h @@ -27,6 +27,7 @@ #pragma once #include + #include "ipc_message.h" #include "pb_string.h" #include "pb_utils.h" diff --git a/src/metric_family.h b/src/metric_family.h index bab71076..54574892 100644 --- a/src/metric_family.h +++ b/src/metric_family.h @@ -27,6 +27,7 @@ #pragma once #include + #include "ipc_message.h" #include "metric.h" #include "pb_string.h" diff --git a/src/pb_env.cc b/src/pb_env.cc index 2065e6db..a4278102 100644 --- a/src/pb_env.cc +++ b/src/pb_env.cc @@ -29,9 +29,11 @@ #include #include #include + #include #include #include + #include "pb_utils.h" diff --git a/src/pb_error.h b/src/pb_error.h index d4461082..b80546b2 100644 --- a/src/pb_error.h +++ b/src/pb_error.h @@ -27,6 +27,7 @@ #pragma once #include + #include "pb_string.h" #include "pb_utils.h" diff --git a/src/pb_log.h b/src/pb_log.h index 62c24aa6..65d41009 100644 --- a/src/pb_log.h +++ b/src/pb_log.h @@ -27,13 +27,13 @@ #pragma once #include + #include "pb_string.h" #include "pb_utils.h" namespace triton { namespace backend { namespace python { class PbLog { public: - /// Create a PbLog instance PbLog( const std::string& filename, uint32_t line, const std::string& message, @@ -65,7 +65,7 @@ class PbLogShm { std::unique_ptr& shm_pool, const std::string& filename, const uint32_t& line, const 
std::string& message, const LogLevel& level); - + /// Load PbLog object to shared memory static std::unique_ptr LoadFromSharedMemory( std::unique_ptr& shm_pool, diff --git a/src/pb_map.h b/src/pb_map.h index c4827b7c..a231b719 100644 --- a/src/pb_map.h +++ b/src/pb_map.h @@ -27,6 +27,7 @@ #pragma once #include + #include "pb_string.h" #include "shm_manager.h" diff --git a/src/pb_metric_reporter.h b/src/pb_metric_reporter.h index 88062f86..89c81b38 100644 --- a/src/pb_metric_reporter.h +++ b/src/pb_metric_reporter.h @@ -29,6 +29,7 @@ #include #include #include + #include "triton/core/tritonbackend.h" namespace triton { namespace backend { namespace python { diff --git a/src/pb_response_iterator.cc b/src/pb_response_iterator.cc index 27a6c64b..26817c40 100644 --- a/src/pb_response_iterator.cc +++ b/src/pb_response_iterator.cc @@ -25,10 +25,12 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "pb_response_iterator.h" -#include -#include "pb_stub.h" #include + +#include + +#include "pb_stub.h" namespace py = pybind11; namespace triton { namespace backend { namespace python { diff --git a/src/pb_response_iterator.h b/src/pb_response_iterator.h index 98351369..1122a216 100644 --- a/src/pb_response_iterator.h +++ b/src/pb_response_iterator.h @@ -27,6 +27,7 @@ #pragma once #include + #include "infer_response.h" namespace triton { namespace backend { namespace python { diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 22ecd7e9..3353a5e5 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -29,6 +29,7 @@ #include #include #include + #include #include #include @@ -41,6 +42,7 @@ #include #include #include + #include "pb_error.h" #include "pb_map.h" #include "pb_preferred_memory.h" diff --git a/src/pb_stub.h b/src/pb_stub.h index c9188631..f5af89c9 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -29,6 +29,7 @@ #include #include #include + #include #include #include @@ -41,6 +42,7 @@ #include #include #include + #include "infer_request.h" #include "infer_response.h" #include "ipc_message.h" diff --git a/src/pb_stub_utils.cc b/src/pb_stub_utils.cc index 70557f64..c9ffd661 100644 --- a/src/pb_stub_utils.cc +++ b/src/pb_stub_utils.cc @@ -25,6 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "pb_stub_utils.h" + #include "pb_utils.h" namespace triton { namespace backend { namespace python { diff --git a/src/pb_stub_utils.h b/src/pb_stub_utils.h index 598bf436..6068fba9 100644 --- a/src/pb_stub_utils.h +++ b/src/pb_stub_utils.h @@ -28,6 +28,7 @@ #include #include #include + #include "triton/core/tritonserver.h" namespace py = pybind11; diff --git a/src/pb_tensor.h b/src/pb_tensor.h index 79adf500..b9c0d593 100644 --- a/src/pb_tensor.h +++ b/src/pb_tensor.h @@ -41,6 +41,7 @@ namespace py = pybind11; #include #include + #include "pb_memory.h" #include "pb_string.h" #include "pb_utils.h" diff --git a/src/pb_utils.cc b/src/pb_utils.cc index 3c607dea..c6897631 100644 --- a/src/pb_utils.cc +++ b/src/pb_utils.cc @@ -38,12 +38,14 @@ #include #include #include + #include #include #include #include #include #include + #include "scoped_defer.h" #ifdef TRITON_ENABLE_GPU diff --git a/src/pb_utils.h b/src/pb_utils.h index c05e8411..a46aa8fa 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -30,6 +30,7 @@ #include #endif // TRITON_ENABLE_GPU #include + #include #include #include @@ -38,6 +39,7 @@ #include #include #include + #include "pb_exception.h" #include "shm_manager.h" #include "triton/backend/backend_common.h" diff --git a/src/python_be.cc b/src/python_be.cc index 87375348..08110d0a 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -660,8 +660,7 @@ ModelInstanceState::ExecuteBLSRequest( lbackend_memory.reset(backend_memory); input_tensor->SetMemory(std::move(PbMemory::Create( Stub()->ShmPool(), std::move(lbackend_memory)))); - gpu_buffer_helper.AddBuffer( - input_tensor->Memory()->ShmHandle()); + gpu_buffer_helper.AddBuffer(input_tensor->Memory()->ShmHandle()); #endif // TRITON_ENABLE_GPU } } @@ -1080,8 +1079,7 @@ ModelInstanceState::ResponseSendDecoupled( if (requires_deferred_callback) { gpu_buffer_helper.Complete(Stub()->ShmPool()); - send_message_payload->gpu_buffers_handle = - gpu_buffer_helper.ShmHandle(); + send_message_payload->gpu_buffers_handle = gpu_buffer_helper.ShmHandle(); // Additional round trip so that the stub can fill the GPU output buffers. 
{ @@ -1125,8 +1123,8 @@ ModelInstanceState::ResponseSendDecoupled( if (send_message_payload->flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { std::unique_ptr< TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> - response_factory(reinterpret_cast( - send_message_payload->response_factory_address)); + response_factory(reinterpret_cast( + send_message_payload->response_factory_address)); } } } diff --git a/src/python_be.h b/src/python_be.h index ebcedba3..b1a44b23 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -32,6 +32,7 @@ #include #include #include + #include #include #include @@ -57,6 +58,7 @@ #include #include #include + #include "infer_request.h" #include "infer_response.h" #include "ipc_message.h" diff --git a/src/request_executor.cc b/src/request_executor.cc index 025c0deb..00a9b201 100644 --- a/src/request_executor.cc +++ b/src/request_executor.cc @@ -27,6 +27,7 @@ #include "request_executor.h" #include + #include "pb_utils.h" #include "scoped_defer.h" #include "triton/backend/backend_common.h" diff --git a/src/request_executor.h b/src/request_executor.h index 56ed5ca5..1c5eb1fa 100644 --- a/src/request_executor.h +++ b/src/request_executor.h @@ -27,6 +27,7 @@ #pragma once #include + #include "infer_payload.h" #include "infer_request.h" #include "infer_response.h" diff --git a/src/response_sender.cc b/src/response_sender.cc index 31e1be5b..a74459f6 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -25,8 +25,10 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "response_sender.h" + #include #include + #include "pb_stub.h" #include "pb_stub_utils.h" #include "scoped_defer.h" diff --git a/src/shm_manager.cc b/src/shm_manager.cc index 555bd023..b5499f88 100644 --- a/src/shm_manager.cc +++ b/src/shm_manager.cc @@ -24,13 +24,13 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#include "shm_manager.h" + #include #include #include #include -#include "shm_manager.h" - namespace triton { namespace backend { namespace python { SharedMemoryManager::SharedMemoryManager( diff --git a/src/shm_manager.h b/src/shm_manager.h index 108a3a44..ef2e5cb1 100644 --- a/src/shm_manager.h +++ b/src/shm_manager.h @@ -27,15 +27,17 @@ #pragma once #include + #include #include #include +#include #include #include #include #include #include -#include + #include "pb_exception.h" namespace triton { namespace backend { namespace python { diff --git a/src/shm_monitor/shm_monitor.cc b/src/shm_monitor/shm_monitor.cc index dfeb1fbb..e0c08d3c 100644 --- a/src/shm_monitor/shm_monitor.cc +++ b/src/shm_monitor/shm_monitor.cc @@ -25,6 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include + #include "../shm_manager.h" namespace triton { namespace backend { namespace python { diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index b5fe0aef..79863fb6 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -25,6 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "stub_launcher.h" + #include "python_be.h" namespace triton { namespace backend { namespace python { diff --git a/src/stub_launcher.h b/src/stub_launcher.h index c3da400b..fc5b6578 100644 --- a/src/stub_launcher.h +++ b/src/stub_launcher.h @@ -27,6 +27,7 @@ #pragma once #include + #include #include #include @@ -40,6 +41,7 @@ #include #include #include + #include "ipc_message.h" #include "memory_manager.h" #include "message_queue.h" From 7865b03a80adebbb58cfc733913353b85c31598d Mon Sep 17 00:00:00 2001 From: Neelay Shah Date: Mon, 26 Jun 2023 08:54:12 -0700 Subject: [PATCH 117/216] Pin pybind version to v2.10.0 commit to avoid unanticipated changes --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3659c7bd..e2c5ab8e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,8 @@ FetchContent_MakeAvailable(repo-common repo-core repo-backend) FetchContent_Declare( pybind11 GIT_REPOSITORY "https://github.com/pybind/pybind11" - GIT_TAG "v2.10" + # COMMIT ID for v2.10.0 + GIT_TAG "aa304c9c7d725ffb9d10af08a3b34cb372307020" GIT_SHALLOW ON ) FetchContent_MakeAvailable(pybind11) From 902df12c42d9958eed0027a725e7cd9c3e777983 Mon Sep 17 00:00:00 2001 From: dyastremsky <58150256+dyastremsky@users.noreply.github.com> Date: Mon, 26 Jun 2023 20:57:37 -0700 Subject: [PATCH 118/216] Add GitHub action to format and lint code (#265) * Set up pre-commit * Apply pre-commit * Make max-line-length 88 * Remove unnecessary line * Run pre-install with updated config * End of file newline * Fix comment * Remove unused variable * Add and apply isort * Newline at eof * Remove duplicate copyrights, add hooks link * Pin workflow Ubuntu version * Remove unnecessary imports * Remove unused import, update copyrights --- .github/workflows/codeql.yml | 6 +- .github/workflows/pre-commit.yml | 40 ++ .pre-commit-config.yaml | 73 +++ CMakeLists.txt | 6 +- README.md | 36 +- cmake/TritonPythonBackendConfig.cmake.in | 2 +- examples/add_sub/client.py | 35 +- examples/add_sub/model.py | 35 +- examples/auto_complete/batch_model.py | 73 ++- examples/auto_complete/client.py | 50 +-- examples/auto_complete/nobatch_model.py | 73 ++- examples/bls/async_client.py | 36 +- examples/bls/async_model.py | 33 +- examples/bls/sync_client.py | 72 +-- examples/bls/sync_model.py | 23 +- examples/bls_decoupled/README.md | 4 +- examples/bls_decoupled/async_client.py | 30 +- examples/bls_decoupled/async_model.py | 36 +- examples/bls_decoupled/sync_client.py | 19 +- examples/bls_decoupled/sync_model.py | 31 +- examples/custom_metrics/README.md | 2 +- examples/custom_metrics/client.py | 85 ++-- examples/custom_metrics/config.pbtxt | 4 +- examples/custom_metrics/model.py | 49 +- examples/decoupled/README.md | 2 +- examples/decoupled/repeat_client.py | 48 +- examples/decoupled/repeat_model.py | 61 +-- examples/decoupled/square_client.py | 43 +- examples/decoupled/square_model.py | 72 +-- examples/instance_kind/README.md | 44 +- examples/instance_kind/client.py | 89 ++-- examples/instance_kind/config.pbtxt | 0 examples/instance_kind/model.py | 32 +- examples/instance_kind/resnet50_labels.txt | 2 +- examples/jax/client.py | 37 +- examples/jax/model.py | 34 +- examples/preprocessing/README.md | 10 +- examples/preprocessing/client.py | 71 +-- examples/preprocessing/model.py | 52 +-- .../model_repository/preprocess/config.pbtxt | 4 +- .../resnet50_trt/config.pbtxt | 2 +- .../model_repository/resnet50_trt/labels.txt | 2 +- examples/preprocessing/onnx_exporter.py | 37 +- 
examples/pytorch/client.py | 36 +- examples/pytorch/model.py | 31 +- inferentia/README.md | 32 +- .../qa/setup_test_enviroment_and_test.sh | 6 +- inferentia/scripts/gen_triton_model.py | 419 ++++++++++-------- inferentia/scripts/setup-pre-container.sh | 16 +- inferentia/scripts/setup.sh | 8 +- pyproject.toml | 47 ++ src/message_queue.h | 2 +- src/pb_env.h | 2 +- src/pb_memory.h | 2 +- src/pb_response_iterator.cc | 2 +- src/pb_stub.cc | 2 +- src/pb_tensor.cc | 2 +- src/pb_utils.cc | 2 +- src/python_be.cc | 8 +- src/request_executor.cc | 4 +- src/resources/triton_python_backend_utils.py | 271 ++++++----- src/shm_manager.h | 8 +- src/stub_launcher.cc | 2 +- 63 files changed, 1358 insertions(+), 1039 deletions(-) create mode 100644 .github/workflows/pre-commit.yml create mode 100644 .pre-commit-config.yaml mode change 100755 => 100644 examples/instance_kind/config.pbtxt mode change 100755 => 100644 examples/instance_kind/resnet50_labels.txt mode change 100644 => 100755 inferentia/qa/setup_test_enviroment_and_test.sh mode change 100644 => 100755 inferentia/scripts/setup.sh create mode 100644 pyproject.toml diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index a724718d..4f3f98cc 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -63,12 +63,12 @@ jobs: # If you wish to specify custom queries, you can do so here or in a config file. # By default, queries listed here will override any specified in a config file. # Prefix the list here with "+" to use these queries and those in the config file. - + # Details on CodeQL's query packs refer to: # https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs queries: +security-and-quality - + # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild @@ -77,7 +77,7 @@ jobs: # Command-line programs to run using the OS shell. # See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun - # If the Autobuild fails above, remove it and uncomment the following three lines. + # If the Autobuild fails above, remove it and uncomment the following three lines. # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. # - run: | diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 00000000..40cbd972 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,40 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + - uses: pre-commit/action@v3.0.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..9c0fff8a --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,73 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +repos: +- repo: https://github.com/timothycrosley/isort + rev: 5.12.0 + hooks: + - id: isort + additional_dependencies: [toml] +- repo: https://github.com/psf/black + rev: 23.1.0 + hooks: + - id: black + types_or: [python, cython] +- repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 + hooks: + - id: flake8 + args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501] + types_or: [python, cython] +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v16.0.5 + hooks: + - id: clang-format + types_or: [c, c++, cuda, proto, textproto, java] + args: ["-fallback-style=none", "-style=file", "-i"] +- repo: https://github.com/codespell-project/codespell + rev: v2.2.4 + hooks: + - id: codespell + additional_dependencies: [tomli] + args: ["--toml", "pyproject.toml"] + exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$) +# More details about these pre-commit hooks here: +# https://pre-commit.com/hooks.html +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: check-case-conflict + - id: check-executables-have-shebangs + - id: check-merge-conflict + - id: check-json + - id: check-toml + - id: check-yaml + - id: check-shebang-scripts-are-executable + - id: end-of-file-fixer + types_or: [c, c++, cuda, proto, textproto, java, python] + - id: mixed-line-ending + - id: requirements-txt-fixer + - id: trailing-whitespace diff --git a/CMakeLists.txt b/CMakeLists.txt index e2c5ab8e..a9f070d2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,7 +49,7 @@ endif() # # Dependencies # -# FetchContent's composibility isn't very good. We must include the +# FetchContent's composability isn't very good. We must include the # transitive closure of all repos so that we can override the tag. # include(FetchContent) @@ -249,7 +249,7 @@ target_link_libraries( Threads::Threads triton-backend-utils # from repo-backend ${CMAKE_DL_LIBS} # dlopen and dlclose - -lrt # shared memory + -lrt # shared memory triton-core-serverstub # from repo-core ZLIB::ZLIB -larchive @@ -263,7 +263,7 @@ target_link_libraries( triton-backend-utils # from repo-backend ${CMAKE_DL_LIBS} # dlopen and dlclose pybind11::embed - -lrt # shared memory + -lrt # shared memory -larchive # libarchive ) diff --git a/README.md b/README.md index 410643d7..de29a257 100644 --- a/README.md +++ b/README.md @@ -537,7 +537,7 @@ The decoupled mode is powerful and supports various other use cases: The [decoupled examples](examples/decoupled/README.md) demonstrate -full power of what can be acheived from decoupled API. Read +full power of what can be achieved from decoupled API. Read [Decoupled Backends and Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md) for more details on how to host a decoupled model. @@ -586,7 +586,7 @@ documentation. ## Managing Python Runtime and Libraries Python backend shipped in the [NVIDIA GPU Cloud](https://ngc.nvidia.com/) -containers uses Python 3.10. Python backend is able to use the libaries +containers uses Python 3.10. Python backend is able to use the libraries that exist in the current Python environment. These libraries can be installed in a virtualenv, conda environment, or the global system Python. These libraries will only be used if the Python version matches @@ -594,7 +594,7 @@ the Python version of the Python backend's stub executable. 
For example, if you install a set of libraries in a Python 3.9 environment and your Python backend stub is compiled with Python 3.10 these libraries will NOT be available in your Python model served using Triton. You would need to -compile the stub executble with Python 3.9 using the instructions in +compile the stub executable with Python 3.9 using the instructions in [Building Custom Python Backend Stub](#building-custom-python-backend-stub) section. @@ -849,7 +849,7 @@ will create additional threads instead of spawning separate processes. ## Running Multiple Instances of Triton Server -Python backend uses shared memory to transfer requests to the stub process. +Python backend uses shared memory to transfer requests to the stub process. When running multiple instances of Triton Server on the same machine that use Python models, there would be shared memory region name conflicts that can result in segmentation faults or hangs. In order to avoid this issue, you need @@ -1233,9 +1233,9 @@ class TritonPythonModel: input0 = pb_utils.Tensor.from_dlpack("INPUT0", to_dlpack(pytorch_tensor)) ``` Python backend allows tensors implementing -[`__dlpack__`](https://data-apis.org/array-api/2022.12/API_specification/generated/array_api.array.__dlpack__.html) -and [`__dlpack_device__`](https://data-apis.org/array-api/2022.12/API_specification/generated/array_api.array.__dlpack_device__.html) -[interface](https://dmlc.github.io/dlpack/latest/python_spec.html) +[`__dlpack__`](https://data-apis.org/array-api/2022.12/API_specification/generated/array_api.array.__dlpack__.html) +and [`__dlpack_device__`](https://data-apis.org/array-api/2022.12/API_specification/generated/array_api.array.__dlpack_device__.html) +[interface](https://dmlc.github.io/dlpack/latest/python_spec.html) to be converted to Python backend tensors. For instance: ```python @@ -1275,8 +1275,8 @@ this workflow. > **Note** > > Using a deep learning framework/package in a Python Backend model is -> not necessarily the same as using the corresponding Triton Backend -> implementation. For example, the +> not necessarily the same as using the corresponding Triton Backend +> implementation. For example, the > [PyTorch Backend](https://github.com/triton-inference-server/pytorch_backend) > is different from using a Python Backend model that uses `import torch`. > If you are seeing significantly different results from a model executed by @@ -1289,7 +1289,7 @@ this workflow. For a simple example of using PyTorch in a Python Backend model, see the [AddSubNet PyTorch example](#addsubnet-in-pytorch). -### PyTorch Determinism +### PyTorch Determinism When running PyTorch code, you may notice slight differences in output values across runs or across servers depending on hardware, system load, driver, or even @@ -1297,23 +1297,23 @@ batch size. These differences are generally related to the selection of CUDA kernels used to execute the operations, based on the factors mentioned. For most intents and purposes, these differences aren't large enough to affect -a model's final prediction. However, to understand where these differences come +a model's final prediction. However, to understand where these differences come from, see this [doc](https://pytorch.org/docs/stable/notes/randomness.html). On Ampere devices and later, there is an optimization related to -FP32 operations called +FP32 operations called [TensorFloat32 (TF32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/). 
Typically this optimization will improve overall performance at the cost of minor precision loss, but similarly this precision loss is acceptable for most model predictions. For more info on TF32 in PyTorch and how to enable/disable -it as needed, see +it as needed, see [here](https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices). ## TensorFlow ### TensorFlow Determinism -Similar to the PyTorch determinism section above, TensorFlow can have slight +Similar to the PyTorch determinism section above, TensorFlow can have slight differences in outputs based on various factors like hardware, system configurations, or batch sizes due to the library's internal CUDA kernel selection process. For more information on improving the determinism of outputs @@ -1429,18 +1429,18 @@ You can find the complete example instructions in ## Model Instance Kind -Triton model configuration allows users to provide kind to [instance group +Triton model configuration allows users to provide kind to [instance group settings.](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) -A python backend model can be written to respect the kind setting to control +A python backend model can be written to respect the kind setting to control the execution of a model instance either on CPU or GPU. -In the [model instance kind example](examples/instance_kind/README.md) +In the [model instance kind example](examples/instance_kind/README.md) we demonstrate how this can be achieved for your python model. ## Auto-complete config The auto-complete config example demonstrates how to use the -`auto_complete_config` function to define +`auto_complete_config` function to define [minimal model configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#minimal-model-configuration) when a configuration file is not available. You can find the complete example instructions in [examples/auto_complete](examples/auto_complete/README.md). diff --git a/cmake/TritonPythonBackendConfig.cmake.in b/cmake/TritonPythonBackendConfig.cmake.in index 2ab3af1a..4869a6df 100644 --- a/cmake/TritonPythonBackendConfig.cmake.in +++ b/cmake/TritonPythonBackendConfig.cmake.in @@ -1,4 +1,4 @@ - + # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/examples/add_sub/client.py b/examples/add_sub/client.py index 614a88af..1c08ae74 100644 --- a/examples/add_sub/client.py +++ b/examples/add_sub/client.py @@ -24,11 +24,11 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
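Picking up the TF32 determinism note from the README hunk above: when run-to-run stable FP32 results matter more than Ampere+ throughput, the standard `torch.backends` switches below turn the optimization off. A minimal sketch, assuming nothing beyond PyTorch with CUDA available:

```python
# Minimal sketch: disable TF32 for FP32 matmuls and cuDNN convolutions when
# reproducibility matters more than the extra Ampere+ throughput.
import torch

torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
```

Because each Python-backend model instance runs in its own stub process, setting these flags in `initialize()` confines the choice to that model rather than the whole server.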
-from tritonclient.utils import * -import tritonclient.http as httpclient import sys import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import * model_name = "add_sub" shape = [4] @@ -37,10 +37,12 @@ input0_data = np.random.rand(*shape).astype(np.float32) input1_data = np.random.rand(*shape).astype(np.float32) inputs = [ - httpclient.InferInput("INPUT0", input0_data.shape, - np_to_triton_dtype(input0_data.dtype)), - httpclient.InferInput("INPUT1", input1_data.shape, - np_to_triton_dtype(input1_data.dtype)), + httpclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + httpclient.InferInput( + "INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype) + ), ] inputs[0].set_data_from_numpy(input0_data) @@ -51,19 +53,22 @@ httpclient.InferRequestedOutput("OUTPUT1"), ] - response = client.infer(model_name, - inputs, - request_id=str(1), - outputs=outputs) + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) result = response.get_response() output0_data = response.as_numpy("OUTPUT0") output1_data = response.as_numpy("OUTPUT1") - print("INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( - input0_data, input1_data, output0_data)) - print("INPUT0 ({}) - INPUT1 ({}) = OUTPUT0 ({})".format( - input0_data, input1_data, output1_data)) + print( + "INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output0_data + ) + ) + print( + "INPUT0 ({}) - INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output1_data + ) + ) if not np.allclose(input0_data + input1_data, output0_data): print("add_sub example error: incorrect sum") @@ -73,5 +78,5 @@ print("add_sub example error: incorrect difference") sys.exit(1) - print('PASS: add_sub') + print("PASS: add_sub") sys.exit(0) diff --git a/examples/add_sub/model.py b/examples/add_sub/model.py index 9ada6b10..f416e79d 100644 --- a/examples/add_sub/model.py +++ b/examples/add_sub/model.py @@ -41,7 +41,7 @@ class TritonPythonModel: def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. Parameters ---------- @@ -56,21 +56,21 @@ def initialize(self, args): """ # You must parse model_config. JSON string is not parsed here - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) # Get OUTPUT0 configuration - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT0") + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") # Get OUTPUT1 configuration - output1_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT1") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") # Convert Triton types to numpy types self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) + output0_config["data_type"] + ) self.output1_dtype = pb_utils.triton_string_to_numpy( - output1_config['data_type']) + output1_config["data_type"] + ) def execute(self, requests): """`execute` MUST be implemented in every Python model. 
`execute` @@ -107,15 +107,15 @@ def execute(self, requests): # Get INPUT1 in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") - out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(), - in_0.as_numpy() - in_1.as_numpy()) + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. - out_tensor_0 = pb_utils.Tensor("OUTPUT0", - out_0.astype(output0_dtype)) - out_tensor_1 = pb_utils.Tensor("OUTPUT1", - out_1.astype(output1_dtype)) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. @@ -123,9 +123,10 @@ def execute(self, requests): # response: # # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occured")) + # output_tensors=..., TritonError("An error occurred")) inference_response = pb_utils.InferenceResponse( - output_tensors=[out_tensor_0, out_tensor_1]) + output_tensors=[out_tensor_0, out_tensor_1] + ) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. Length @@ -137,4 +138,4 @@ def finalize(self): Implementing `finalize` function is OPTIONAL. This function allows the model to perform any necessary clean ups before exit. """ - print('Cleaning up...') + print("Cleaning up...") diff --git a/examples/auto_complete/batch_model.py b/examples/auto_complete/batch_model.py index 336b667e..98fa06f5 100644 --- a/examples/auto_complete/batch_model.py +++ b/examples/auto_complete/batch_model.py @@ -69,24 +69,14 @@ def auto_complete_config(auto_complete_model_config): pb_utils.ModelConfig An object containing the auto-completed model configuration """ - inputs = [{ - 'name': 'INPUT0', - 'data_type': 'TYPE_FP32', - 'dims': [4] - }, { - 'name': 'INPUT1', - 'data_type': 'TYPE_FP32', - 'dims': [4] - }] - outputs = [{ - 'name': 'OUTPUT0', - 'data_type': 'TYPE_FP32', - 'dims': [4] - }, { - 'name': 'OUTPUT1', - 'data_type': 'TYPE_FP32', - 'dims': [4] - }] + inputs = [ + {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]}, + {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]}, + ] + outputs = [ + {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]}, + {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]}, + ] # Demonstrate the usage of `as_dict`, `add_input`, `add_output`, # and `set_max_batch_size` functions. @@ -94,24 +84,24 @@ def auto_complete_config(auto_complete_model_config): config = auto_complete_model_config.as_dict() input_names = [] output_names = [] - for input in config['input']: - input_names.append(input['name']) - for output in config['output']: - output_names.append(output['name']) + for input in config["input"]: + input_names.append(input["name"]) + for output in config["output"]: + output_names.append(output["name"]) for input in inputs: # The name checking here is only for demonstrating the usage of # `as_dict` function. `add_input` will check for conflicts and # raise errors if an input with the same name already exists in # the configuration but has different data_type or dims property. - if input['name'] not in input_names: + if input["name"] not in input_names: auto_complete_model_config.add_input(input) for output in outputs: # The name checking here is only for demonstrating the usage of # `as_dict` function. 
`add_output` will check for conflicts and # raise errors if an output with the same name already exists in # the configuration but has different data_type or dims property. - if output['name'] not in output_names: + if output["name"] not in output_names: auto_complete_model_config.add_output(output) auto_complete_model_config.set_max_batch_size(4) @@ -122,7 +112,7 @@ def auto_complete_config(auto_complete_model_config): def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. Parameters ---------- @@ -137,21 +127,21 @@ def initialize(self, args): """ # You must parse model_config. JSON string is not parsed here - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) # Get OUTPUT0 configuration - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT0") + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") # Get OUTPUT1 configuration - output1_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT1") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") # Convert Triton types to numpy types self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) + output0_config["data_type"] + ) self.output1_dtype = pb_utils.triton_string_to_numpy( - output1_config['data_type']) + output1_config["data_type"] + ) def execute(self, requests): """`execute` MUST be implemented in every Python model. `execute` @@ -188,15 +178,15 @@ def execute(self, requests): # Get INPUT1 in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") - out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(), - in_0.as_numpy() - in_1.as_numpy()) + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. - out_tensor_0 = pb_utils.Tensor("OUTPUT0", - out_0.astype(output0_dtype)) - out_tensor_1 = pb_utils.Tensor("OUTPUT1", - out_1.astype(output1_dtype)) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. @@ -204,9 +194,10 @@ def execute(self, requests): # response: # # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occured")) + # output_tensors=..., TritonError("An error occurred")) inference_response = pb_utils.InferenceResponse( - output_tensors=[out_tensor_0, out_tensor_1]) + output_tensors=[out_tensor_0, out_tensor_1] + ) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. Length @@ -218,4 +209,4 @@ def finalize(self): Implementing `finalize` function is OPTIONAL. This function allows the model to perform any necessary clean ups before exit. 
""" - print('Cleaning up...') + print("Cleaning up...") diff --git a/examples/auto_complete/client.py b/examples/auto_complete/client.py index d2ef893b..24fc1fac 100644 --- a/examples/auto_complete/client.py +++ b/examples/auto_complete/client.py @@ -24,11 +24,11 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from tritonclient.utils import * -import tritonclient.http as httpclient import sys import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import * nobatch_model_name = "nobatch_auto_complete" batch_model_name = "batch_auto_complete" @@ -46,46 +46,38 @@ def validate_ios(config, expected_ios, model_name): sys.exit(1) -if __name__ == '__main__': +if __name__ == "__main__": with httpclient.InferenceServerClient("localhost:8000") as client: expected_max_batch_size = { "nobatch_auto_complete": 0, - "batch_auto_complete": 4 + "batch_auto_complete": 4, } - expected_inputs = [{ - 'name': 'INPUT0', - 'data_type': 'TYPE_FP32', - 'dims': [4] - }, { - 'name': 'INPUT1', - 'data_type': 'TYPE_FP32', - 'dims': [4] - }] - expected_outputs = [{ - 'name': 'OUTPUT0', - 'data_type': 'TYPE_FP32', - 'dims': [4] - }, { - 'name': 'OUTPUT1', - 'data_type': 'TYPE_FP32', - 'dims': [4] - }] + expected_inputs = [ + {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]}, + {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]}, + ] + expected_outputs = [ + {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]}, + {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]}, + ] models = [nobatch_model_name, batch_model_name] for model_name in models: # Validate the auto-complete model configuration model_config = client.get_model_config(model_name) - if model_config["max_batch_size"] != expected_max_batch_size[ - model_name]: - print("model '" + model_name + - "' has unexpected max_batch_size") + if model_config["max_batch_size"] != expected_max_batch_size[model_name]: + print("model '" + model_name + "' has unexpected max_batch_size") sys.exit(1) validate_ios(model_config["input"], expected_inputs, model_name) validate_ios(model_config["output"], expected_outputs, model_name) - print("'" + model_name + "' configuration matches the expected " + - "auto complete configuration\n") + print( + "'" + + model_name + + "' configuration matches the expected " + + "auto complete configuration\n" + ) - print('PASS: auto_complete') + print("PASS: auto_complete") sys.exit(0) diff --git a/examples/auto_complete/nobatch_model.py b/examples/auto_complete/nobatch_model.py index b8dc476e..6e875138 100644 --- a/examples/auto_complete/nobatch_model.py +++ b/examples/auto_complete/nobatch_model.py @@ -69,24 +69,14 @@ def auto_complete_config(auto_complete_model_config): pb_utils.ModelConfig An object containing the auto-completed model configuration """ - inputs = [{ - 'name': 'INPUT0', - 'data_type': 'TYPE_FP32', - 'dims': [4] - }, { - 'name': 'INPUT1', - 'data_type': 'TYPE_FP32', - 'dims': [4] - }] - outputs = [{ - 'name': 'OUTPUT0', - 'data_type': 'TYPE_FP32', - 'dims': [4] - }, { - 'name': 'OUTPUT1', - 'data_type': 'TYPE_FP32', - 'dims': [4] - }] + inputs = [ + {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]}, + {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]}, + ] + outputs = [ + {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]}, + {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]}, + ] # Demonstrate the usage of `as_dict`, `add_input`, `add_output`, # and 
`set_max_batch_size` functions. @@ -94,24 +84,24 @@ def auto_complete_config(auto_complete_model_config): config = auto_complete_model_config.as_dict() input_names = [] output_names = [] - for input in config['input']: - input_names.append(input['name']) - for output in config['output']: - output_names.append(output['name']) + for input in config["input"]: + input_names.append(input["name"]) + for output in config["output"]: + output_names.append(output["name"]) for input in inputs: # The name checking here is only for demonstrating the usage of # `as_dict` function. `add_input` will check for conflicts and # raise errors if an input with the same name already exists in # the configuration but has different data_type or dims property. - if input['name'] not in input_names: + if input["name"] not in input_names: auto_complete_model_config.add_input(input) for output in outputs: # The name checking here is only for demonstrating the usage of # `as_dict` function. `add_output` will check for conflicts and # raise errors if an output with the same name already exists in # the configuration but has different data_type or dims property. - if output['name'] not in output_names: + if output["name"] not in output_names: auto_complete_model_config.add_output(output) auto_complete_model_config.set_max_batch_size(0) @@ -121,7 +111,7 @@ def auto_complete_config(auto_complete_model_config): def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. Parameters ---------- @@ -136,21 +126,21 @@ def initialize(self, args): """ # You must parse model_config. JSON string is not parsed here - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) # Get OUTPUT0 configuration - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT0") + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") # Get OUTPUT1 configuration - output1_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT1") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") # Convert Triton types to numpy types self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) + output0_config["data_type"] + ) self.output1_dtype = pb_utils.triton_string_to_numpy( - output1_config['data_type']) + output1_config["data_type"] + ) def execute(self, requests): """`execute` MUST be implemented in every Python model. `execute` @@ -187,15 +177,15 @@ def execute(self, requests): # Get INPUT1 in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") - out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(), - in_0.as_numpy() - in_1.as_numpy()) + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. - out_tensor_0 = pb_utils.Tensor("OUTPUT0", - out_0.astype(output0_dtype)) - out_tensor_1 = pb_utils.Tensor("OUTPUT1", - out_1.astype(output1_dtype)) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. 
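The auto-complete hunks above only reformat the example models, but the pattern is easy to lose in the diff noise. Condensed to its core, and reusing only names that already appear in those examples, it looks like this sketch:

```python
# Condensed sketch of the auto-complete pattern used by the example models above.
import triton_python_backend_utils as pb_utils  # provided inside Triton


class TritonPythonModel:
    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        # Register the I/O this model expects when no full config.pbtxt exists.
        auto_complete_model_config.add_input(
            {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]}
        )
        auto_complete_model_config.add_output(
            {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]}
        )
        auto_complete_model_config.set_max_batch_size(0)  # 0 disables batching
        return auto_complete_model_config
```

Triton invokes `auto_complete_config` at model load time, before `initialize`, when it needs to fill in a missing or partial `config.pbtxt`.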
@@ -203,9 +193,10 @@ def execute(self, requests): # response: # # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occured")) + # output_tensors=..., TritonError("An error occurred")) inference_response = pb_utils.InferenceResponse( - output_tensors=[out_tensor_0, out_tensor_1]) + output_tensors=[out_tensor_0, out_tensor_1] + ) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. Length @@ -217,4 +208,4 @@ def finalize(self): Implementing `finalize` function is OPTIONAL. This function allows the model to perform any necessary clean ups before exit. """ - print('Cleaning up...') + print("Cleaning up...") diff --git a/examples/bls/async_client.py b/examples/bls/async_client.py index cd6275b4..6d8fe577 100644 --- a/examples/bls/async_client.py +++ b/examples/bls/async_client.py @@ -24,10 +24,11 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from tritonclient.utils import * -import tritonclient.http as httpclient import sys + import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import * model_name = "bls_async" shape = [4] @@ -36,10 +37,12 @@ input0_data = np.random.rand(*shape).astype(np.float32) input1_data = np.random.rand(*shape).astype(np.float32) inputs = [ - httpclient.InferInput("INPUT0", input0_data.shape, - np_to_triton_dtype(input0_data.dtype)), - httpclient.InferInput("INPUT1", input1_data.shape, - np_to_triton_dtype(input1_data.dtype)), + httpclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + httpclient.InferInput( + "INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype) + ), ] inputs[0].set_data_from_numpy(input0_data) @@ -50,19 +53,22 @@ httpclient.InferRequestedOutput("OUTPUT1"), ] - response = client.infer(model_name, - inputs, - request_id=str(1), - outputs=outputs) + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) result = response.get_response() output0_data = response.as_numpy("OUTPUT0") output1_data = response.as_numpy("OUTPUT1") - print("INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( - input0_data, input1_data, output0_data)) - print("INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 ({})".format( - input0_data, input1_data, output1_data)) + print( + "INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output0_data + ) + ) + print( + "INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 ({})".format( + input0_data, input1_data, output1_data + ) + ) if not np.allclose(input0_data + input1_data, output0_data): print("BLS async example error: incorrect sum") @@ -72,5 +78,5 @@ print("BLS async example error: incorrect difference") sys.exit(1) - print('PASS: BLS Async') + print("PASS: BLS Async") sys.exit(0) diff --git a/examples/bls/async_model.py b/examples/bls/async_model.py index ef287fdd..4cb0f6dc 100644 --- a/examples/bls/async_model.py +++ b/examples/bls/async_model.py @@ -24,13 +24,14 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import asyncio +import json + # triton_python_backend_utils is available in every Triton Python model. You # need to use this module to create inference requests and responses. It also # contains some utility functions for extracting information from model_config # and converting Triton input/output types to numpy types. 
import triton_python_backend_utils as pb_utils -import json -import asyncio class TritonPythonModel: @@ -41,7 +42,7 @@ class TritonPythonModel: def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. Parameters ---------- @@ -56,7 +57,7 @@ def initialize(self, args): """ # You must parse model_config. JSON string is not parsed here - self.model_config = json.loads(args['model_config']) + self.model_config = json.loads(args["model_config"]) # You must add the Python 'async' keyword to the beginning of `execute` # function if you want to use `async_exec` function. @@ -94,12 +95,13 @@ async def execute(self, requests): # List of awaitables containing inflight inference responses. inference_response_awaits = [] - for model_name in ['pytorch', 'add_sub']: + for model_name in ["pytorch", "add_sub"]: # Create inference request object infer_request = pb_utils.InferenceRequest( model_name=model_name, requested_output_names=["OUTPUT0", "OUTPUT1"], - inputs=[in_0, in_1]) + inputs=[in_0, in_1], + ) # Store the awaitable inside the array. We don't need # the inference response immediately so we do not `await` @@ -109,8 +111,7 @@ async def execute(self, requests): # Wait for all the inference requests to finish. The execution # of the Python script will be blocked until all the awaitables # are resolved. - inference_responses = await asyncio.gather( - *inference_response_awaits) + inference_responses = await asyncio.gather(*inference_response_awaits) for infer_response in inference_responses: # Make sure that the inference response doesn't have an error. @@ -118,15 +119,18 @@ async def execute(self, requests): # execution you can raise an exception. if infer_response.has_error(): raise pb_utils.TritonModelException( - infer_response.error().message()) + infer_response.error().message() + ) # Get the OUTPUT0 from the "pytorch" model inference response pytorch_output0_tensor = pb_utils.get_output_tensor_by_name( - inference_responses[0], "OUTPUT0") + inference_responses[0], "OUTPUT0" + ) # Get the OUTPUT1 from the "addsub" model inference response addsub_output1_tensor = pb_utils.get_output_tensor_by_name( - inference_responses[1], "OUTPUT1") + inference_responses[1], "OUTPUT1" + ) # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. @@ -134,13 +138,14 @@ async def execute(self, requests): # response: # # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occured")) + # output_tensors=..., TritonError("An error occurred")) # # Because the infer_response of the models contains the final # outputs with correct output names, we can just pass the list # of outputs to the InferenceResponse object. inference_response = pb_utils.InferenceResponse( - output_tensors=[pytorch_output0_tensor, addsub_output1_tensor]) + output_tensors=[pytorch_output0_tensor, addsub_output1_tensor] + ) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. Length @@ -152,4 +157,4 @@ def finalize(self): Implementing `finalize` function is OPTIONAL. This function allows the model to perform any necessary clean ups before exit. 
""" - print('Cleaning up...') + print("Cleaning up...") diff --git a/examples/bls/sync_client.py b/examples/bls/sync_client.py index 5d36e8a9..d9483e43 100644 --- a/examples/bls/sync_client.py +++ b/examples/bls/sync_client.py @@ -24,11 +24,12 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from tritonclient.utils import * -import tritonclient.http as httpclient -import numpy as np import sys +import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import * + model_name = "bls_sync" shape = [4] @@ -36,37 +37,41 @@ input0_data = np.random.rand(*shape).astype(np.float32) input1_data = np.random.rand(*shape).astype(np.float32) inputs = [ - httpclient.InferInput("INPUT0", input0_data.shape, - np_to_triton_dtype(input0_data.dtype)), - httpclient.InferInput("INPUT1", input1_data.shape, - np_to_triton_dtype(input1_data.dtype)), - httpclient.InferInput("MODEL_NAME", [1], - np_to_triton_dtype(np.object_)), + httpclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + httpclient.InferInput( + "INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype) + ), + httpclient.InferInput("MODEL_NAME", [1], np_to_triton_dtype(np.object_)), ] inputs[0].set_data_from_numpy(input0_data) inputs[1].set_data_from_numpy(input1_data) # Will perform the inference request on the 'add_sub' model. - inputs[2].set_data_from_numpy(np.array(['add_sub'], dtype=np.object_)) + inputs[2].set_data_from_numpy(np.array(["add_sub"], dtype=np.object_)) outputs = [ httpclient.InferRequestedOutput("OUTPUT0"), httpclient.InferRequestedOutput("OUTPUT1"), ] - response = client.infer(model_name, - inputs, - request_id=str(1), - outputs=outputs) + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) result = response.get_response() output0_data = response.as_numpy("OUTPUT0") output1_data = response.as_numpy("OUTPUT1") print("=========='add_sub' model result==========") - print("INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( - input0_data, input1_data, output0_data)) - print("INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 ({})".format( - input0_data, input1_data, output1_data)) + print( + "INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output0_data + ) + ) + print( + "INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 ({})".format( + input0_data, input1_data, output1_data + ) + ) if not np.allclose(input0_data + input1_data, output0_data): print("BLS sync example error: incorrect sum") sys.exit(1) @@ -76,21 +81,24 @@ sys.exit(1) # Will perform the inference request on the pytorch model: - inputs[2].set_data_from_numpy(np.array(['pytorch'], dtype=np.object_)) - response = client.infer(model_name, - inputs, - request_id=str(1), - outputs=outputs) + inputs[2].set_data_from_numpy(np.array(["pytorch"], dtype=np.object_)) + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) result = response.get_response() output0_data = response.as_numpy("OUTPUT0") output1_data = response.as_numpy("OUTPUT1") print("\n") print("=========='pytorch' model result==========") - print("INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( - input0_data, input1_data, output0_data)) - print("INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 ({})".format( - input0_data, input1_data, output1_data)) + print( + "INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output0_data + ) + ) + print( + "INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 
({})".format( + input0_data, input1_data, output1_data + ) + ) if not np.allclose(input0_data + input1_data, output0_data): print("BLS sync example error: incorrect sum") sys.exit(1) @@ -104,14 +112,10 @@ print("\n") print("=========='undefined' model result==========") try: - inputs[2].set_data_from_numpy( - np.array(['undefined_model'], dtype=np.object_)) - response = client.infer(model_name, - inputs, - request_id=str(1), - outputs=outputs) + inputs[2].set_data_from_numpy(np.array(["undefined_model"], dtype=np.object_)) + _ = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) except InferenceServerException as e: print(e.message()) - print('PASS: BLS Sync') + print("PASS: BLS Sync") sys.exit(0) diff --git a/examples/bls/sync_model.py b/examples/bls/sync_model.py index 0a05024e..f89bed72 100644 --- a/examples/bls/sync_model.py +++ b/examples/bls/sync_model.py @@ -24,12 +24,13 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import json + # triton_python_backend_utils is available in every Triton Python model. You # need to use this module to create inference requests and responses. It also # contains some utility functions for extracting information from model_config # and converting Triton input/output types to numpy types. import triton_python_backend_utils as pb_utils -import json class TritonPythonModel: @@ -40,7 +41,7 @@ class TritonPythonModel: def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. Parameters ---------- @@ -55,7 +56,7 @@ def initialize(self, args): """ # You must parse model_config. JSON string is not parsed here - self.model_config = json.loads(args['model_config']) + self.model_config = json.loads(args["model_config"]) def execute(self, requests): """`execute` must be implemented in every Python model. `execute` @@ -90,8 +91,7 @@ def execute(self, requests): in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") # Get Model Name - model_name = pb_utils.get_input_tensor_by_name( - request, "MODEL_NAME") + model_name = pb_utils.get_input_tensor_by_name(request, "MODEL_NAME") # Model Name string model_name_string = model_name.as_numpy()[0] @@ -100,7 +100,8 @@ def execute(self, requests): infer_request = pb_utils.InferenceRequest( model_name=model_name_string, requested_output_names=["OUTPUT0", "OUTPUT1"], - inputs=[in_0, in_1]) + inputs=[in_0, in_1], + ) # Perform synchronous blocking inference request infer_response = infer_request.exec() @@ -109,8 +110,7 @@ def execute(self, requests): # it has an error and you can't proceed with your model execution # you can raise an exception. if infer_response.has_error(): - raise pb_utils.TritonModelException( - infer_response.error().message()) + raise pb_utils.TritonModelException(infer_response.error().message()) # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. 
@@ -118,13 +118,14 @@ def execute(self, requests): # response: # # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occured")) + # output_tensors=..., TritonError("An error occurred")) # # Because the infer_response of the models contains the final # outputs with correct output names, we can just pass the list # of outputs to the InferenceResponse object. inference_response = pb_utils.InferenceResponse( - output_tensors=infer_response.output_tensors()) + output_tensors=infer_response.output_tensors() + ) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. Length @@ -136,4 +137,4 @@ def finalize(self): Implementing `finalize` function is OPTIONAL. This function allows the model to perform any necessary clean ups before exit. """ - print('Cleaning up...') + print("Cleaning up...") diff --git a/examples/bls_decoupled/README.md b/examples/bls_decoupled/README.md index 6f5fc57b..1f64fee5 100644 --- a/examples/bls_decoupled/README.md +++ b/examples/bls_decoupled/README.md @@ -39,7 +39,7 @@ and the second section shows how to execute asynchronous BLS requests. ## Synchronous BLS Requests with Decoupled Models -The goal of `bls_decoupled_sync` model is to caculate the sum of the responses +The goal of `bls_decoupled_sync` model is to calculate the sum of the responses returned from the [square](../decoupled) model and return the summation as the final response. The value of input 'IN' will be passed as an input to the [square](../decoupled) model which determines how many responses the [square](../decoupled) model will generate. @@ -105,7 +105,7 @@ model execution and can lead to speedups under certain conditions. The `bls_decoupled_async` model will perform two async BLS requests on the [square](../decoupled) model. Then, it will wait until the inference requests -are completed. It will caculate the sum of the output `OUT` from the +are completed. It will calculate the sum of the output `OUT` from the [square](../decoupled) model in both two requests to construct the final inference response object using these tensors. diff --git a/examples/bls_decoupled/async_client.py b/examples/bls_decoupled/async_client.py index 4cf8364a..f701974d 100644 --- a/examples/bls_decoupled/async_client.py +++ b/examples/bls_decoupled/async_client.py @@ -24,11 +24,12 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from tritonclient.utils import * -import tritonclient.http as httpclient -import numpy as np import sys +import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import * + model_name = "bls_decoupled_async" shape = [1] @@ -38,29 +39,30 @@ for in_value in in_values: input_data = np.array([in_value], dtype=np.int32) inputs = [ - httpclient.InferInput("IN", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) outputs = [httpclient.InferRequestedOutput("SUM")] - response = client.infer(model_name, - inputs, - request_id=str(1), - outputs=outputs) + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) result = response.get_response() # output_data contains two times of the square value of the input value. 
output_data = response.as_numpy("SUM") print("==========model result==========") - print("Two times the square value of {} is {}\n".format( - input_data, output_data)) + print( + "Two times the square value of {} is {}\n".format(input_data, output_data) + ) if not np.allclose((2 * input_data * input_data), output_data): print( - "BLS Decoupled Async example error: incorrect output value. Expected {}, got {}." - .format((2 * input_data * input_data), output_data)) + "BLS Decoupled Async example error: incorrect output value. Expected {}, got {}.".format( + (2 * input_data * input_data), output_data + ) + ) sys.exit(1) - print('PASS: BLS Decoupled Async') + print("PASS: BLS Decoupled Async") sys.exit(0) diff --git a/examples/bls_decoupled/async_model.py b/examples/bls_decoupled/async_model.py index 3b1d454c..0a69a628 100644 --- a/examples/bls_decoupled/async_model.py +++ b/examples/bls_decoupled/async_model.py @@ -24,14 +24,16 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import asyncio +import json + +import numpy as np + # triton_python_backend_utils is available in every Triton Python model. You # need to use this module to create inference requests and responses. It also # contains some utility functions for extracting information from model_config # and converting Triton input/output types to numpy types. import triton_python_backend_utils as pb_utils -import numpy as np -import asyncio -import json class TritonPythonModel: @@ -45,7 +47,7 @@ class TritonPythonModel: - Input 'IN' shape must be equal to [1], datatype must be INT32. - For each response, output 'SUM' shape must be equal to [1], datatype must be INT32. - + For every request, the model will send a single response that contains an output named 'SUM'. We will send two BLS requests to the square model and the 'SUM' will contain the summation of the 'OUT' response output returned @@ -56,7 +58,7 @@ class TritonPythonModel: def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. Parameters ---------- @@ -71,7 +73,7 @@ def initialize(self, args): """ # You must parse model_config. JSON string is not parsed here - self.model_config = json.loads(args['model_config']) + self.model_config = json.loads(args["model_config"]) # You must add the Python 'async' keyword to the beginning of `execute` # function if you want to use `async_exec` function. @@ -100,8 +102,9 @@ async def execute(self, requests): # This model does not support batching, so 'request_count' should # always be 1. if len(requests) != 1: - raise pb_utils.TritonModelException("unsupported batch size " + - len(requests)) + raise pb_utils.TritonModelException( + "unsupported batch size " + len(requests) + ) response_num = pb_utils.get_input_tensor_by_name(requests[0], "IN") @@ -116,12 +119,12 @@ async def execute(self, requests): infer_request = pb_utils.InferenceRequest( model_name="square_int32", inputs=[response_num], - requested_output_names=["OUT"]) + requested_output_names=["OUT"], + ) # Store the awaitable inside the array. We don't need # the inference response immediately so we do not `await` # here. 
- inference_response_awaits.append( - infer_request.async_exec(decoupled=True)) + inference_response_awaits.append(infer_request.async_exec(decoupled=True)) # Wait for all the inference requests to finish. The execution # of the Python script will be blocked until all the awaitables @@ -139,16 +142,19 @@ async def execute(self, requests): # If inference response has an error, raise an exception if infer_response.has_error(): raise pb_utils.TritonModelException( - infer_response.error().message()) + infer_response.error().message() + ) # Check for the last empty response. if len(infer_response.output_tensors()) > 0: response_sum += pb_utils.get_output_tensor_by_name( - infer_response, "OUT").as_numpy() + infer_response, "OUT" + ).as_numpy() response = [ pb_utils.InferenceResponse( - output_tensors=[pb_utils.Tensor("SUM", response_sum)]) + output_tensors=[pb_utils.Tensor("SUM", response_sum)] + ) ] # Since the model is using the default mode in this example, we @@ -160,4 +166,4 @@ def finalize(self): Implementing `finalize` function is OPTIONAL. This function allows the model to perform any necessary clean ups before exit. """ - print('Cleaning up...') + print("Cleaning up...") diff --git a/examples/bls_decoupled/sync_client.py b/examples/bls_decoupled/sync_client.py index 10fd4ffa..63156481 100644 --- a/examples/bls_decoupled/sync_client.py +++ b/examples/bls_decoupled/sync_client.py @@ -24,11 +24,12 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from tritonclient.utils import * -import tritonclient.http as httpclient -import numpy as np import sys +import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import * + model_name = "bls_decoupled_sync" shape = [1] @@ -38,16 +39,14 @@ for in_value in in_values: input_data = np.array([in_value], dtype=np.int32) inputs = [ - httpclient.InferInput("IN", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) outputs = [httpclient.InferRequestedOutput("SUM")] - response = client.infer(model_name, - inputs, - request_id=str(1), - outputs=outputs) + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) result = response.get_response() output_data = response.as_numpy("SUM") @@ -60,5 +59,5 @@ ).format(input_data * input_data, output_data) sys.exit(1) - print('PASS: BLS Decoupled Sync') + print("PASS: BLS Decoupled Sync") sys.exit(0) diff --git a/examples/bls_decoupled/sync_model.py b/examples/bls_decoupled/sync_model.py index 7ca397fd..afc755e5 100644 --- a/examples/bls_decoupled/sync_model.py +++ b/examples/bls_decoupled/sync_model.py @@ -24,13 +24,15 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import json + +import numpy as np + # triton_python_backend_utils is available in every Triton Python model. You # need to use this module to create inference requests and responses. It also # contains some utility functions for extracting information from model_config # and converting Triton input/output types to numpy types. import triton_python_backend_utils as pb_utils -import numpy as np -import json class TritonPythonModel: @@ -44,7 +46,7 @@ class TritonPythonModel: - Input 'IN' shape must be equal to [1], datatype must be INT32. 
- For each response, output 'SUM' shape must be equal to [1], datatype must be INT32. - + For every request, the model will send a single response that contains an output named 'SUM'. The 'SUM' will contain the summation of the 'OUT' response output returned by the square model. The input 'IN' determines how @@ -54,7 +56,7 @@ class TritonPythonModel: def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. Parameters ---------- @@ -69,7 +71,7 @@ def initialize(self, args): """ # You must parse model_config. JSON string is not parsed here - self.model_config = json.loads(args['model_config']) + self.model_config = json.loads(args["model_config"]) def execute(self, requests): """`execute` must be implemented in every Python model. `execute` @@ -96,8 +98,9 @@ def execute(self, requests): # This model does not support batching, so 'request_count' should # always be 1. if len(requests) != 1: - raise pb_utils.TritonModelException("unsupported batch size " + - len(requests)) + raise pb_utils.TritonModelException( + "unsupported batch size " + len(requests) + ) response_num = pb_utils.get_input_tensor_by_name(requests[0], "IN") @@ -108,7 +111,8 @@ def execute(self, requests): infer_request = pb_utils.InferenceRequest( model_name="square_int32", inputs=[response_num], - requested_output_names=["OUT"]) + requested_output_names=["OUT"], + ) # The variable that will store the sum of the responses. response_sum = np.array([0]) @@ -121,17 +125,18 @@ def execute(self, requests): for infer_response in infer_responses: # If inference response has an error, raise an exception if infer_response.has_error(): - raise pb_utils.TritonModelException( - infer_response.error().message()) + raise pb_utils.TritonModelException(infer_response.error().message()) # Check for the last empty response. if len(infer_response.output_tensors()) > 0: response_sum += pb_utils.get_output_tensor_by_name( - infer_response, "OUT").as_numpy() + infer_response, "OUT" + ).as_numpy() response = [ pb_utils.InferenceResponse( - output_tensors=[pb_utils.Tensor("SUM", response_sum)]) + output_tensors=[pb_utils.Tensor("SUM", response_sum)] + ) ] # Since the model is using the default mode in this example, we @@ -143,4 +148,4 @@ def finalize(self): Implementing `finalize` function is OPTIONAL. This function allows the model to perform any necessary clean ups before exit. """ - print('Cleaning up...') + print("Cleaning up...") diff --git a/examples/custom_metrics/README.md b/examples/custom_metrics/README.md index 6b7bc250..88831e22 100644 --- a/examples/custom_metrics/README.md +++ b/examples/custom_metrics/README.md @@ -34,7 +34,7 @@ In this section we demonstrate an end-to-end example for should contain [custom_metrics](./model.py) model. The [custom_metrics](./model.py) model uses [Custom Metrics API](../../README.md#custom-metrics) to register and collect -custom metrics. +custom metrics. ## Deploying the Custom Metrics Models diff --git a/examples/custom_metrics/client.py b/examples/custom_metrics/client.py index e74c5bc0..64ae31e4 100644 --- a/examples/custom_metrics/client.py +++ b/examples/custom_metrics/client.py @@ -1,35 +1,35 @@ -#Copyright 2023, NVIDIA CORPORATION& AFFILIATES.All rights reserved. +# Copyright 2023, NVIDIA CORPORATION& AFFILIATES.All rights reserved. 
# -#Redistribution and use in source and binary forms, with or without -#modification, are permitted provided that the following conditions -#are met: -#* Redistributions of source code must retain the above copyright -#notice, this list of conditions and the following disclaimer. -#* Redistributions in binary form must reproduce the above copyright -#notice, this list of conditions and the following disclaimer in the -#documentation and / or other materials provided with the distribution. -#* Neither the name of NVIDIA CORPORATION nor the names of its -#contributors may be used to endorse or promote products derived -#from this software without specific prior written permission. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and / or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. # -#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -#EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -#IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -#PURPOSE ARE DISCLAIMED.IN NO EVENT SHALL THE COPYRIGHT OWNER OR -#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -#EXEMPLARY, OR CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO, -#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -#OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -#OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED.IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
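The example's client verifies the custom metric by scraping Triton's Prometheus endpoint. As a minimal, standalone sketch of that idea (an assumption-laden illustration, not one of the patched files: it presumes Triton's default metrics port 8002 on localhost and the `requests_process_latency_ns` family registered by the model):

```
# Sketch only: poll Triton's metrics endpoint and keep the lines that
# mention the custom counter registered by the custom_metrics model.
# Assumes the default metrics port (8002) on localhost.
import requests


def fetch_custom_metric(metric_name="requests_process_latency_ns"):
    text = requests.get("http://localhost:8002/metrics").text
    return [line for line in text.splitlines() if metric_name in line]


if __name__ == "__main__":
    for line in fetch_custom_metric():
        print(line)
```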
-from tritonclient.utils import * -import tritonclient.http as httpclient -import requests import sys import numpy as np +import requests +import tritonclient.http as httpclient +from tritonclient.utils import * model_name = "custom_metrics" shape = [4] @@ -46,10 +46,12 @@ def get_metrics(): input0_data = np.random.rand(*shape).astype(np.float32) input1_data = np.random.rand(*shape).astype(np.float32) inputs = [ - httpclient.InferInput("INPUT0", input0_data.shape, - np_to_triton_dtype(input0_data.dtype)), - httpclient.InferInput("INPUT1", input1_data.shape, - np_to_triton_dtype(input1_data.dtype)), + httpclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + httpclient.InferInput( + "INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype) + ), ] inputs[0].set_data_from_numpy(input0_data) @@ -60,10 +62,7 @@ def get_metrics(): httpclient.InferRequestedOutput("OUTPUT1"), ] - response = client.infer(model_name, - inputs, - request_id=str(1), - outputs=outputs) + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) output0_data = response.as_numpy("OUTPUT0") output1_data = response.as_numpy("OUTPUT1") @@ -78,20 +77,22 @@ def get_metrics(): metrics = get_metrics() patterns = [ - '# HELP requests_process_latency_ns Cumulative time spent processing requests', - '# TYPE requests_process_latency_ns counter', - 'requests_process_latency_ns{model="custom_metrics",version="1"}' + "# HELP requests_process_latency_ns Cumulative time spent processing requests", + "# TYPE requests_process_latency_ns counter", + 'requests_process_latency_ns{model="custom_metrics",version="1"}', ] for pattern in patterns: if pattern not in metrics: print( - "custom_metrics example error: missing pattern '{}' in metrics". - format(pattern)) + "custom_metrics example error: missing pattern '{}' in metrics".format( + pattern + ) + ) sys.exit(1) else: print( - "custom_metrics example: found pattern '{}' in metrics".format( - pattern)) + "custom_metrics example: found pattern '{}' in metrics".format(pattern) + ) - print('PASS: custom_metrics') + print("PASS: custom_metrics") sys.exit(0) diff --git a/examples/custom_metrics/config.pbtxt b/examples/custom_metrics/config.pbtxt index a364058f..2a8192c3 100644 --- a/examples/custom_metrics/config.pbtxt +++ b/examples/custom_metrics/config.pbtxt @@ -57,9 +57,9 @@ output [ ] instance_group [ - { + { count: 3 - kind: KIND_CPU + kind: KIND_CPU } ] diff --git a/examples/custom_metrics/model.py b/examples/custom_metrics/model.py index 02abcc13..ad3b4e6f 100644 --- a/examples/custom_metrics/model.py +++ b/examples/custom_metrics/model.py @@ -42,7 +42,7 @@ class TritonPythonModel: def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. 
Parameters ---------- @@ -57,17 +57,13 @@ def initialize(self, args): """ # Parse model_config and extract OUTPUT0 and OUTPUT1 configuration - self.model_config = model_config = json.loads(args['model_config']) - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT0") - output1_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT1") + self.model_config = model_config = json.loads(args["model_config"]) + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") # Convert Triton types to numpy types - self.out0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) - self.out1_dtype = pb_utils.triton_string_to_numpy( - output1_config['data_type']) + self.out0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"]) + self.out1_dtype = pb_utils.triton_string_to_numpy(output1_config["data_type"]) # Create a MetricFamily object to report the latency of the model # execution. The 'kind' parameter must be either 'COUNTER' or @@ -80,7 +76,7 @@ def initialize(self, args): self.metric_family = pb_utils.MetricFamily( name="requests_process_latency_ns", description="Cumulative time spent processing requests", - kind=pb_utils.MetricFamily.COUNTER # or pb_utils.MetricFamily.GAUGE + kind=pb_utils.MetricFamily.COUNTER, # or pb_utils.MetricFamily.GAUGE ) # Create a Metric object under the MetricFamily object. The 'labels' @@ -88,10 +84,9 @@ def initialize(self, args): # objects under the same MetricFamily object with unique labels. Empty # labels is allowed. The 'labels' parameter is optional. If you don't # specify the 'labels' parameter, empty labels will be used. - self.metric = self.metric_family.Metric(labels={ - "model": "custom_metrics", - "version": "1" - }) + self.metric = self.metric_family.Metric( + labels={"model": "custom_metrics", "version": "1"} + ) def execute(self, requests): """`execute` MUST be implemented in every Python model. `execute` @@ -127,15 +122,15 @@ def execute(self, requests): # Get INPUT1 in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") - out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(), - in_0.as_numpy() - in_1.as_numpy()) + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. - out_tensor_0 = pb_utils.Tensor("OUTPUT0", - out_0.astype(self.out0_dtype)) - out_tensor_1 = pb_utils.Tensor("OUTPUT1", - out_1.astype(self.out1_dtype)) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(self.out0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(self.out1_dtype)) # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. @@ -143,9 +138,10 @@ def execute(self, requests): # response: # # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occured")) + # output_tensors=..., TritonError("An error occurred")) inference_response = pb_utils.InferenceResponse( - output_tensors=[out_tensor_0, out_tensor_1]) + output_tensors=[out_tensor_0, out_tensor_1] + ) responses.append(inference_response) # Record the end time of processing the requests @@ -162,8 +158,9 @@ def execute(self, requests): # - Metric.value(): Get the current value of the metric. 
self.metric.increment(end_ns - start_ns) logger = pb_utils.Logger - logger.log_info("Cumulative requests processing latency: {}".format( - self.metric.value())) + logger.log_info( + "Cumulative requests processing latency: {}".format(self.metric.value()) + ) # You should return a list of pb_utils.InferenceResponse. Length # of this list must match the length of `requests` list. @@ -174,4 +171,4 @@ def finalize(self): Implementing `finalize` function is OPTIONAL. This function allows the model to perform any necessary clean ups before exit. """ - print('Cleaning up...') + print("Cleaning up...") diff --git a/examples/decoupled/README.md b/examples/decoupled/README.md index 22f4f68e..a13aa78f 100644 --- a/examples/decoupled/README.md +++ b/examples/decoupled/README.md @@ -181,7 +181,7 @@ stream stopped... ``` -Look how a single request generated 4 responses. +Look how a single request generated 4 responses. ## Running inference on Square model: diff --git a/examples/decoupled/repeat_client.py b/examples/decoupled/repeat_client.py index e29dd4e0..7d6a1719 100644 --- a/examples/decoupled/repeat_client.py +++ b/examples/decoupled/repeat_client.py @@ -24,17 +24,16 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import queue import sys from functools import partial -import numpy as np -import queue -from tritonclient.utils import * +import numpy as np import tritonclient.grpc as grpcclient +from tritonclient.utils import * class UserData: - def __init__(self): self._completed_requests = queue.Queue() @@ -56,18 +55,19 @@ def callback(user_data, result, error): wait_value = 5 inputs = [] -inputs.append(grpcclient.InferInput('IN', [len(in_value)], "INT32")) -inputs.append(grpcclient.InferInput('DELAY', [len(delay_value)], "UINT32")) -inputs.append(grpcclient.InferInput('WAIT', [1], "UINT32")) +inputs.append(grpcclient.InferInput("IN", [len(in_value)], "INT32")) +inputs.append(grpcclient.InferInput("DELAY", [len(delay_value)], "UINT32")) +inputs.append(grpcclient.InferInput("WAIT", [1], "UINT32")) outputs = [] -outputs.append(grpcclient.InferRequestedOutput('OUT')) -outputs.append(grpcclient.InferRequestedOutput('IDX')) +outputs.append(grpcclient.InferRequestedOutput("OUT")) +outputs.append(grpcclient.InferRequestedOutput("IDX")) user_data = UserData() -with grpcclient.InferenceServerClient(url="localhost:8001", - verbose=True) as triton_client: +with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True +) as triton_client: # Establish stream triton_client.start_stream(callback=partial(callback, user_data)) @@ -79,10 +79,12 @@ def callback(user_data, result, error): inputs[2].set_data_from_numpy(wait_data) request_id = "0" - triton_client.async_stream_infer(model_name=model_name, - inputs=inputs, - request_id=request_id, - outputs=outputs) + triton_client.async_stream_infer( + model_name=model_name, + inputs=inputs, + request_id=request_id, + outputs=outputs, + ) # Retrieve results... recv_count = 0 @@ -102,18 +104,22 @@ def callback(user_data, result, error): # Validate results... 
if len(result_dict[request_id]) != len(in_value): - print("expected {} many responses for request id {}, got {}".format( - len(in_value), request_id, len(result_dict[request_id]))) + print( + "expected {} many responses for request id {}, got {}".format( + len(in_value), request_id, len(result_dict[request_id]) + ) + ) sys.exit(1) result_list = result_dict[request_id] for i in range(len(result_list)): expected_data = np.array([in_value[i]], dtype=np.int32) - this_data = result_list[i][1].as_numpy('OUT') + this_data = result_list[i][1].as_numpy("OUT") if not np.array_equal(expected_data, this_data): - print("incorrect data: expected {}, got {}".format( - expected_data, this_data)) + print( + "incorrect data: expected {}, got {}".format(expected_data, this_data) + ) sys.exit(1) - print('PASS: repeat_int32') + print("PASS: repeat_int32") sys.exit(0) diff --git a/examples/decoupled/repeat_model.py b/examples/decoupled/repeat_model.py index addc81c7..b626e1a5 100644 --- a/examples/decoupled/repeat_model.py +++ b/examples/decoupled/repeat_model.py @@ -25,10 +25,11 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import json -import numpy import threading import time +import numpy + # triton_python_backend_utils is available in every Triton Python model. You # need to use this module to create inference requests and responses. It also # contains some utility functions for extracting information from model_config @@ -72,7 +73,7 @@ class TritonPythonModel: def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. Parameters ---------- @@ -87,15 +88,19 @@ def initialize(self, args): """ # You must parse model_config. JSON string is not parsed here - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) using_decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) + model_config + ) if not using_decoupled: raise pb_utils.TritonModelException( """the model `{}` can generate any number of responses per request, enable decoupled transaction policy in model configuration to - serve this model""".format(args['model_name'])) + serve this model""".format( + args["model_name"] + ) + ) # Get OUT configuration out_config = pb_utils.get_output_config_by_name(model_config, "OUT") @@ -104,10 +109,8 @@ def initialize(self, args): idx_config = pb_utils.get_output_config_by_name(model_config, "IDX") # Convert Triton types to numpy types - self.out_dtype = pb_utils.triton_string_to_numpy( - out_config['data_type']) - self.idx_dtype = pb_utils.triton_string_to_numpy( - idx_config['data_type']) + self.out_dtype = pb_utils.triton_string_to_numpy(out_config["data_type"]) + self.idx_dtype = pb_utils.triton_string_to_numpy(idx_config["data_type"]) # To keep track of response threads so that we can delay # the finalizing the model until all response threads @@ -146,13 +149,12 @@ def execute(self, requests): # This model does not support batching, so 'request_count' should always # be 1. 
if len(requests) != 1: - raise pb_utils.TritonModelException("unsupported batch size " + - len(requests)) + raise pb_utils.TritonModelException( + "unsupported batch size " + len(requests) + ) - in_input = pb_utils.get_input_tensor_by_name(requests[0], - 'IN').as_numpy() - delay_input = pb_utils.get_input_tensor_by_name(requests[0], - 'DELAY').as_numpy() + in_input = pb_utils.get_input_tensor_by_name(requests[0], "IN").as_numpy() + delay_input = pb_utils.get_input_tensor_by_name(requests[0], "DELAY").as_numpy() if in_input.shape != delay_input.shape: raise pb_utils.TritonModelException( f"expected IN and DELAY shape to match, got {list(in_input.shape)} and {list(delay_input.shape)}." @@ -160,9 +162,10 @@ def execute(self, requests): # Start a separate thread to send the responses for the request. The # sending back the responses is delegated to this thread. - thread = threading.Thread(target=self.response_thread, - args=(requests[0].get_response_sender(), - in_input, delay_input)) + thread = threading.Thread( + target=self.response_thread, + args=(requests[0].get_response_sender(), in_input, delay_input), + ) # A model using decoupled transaction policy is not required to send all # responses for the current request before returning from the execute. @@ -177,8 +180,7 @@ def execute(self, requests): # Read WAIT input for wait time, then return so that Triton can call # execute again with another request. - wait_input = pb_utils.get_input_tensor_by_name(requests[0], - 'WAIT').as_numpy() + wait_input = pb_utils.get_input_tensor_by_name(requests[0], "WAIT").as_numpy() time.sleep(wait_input[0] / 1000) # Unlike in non-decoupled model transaction policy, execute function @@ -207,18 +209,17 @@ def response_thread(self, response_sender, in_input, delay_input): time.sleep(delay_value / 1000) idx_output = pb_utils.Tensor("IDX", numpy.array([idx], idx_dtype)) - out_output = pb_utils.Tensor("OUT", - numpy.array([in_value], out_dtype)) + out_output = pb_utils.Tensor("OUT", numpy.array([in_value], out_dtype)) response = pb_utils.InferenceResponse( - output_tensors=[idx_output, out_output]) + output_tensors=[idx_output, out_output] + ) response_sender.send(response) # We must close the response sender to indicate to Triton that we are # done sending responses for the corresponding request. We can't use the # response sender after closing it. The response sender is closed by # setting the TRITONSERVER_RESPONSE_COMPLETE_FINAL. - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) with self.inflight_thread_count_lck: self.inflight_thread_count -= 1 @@ -230,17 +231,17 @@ def finalize(self): Here we will wait for all response threads to complete sending responses. """ - print('Finalize invoked') + print("Finalize invoked") inflight_threads = True cycles = 0 logging_time_sec = 5 sleep_time_sec = 0.1 - cycle_to_log = (logging_time_sec / sleep_time_sec) + cycle_to_log = logging_time_sec / sleep_time_sec while inflight_threads: with self.inflight_thread_count_lck: - inflight_threads = (self.inflight_thread_count != 0) - if (cycles % cycle_to_log == 0): + inflight_threads = self.inflight_thread_count != 0 + if cycles % cycle_to_log == 0: print( f"Waiting for {self.inflight_thread_count} response threads to complete..." 
) @@ -248,4 +249,4 @@ def finalize(self): time.sleep(sleep_time_sec) cycles += 1 - print('Finalize complete...') + print("Finalize complete...") diff --git a/examples/decoupled/square_client.py b/examples/decoupled/square_client.py index e953398b..0751f13c 100644 --- a/examples/decoupled/square_client.py +++ b/examples/decoupled/square_client.py @@ -24,17 +24,16 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import queue import sys from functools import partial -import numpy as np -import queue -from tritonclient.utils import * +import numpy as np import tritonclient.grpc as grpcclient +from tritonclient.utils import * class UserData: - def __init__(self): self._completed_requests = queue.Queue() @@ -61,8 +60,9 @@ def callback(user_data, result, error): user_data = UserData() -with grpcclient.InferenceServerClient(url="localhost:8001", - verbose=True) as triton_client: +with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True +) as triton_client: # Establish stream triton_client.start_stream(callback=partial(callback, user_data)) @@ -71,10 +71,12 @@ def callback(user_data, result, error): in_data = np.array([in_values[i]], dtype=np.int32) inputs[0].set_data_from_numpy(in_data) - triton_client.async_stream_infer(model_name=model_name, - inputs=inputs, - request_id=str(i), - outputs=outputs) + triton_client.async_stream_infer( + model_name=model_name, + inputs=inputs, + request_id=str(i), + outputs=outputs, + ) # Retrieve results... recv_count = 0 @@ -99,24 +101,29 @@ def callback(user_data, result, error): print("response for request id {} not received".format(this_id)) sys.exit(1) elif in_values[i] == 0 and this_id in result_dict.keys(): - print("received unexpected response for request id {}".format( - this_id)) + print("received unexpected response for request id {}".format(this_id)) sys.exit(1) if in_values[i] != 0: if len(result_dict[this_id]) != in_values[i]: - print("expected {} many responses for request id {}, got {}". - format(in_values[i], this_id, result_dict[this_id])) + print( + "expected {} many responses for request id {}, got {}".format( + in_values[i], this_id, result_dict[this_id] + ) + ) sys.exit(1) if in_values[i] != 0: result_list = result_dict[this_id] expected_data = np.array([in_values[i]], dtype=np.int32) for j in range(len(result_list)): - this_data = result_list[j][1].as_numpy('OUT') + this_data = result_list[j][1].as_numpy("OUT") if not np.array_equal(expected_data, this_data): - print("incorrect data: expected {}, got {}".format( - expected_data, this_data)) + print( + "incorrect data: expected {}, got {}".format( + expected_data, this_data + ) + ) sys.exit(1) - print('PASS: square_int32') + print("PASS: square_int32") sys.exit(0) diff --git a/examples/decoupled/square_model.py b/examples/decoupled/square_model.py index d1bb9b46..b6f6fafb 100644 --- a/examples/decoupled/square_model.py +++ b/examples/decoupled/square_model.py @@ -25,10 +25,11 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import json -import numpy as np import threading import time +import numpy as np + # triton_python_backend_utils is available in every Triton Python model. You # need to use this module to create inference requests and responses. 
It also # contains some utility functions for extracting information from model_config @@ -57,7 +58,7 @@ class TritonPythonModel: def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. Parameters ---------- @@ -72,45 +73,59 @@ def initialize(self, args): """ # You must parse model_config. JSON string is not parsed here - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) using_decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) + model_config + ) if not using_decoupled: raise pb_utils.TritonModelException( """the model `{}` can generate any number of responses per request, enable decoupled transaction policy in model configuration to - serve this model""".format(args['model_name'])) + serve this model""".format( + args["model_name"] + ) + ) # Get IN configuration in_config = pb_utils.get_input_config_by_name(model_config, "IN") # Validate the shape and data type of IN - in_shape = in_config['dims'] + in_shape = in_config["dims"] if (len(in_shape) != 1) or (in_shape[0] != 1): raise pb_utils.TritonModelException( """the model `{}` requires the shape of 'IN' to be - [1], got {}""".format(args['model_name'], in_shape)) - if in_config['data_type'] != 'TYPE_INT32': + [1], got {}""".format( + args["model_name"], in_shape + ) + ) + if in_config["data_type"] != "TYPE_INT32": raise pb_utils.TritonModelException( """the model `{}` requires the data_type of 'IN' to be - 'TYPE_INT32', got {}""".format(args['model_name'], - in_config['data_type'])) + 'TYPE_INT32', got {}""".format( + args["model_name"], in_config["data_type"] + ) + ) # Get OUT configuration out_config = pb_utils.get_output_config_by_name(model_config, "OUT") # Validate the shape and data type of OUT - out_shape = out_config['dims'] + out_shape = out_config["dims"] if (len(out_shape) != 1) or (out_shape[0] != 1): raise pb_utils.TritonModelException( """the model `{}` requires the shape of 'OUT' to be - [1], got {}""".format(args['model_name'], out_shape)) - if out_config['data_type'] != 'TYPE_INT32': + [1], got {}""".format( + args["model_name"], out_shape + ) + ) + if out_config["data_type"] != "TYPE_INT32": raise pb_utils.TritonModelException( """the model `{}` requires the data_type of 'OUT' to be - 'TYPE_INT32', got {}""".format(args['model_name'], - out_config['data_type'])) + 'TYPE_INT32', got {}""".format( + args["model_name"], out_config["data_type"] + ) + ) self.inflight_thread_count = 0 self.inflight_thread_count_lck = threading.Lock() @@ -164,10 +179,13 @@ def execute(self, requests): def process_request(self, request): # Start a separate thread to send the responses for the request. The # sending back the responses is delegated to this thread. - thread = threading.Thread(target=self.response_thread, - args=(request.get_response_sender(), - pb_utils.get_input_tensor_by_name( - request, 'IN').as_numpy())) + thread = threading.Thread( + target=self.response_thread, + args=( + request.get_response_sender(), + pb_utils.get_input_tensor_by_name(request, "IN").as_numpy(), + ), + ) # A model using decoupled transaction policy is not required to send all # responses for the current request before returning from the execute. 
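For readers new to decoupled models, the send/close contract that the response thread below follows can be distilled into a short sketch. This is illustrative only and simplified to a single inline response per request; the shipped repeat/square models instead delegate sending to a separate thread:

```
# Distilled sketch of the decoupled send/close contract (not a shipped file).
import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        for request in requests:
            sender = request.get_response_sender()
            in_value = pb_utils.get_input_tensor_by_name(request, "IN").as_numpy()
            out_tensor = pb_utils.Tensor("OUT", np.array([in_value[0]], np.int32))
            # Send as many responses as needed for this request...
            sender.send(pb_utils.InferenceResponse(output_tensors=[out_tensor]))
            # ...then close the sender so Triton knows no more responses follow.
            sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
        # A decoupled execute returns None instead of a list of responses.
        return None
```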
@@ -185,8 +203,7 @@ def response_thread(self, response_sender, in_input): # corresponding request. for idx in range(in_input[0]): - out_output = pb_utils.Tensor("OUT", np.array([in_input[0]], - np.int32)) + out_output = pb_utils.Tensor("OUT", np.array([in_input[0]], np.int32)) response = pb_utils.InferenceResponse(output_tensors=[out_output]) response_sender.send(response) @@ -194,8 +211,7 @@ def response_thread(self, response_sender, in_input): # done sending responses for the corresponding request. We can't use the # response sender after closing it. The response sender is closed by # setting the TRITONSERVER_RESPONSE_COMPLETE_FINAL. - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) with self.inflight_thread_count_lck: self.inflight_thread_count -= 1 @@ -208,17 +224,17 @@ def finalize(self): responses. """ - print('Finalize invoked') + print("Finalize invoked") inflight_threads = True cycles = 0 logging_time_sec = 5 sleep_time_sec = 0.1 - cycle_to_log = (logging_time_sec / sleep_time_sec) + cycle_to_log = logging_time_sec / sleep_time_sec while inflight_threads: with self.inflight_thread_count_lck: - inflight_threads = (self.inflight_thread_count != 0) - if (cycles % cycle_to_log == 0): + inflight_threads = self.inflight_thread_count != 0 + if cycles % cycle_to_log == 0: print( f"Waiting for {self.inflight_thread_count} response threads to complete..." ) @@ -226,4 +242,4 @@ def finalize(self): time.sleep(sleep_time_sec) cycles += 1 - print('Finalize complete...') + print("Finalize complete...") diff --git a/examples/instance_kind/README.md b/examples/instance_kind/README.md index 9eaa47f7..67b5e2a3 100644 --- a/examples/instance_kind/README.md +++ b/examples/instance_kind/README.md @@ -28,12 +28,12 @@ # Model Instance Kind Example -Triton model configuration allows users to provide kind to [instance group +Triton model configuration allows users to provide kind to [instance group settings.](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups) -A python backend model can be written to respect the kind setting to control +A python backend model can be written to respect the kind setting to control the execution of a model instance either on CPU or GPU. -In this example, we demonstrate how this can be achieved for your python model. +In this example, we demonstrate how this can be achieved for your python model. We will use a `ResNet50` model as our base model for this example. ## Create a ResNet50 model repository @@ -42,10 +42,10 @@ We will use the files that come with this example to create the model repository. First, download the [client.py](client.py), [config.pbtxt](config.pbtxt), -[resnet50_labels.txt](resnet50_labels.txt), and [model.py](model.py) +[resnet50_labels.txt](resnet50_labels.txt), and [model.py](model.py) to your local machine. -Next, in the same directory with the four aformentioned files, create the model +Next, in the same directory with the four aforementioned files, create the model repository with the following commands: ``` mkdir -p models/resnet50/1 && @@ -78,13 +78,13 @@ parts of this example. 
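Before starting the server, it may help to see how a Python model can react to the `instance_group` kind at load time. The sketch below is illustrative only (the example's actual `model.py` appears later); it assumes the `model_instance_kind` and `model_instance_device_id` entries that the Python backend passes to `initialize`:

```
# Illustrative sketch, not the shipped model.py: pick a torch device from
# the instance settings Triton passes to `initialize` via `args`.
import torch


class TritonPythonModel:
    def initialize(self, args):
        if args["model_instance_kind"] == "GPU":
            # Assumption: `model_instance_device_id` names the GPU assigned
            # to this instance, so multiple instances can use different GPUs.
            self.device = torch.device("cuda:" + args["model_instance_device_id"])
        else:
            self.device = torch.device("cpu")
```

With `KIND_CPU` in `config.pbtxt` this resolves to the CPU device; with `KIND_GPU` it resolves to the GPU Triton assigned to that instance.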
## Start the Triton Server -At the directory where we copied our resnet50 model (at where the "models" +At the directory where we copied our resnet50 model (at where the "models" folder is located), run the following command: ``` docker run --gpus all --shm-size 1G -it --rm -p 8000:8000 -v `pwd`:/instance_kind nvcr.io/nvidia/tritonserver:-py3 /bin/bash ``` -Inside the container, we need to install `torch` and `pillow` to run +Inside the container, we need to install `torch` and `pillow` to run this example. We recommend to use `pip` method for the installation: ``` @@ -105,8 +105,8 @@ To start the sdk container, run the following command: docker run --gpus all --network=host --pid=host --ipc=host -v `pwd`:/instance_kind -ti nvcr.io/nvidia/tritonserver:-py3-sdk /bin/bash ``` -The `client.py` requires the following packages to be installed: `torch`, -`torchvision`, `pillow` and `validators`. Similarly, we recommend to use `pip` +The `client.py` requires the following packages to be installed: `torch`, +`torchvision`, `pillow` and `validators`. Similarly, we recommend to use `pip` method for the installation: ``` @@ -123,18 +123,18 @@ Downloading: "https://github.com/NVIDIA/DeepLearningExamples/zipball/torchhub" t Results is class: TABBY PASS: ResNet50 ``` -It may take some time due to `torchhub` downloads, but any future calls +It may take some time due to `torchhub` downloads, but any future calls will be quicker, since the client will use already downloaded artifacts. ## Test Instance Kind -Provided `config.pbtxt` sets the instance group setting to `KIND_CPU`, -which enables the execution of a model on the CPU. +Provided `config.pbtxt` sets the instance group setting to `KIND_CPU`, +which enables the execution of a model on the CPU. To test that your model is actually loaded onto CPU, run the following: ``` python client.py -v ``` -The `-v` argument asks the client to request model's confiuration from +The `-v` argument asks the client to request model's confiuration from the server and prints it in your console: ``` { @@ -157,12 +157,12 @@ Results is class: TABBY PASS: ResNet50 instance kind ``` -Based on the printed model config, we can see that `instance_group` field -has `kind` entry, which is set to `KIND_CPU`. +Based on the printed model config, we can see that `instance_group` field +has `kind` entry, which is set to `KIND_CPU`. -To change an `instance_group` parameter to `KIND_GPU`, a user can simply replace -`KIND_CPU` with `KIND_GPU` in the `config.pbtxt`. After restarting the server -with an updated config file, a successful inference request with `-v` argument +To change an `instance_group` parameter to `KIND_GPU`, a user can simply replace +`KIND_CPU` with `KIND_GPU` in the `config.pbtxt`. After restarting the server +with an updated config file, a successful inference request with `-v` argument will result into the similar output, but with an updated `instance_group` entry: ``` { @@ -186,14 +186,14 @@ will result into the similar output, but with an updated `instance_group` entry: Results is class: TABBY PASS: ResNet50 instance kind ``` -It is also possible to load multiple model instances on CPU and GPU -if neccessary. +It is also possible to load multiple model instances on CPU and GPU +if necessary. -Below the instance group setting will create two model instances, +Below the instance group setting will create two model instances, one on CPU and other on GPU. 
``` instance_group [{ kind: KIND_CPU }, { kind: KIND_GPU}] ``` -For more information on possible model configurations, +For more information on possible model configurations, check out the Triton Server documentation [here](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#model-configuration) \ No newline at end of file diff --git a/examples/instance_kind/client.py b/examples/instance_kind/client.py index 376ee47f..f36c4e2b 100644 --- a/examples/instance_kind/client.py +++ b/examples/instance_kind/client.py @@ -34,46 +34,55 @@ import tritonclient.http as httpclient from tritonclient.utils import * -warnings.filterwarnings('ignore') +warnings.filterwarnings("ignore") if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--model_name", - type=str, - required=False, - default="resnet50", - help="Model name") - parser.add_argument("--image_url", - type=str, - required=False, - default=\ - "http://images.cocodataset.org/test2017/000000557146.jpg", - help=\ - "Image URL. Default is:\ - http://images.cocodataset.org/test2017/000000557146.jpg" + parser.add_argument( + "--model_name", + type=str, + required=False, + default="resnet50", + help="Model name", + ) + parser.add_argument( + "--image_url", + type=str, + required=False, + default="http://images.cocodataset.org/test2017/000000557146.jpg", + help="Image URL. Default is:\ + http://images.cocodataset.org/test2017/000000557146.jpg", + ) + parser.add_argument( + "--url", + type=str, + required=False, + default="localhost:8000", + help="Inference server URL. Default is localhost:8000.", + ) + parser.add_argument( + "-v", + "--verbose", + action="http://github.com/store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "--label_file", + type=str, + required=False, + default="./resnet50_labels.txt", + help="Path to the file with text representation \ + of available labels", ) - parser.add_argument("--url", - type=str, - required=False, - default="localhost:8000", - help="Inference server URL. 
Default is localhost:8000.") - parser.add_argument('-v', - "--verbose", - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument("--label_file", - type=str, - required=False, - default="./resnet50_labels.txt", - help="Path to the file with text representation \ - of available labels") args = parser.parse_args() - utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', - 'nvidia_convnets_processing_utils', - skip_validation=True) + utils = torch.hub.load( + "NVIDIA/DeepLearningExamples:torchhub", + "nvidia_convnets_processing_utils", + skip_validation=True, + ) try: triton_client = httpclient.InferenceServerClient(args.url) @@ -85,9 +94,7 @@ labels_dict = {idx: line.strip() for idx, line in enumerate(f)} if args.verbose: - print( - json.dumps(triton_client.get_model_config(args.model_name), - indent=4)) + print(json.dumps(triton_client.get_model_config(args.model_name), indent=4)) input_name = "INPUT" output_name = "OUTPUT" @@ -97,13 +104,13 @@ output = httpclient.InferRequestedOutput(output_name) input.set_data_from_numpy(batch) - results = triton_client.infer(model_name=args.model_name, - inputs=[input], - outputs=[output]) + results = triton_client.infer( + model_name=args.model_name, inputs=[input], outputs=[output] + ) output_data = results.as_numpy(output_name) max_id = np.argmax(output_data, axis=1)[0] print("Results is class: {}".format(labels_dict[max_id])) - print('PASS: ResNet50 instance kind') + print("PASS: ResNet50 instance kind") sys.exit(0) diff --git a/examples/instance_kind/config.pbtxt b/examples/instance_kind/config.pbtxt old mode 100755 new mode 100644 diff --git a/examples/instance_kind/model.py b/examples/instance_kind/model.py index 24f51cfc..801a8593 100644 --- a/examples/instance_kind/model.py +++ b/examples/instance_kind/model.py @@ -31,31 +31,34 @@ class TritonPythonModel: - def initialize(self, args): """ This function initializes pre-trained ResNet50 model, depending on the value specified by an `instance_group` parameter - in `config.pbtxt`. + in `config.pbtxt`. - Depending on what `instance_group` was specified in + Depending on what `instance_group` was specified in the config.pbtxt file (KIND_CPU or KIND_GPU), the model instance will be initialised on a cpu, a gpu, or both. If `instance_group` was - not specified in the config file, then models will be loaded onto + not specified in the config file, then models will be loaded onto the default device of the framework. """ - self.device = 'cuda' if args["model_instance_kind"] == "GPU" else 'cpu' + self.device = "cuda" if args["model_instance_kind"] == "GPU" else "cpu" # This example is configured to work with torch=1.13 # and torchvision=0.14. Thus, we need to provide a proper tag `0.14.1` # to make sure loaded Resnet50 is compatible with # installed `torchvision`. # Refer to README for installation instructions. 
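The device selection above can also honor the GPU ordinal Triton assigns to each instance. A minimal sketch, assuming the `model_instance_device_id` entry of `args` carries that ordinal for `KIND_GPU` instances:

```
# Illustrative only: map the instance-kind information passed to initialize()
# onto a torch device string.
device = "cpu"
if args["model_instance_kind"] == "GPU":
    # e.g. "cuda:0" for an instance placed on the first GPU
    device = "cuda:" + args["model_instance_device_id"]
```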
- self.model = torch.hub.load("pytorch/vision:v0.14.1", - "resnet50", - weights="IMAGENET1K_V2", - skip_validation=True)\ - .to(self.device)\ - .eval() + self.model = ( + torch.hub.load( + "pytorch/vision:v0.14.1", + "resnet50", + weights="IMAGENET1K_V2", + skip_validation=True, + ) + .to(self.device) + .eval() + ) def execute(self, requests): """ @@ -67,9 +70,8 @@ def execute(self, requests): input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT") with torch.no_grad(): result = self.model( - torch.as_tensor(input_tensor.as_numpy(), - device=self.device)) - out_tensor = pb_utils.Tensor.from_dlpack("OUTPUT", - to_dlpack(result)) + torch.as_tensor(input_tensor.as_numpy(), device=self.device) + ) + out_tensor = pb_utils.Tensor.from_dlpack("OUTPUT", to_dlpack(result)) responses.append(pb_utils.InferenceResponse([out_tensor])) return responses diff --git a/examples/instance_kind/resnet50_labels.txt b/examples/instance_kind/resnet50_labels.txt old mode 100755 new mode 100644 index e59113f7..2376a285 --- a/examples/instance_kind/resnet50_labels.txt +++ b/examples/instance_kind/resnet50_labels.txt @@ -517,7 +517,7 @@ COWBOY HAT CRADLE CRANE CRASH HELMET -CRATE +CREATE CRIB CROCK POT CROQUET BALL diff --git a/examples/jax/client.py b/examples/jax/client.py index d3c19a8f..a53d17e9 100644 --- a/examples/jax/client.py +++ b/examples/jax/client.py @@ -24,23 +24,25 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from tritonclient.utils import * -import tritonclient.http as httpclient import sys + import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import * model_name = "jax" shape = [4] with httpclient.InferenceServerClient("localhost:8000") as client: - input0_data = np.random.rand(*shape).astype(np.float32) input1_data = np.random.rand(*shape).astype(np.float32) inputs = [ - httpclient.InferInput("INPUT0", input0_data.shape, - np_to_triton_dtype(input0_data.dtype)), - httpclient.InferInput("INPUT1", input1_data.shape, - np_to_triton_dtype(input1_data.dtype)), + httpclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + httpclient.InferInput( + "INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype) + ), ] inputs[0].set_data_from_numpy(input0_data) @@ -51,19 +53,22 @@ httpclient.InferRequestedOutput("OUTPUT1"), ] - response = client.infer(model_name, - inputs, - request_id=str(1), - outputs=outputs) + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) result = response.get_response() output0_data = response.as_numpy("OUTPUT0") output1_data = response.as_numpy("OUTPUT1") - print("INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( - input0_data, input1_data, output0_data)) - print("INPUT0 ({}) - INPUT1 ({}) = OUTPUT0 ({})".format( - input0_data, input1_data, output1_data)) + print( + "INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output0_data + ) + ) + print( + "INPUT0 ({}) - INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output1_data + ) + ) if not np.allclose(input0_data + input1_data, output0_data): print("jax example error: incorrect sum") @@ -73,5 +78,5 @@ print("jax example error: incorrect difference") sys.exit(1) - print('PASS: jax') + print("PASS: jax") sys.exit(0) diff --git a/examples/jax/model.py b/examples/jax/model.py index b6ea2d35..d6840dc9 100644 --- a/examples/jax/model.py +++ b/examples/jax/model.py @@ -25,8 +25,9 @@ # 
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import json -import numpy as np + import jax.numpy as jnp +import numpy as np # triton_python_backend_utils is available in every Triton Python model. You # need to use this module to create inference requests and responses. It also @@ -54,7 +55,7 @@ class TritonPythonModel: def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. Parameters ---------- @@ -69,21 +70,21 @@ def initialize(self, args): """ # You must parse model_config. JSON string is not parsed here - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) # Get OUTPUT0 configuration - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT0") + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") # Get OUTPUT1 configuration - output1_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT1") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") # Convert Triton types to numpy types self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) + output0_config["data_type"] + ) self.output1_dtype = pb_utils.triton_string_to_numpy( - output1_config['data_type']) + output1_config["data_type"] + ) def execute(self, requests): """`execute` must be implemented in every Python model. `execute` @@ -125,11 +126,11 @@ def execute(self, requests): # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. out_tensor_0 = pb_utils.Tensor( - "OUTPUT0", - np.array(out_0).astype(output0_dtype)) + "OUTPUT0", np.array(out_0).astype(output0_dtype) + ) out_tensor_1 = pb_utils.Tensor( - "OUTPUT1", - np.array(out_1).astype(output1_dtype)) + "OUTPUT1", np.array(out_1).astype(output1_dtype) + ) # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. @@ -137,9 +138,10 @@ def execute(self, requests): # response: # # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occured")) + # output_tensors=..., TritonError("An error occurred")) inference_response = pb_utils.InferenceResponse( - output_tensors=[out_tensor_0, out_tensor_1]) + output_tensors=[out_tensor_0, out_tensor_1] + ) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. Length @@ -151,4 +153,4 @@ def finalize(self): Implementing `finalize` function is optional. This function allows the model to perform any necessary clean ups before exit. """ - print('Cleaning up...') + print("Cleaning up...") diff --git a/examples/preprocessing/README.md b/examples/preprocessing/README.md index 18035192..c1b55529 100644 --- a/examples/preprocessing/README.md +++ b/examples/preprocessing/README.md @@ -8,13 +8,13 @@ Run onnx_exporter.py to convert ResNet50 PyTorch model to ONNX format. Width and $ docker run -it --gpus=all -v $(pwd):/workspace nvcr.io/nvidia/pytorch:xx.yy-py3 bash $ pip install numpy pillow torchvision $ python onnx_exporter.py --save model.onnx - + **2. 
Create the model repository:** $ mkdir -p model_repository/ensemble_python_resnet50/1 $ mkdir -p model_repository/preprocess/1 $ mkdir -p model_repository/resnet50_trt/1 - + # Copy the Python model $ cp model.py model_repository/preprocess/1 @@ -31,13 +31,13 @@ Under python_backend/examples/preprocessing, run this command to start the serve $ docker run --gpus=all -it --rm -p8000:8000 -p8001:8001 -p8002:8002 -v$(pwd):/workspace/ -v/$(pwd)/model_repository:/models nvcr.io/nvidia/tritonserver:xx.yy-py3 bash $ pip install numpy pillow torchvision $ tritonserver --model-repository=/models - + **5. Start the client to test:** Under python_backend/examples/preprocessing, run the commands below to start the client Docker container: $ wget https://raw.githubusercontent.com/triton-inference-server/server/main/qa/images/mug.jpg -O "mug.jpg" - $ docker run --rm --net=host -v $(pwd):/workspace/ nvcr.io/nvidia/tritonserver:xx.yy-py3-sdk python client.py --image mug.jpg - $ The result of classification is:COFFEE MUG + $ docker run --rm --net=host -v $(pwd):/workspace/ nvcr.io/nvidia/tritonserver:xx.yy-py3-sdk python client.py --image mug.jpg + $ The result of classification is:COFFEE MUG Here, since we input an image of "mug" and the inference result is "COFFEE MUG" which is correct. diff --git a/examples/preprocessing/client.py b/examples/preprocessing/client.py index dc0ebf0d..202d411a 100644 --- a/examples/preprocessing/client.py +++ b/examples/preprocessing/client.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,53 +24,59 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import os, sys -import numpy as np +import argparse import json +import sys + +import numpy as np import tritongrpcclient -import argparse def load_image(img_path: str): """ Loads an encoded image as an array of bytes. - + """ - return np.fromfile(img_path, dtype='uint8') + return np.fromfile(img_path, dtype="uint8") if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--model_name", - type=str, - required=False, - default="ensemble_python_resnet50", - help="Model name") - parser.add_argument("--image", - type=str, - required=True, - help="Path to the image") - parser.add_argument("--url", - type=str, - required=False, - default="localhost:8001", - help="Inference server URL. Default is localhost:8001.") - parser.add_argument('-v', - "--verbose", - action="http://github.com/store_true", - required=False, - default=False, - help='Enable verbose output') + parser.add_argument( + "--model_name", + type=str, + required=False, + default="ensemble_python_resnet50", + help="Model name", + ) + parser.add_argument("--image", type=str, required=True, help="Path to the image") + parser.add_argument( + "--url", + type=str, + required=False, + default="localhost:8001", + help="Inference server URL. 
Default is localhost:8001.", + ) + parser.add_argument( + "-v", + "--verbose", + action="http://github.com/store_true", + required=False, + default=False, + help="Enable verbose output", + ) parser.add_argument( "--label_file", type=str, default="./model_repository/resnet50_trt/labels.txt", - help="Path to the file with text representation of available labels") + help="Path to the file with text representation of available labels", + ) args = parser.parse_args() try: triton_client = tritongrpcclient.InferenceServerClient( - url=args.url, verbose=args.verbose) + url=args.url, verbose=args.verbose + ) except Exception as e: print("channel creation failed: " + str(e)) sys.exit(1) @@ -85,14 +91,13 @@ def load_image(img_path: str): image_data = load_image(args.image) image_data = np.expand_dims(image_data, axis=0) - inputs.append( - tritongrpcclient.InferInput(input_name, image_data.shape, "UINT8")) + inputs.append(tritongrpcclient.InferInput(input_name, image_data.shape, "UINT8")) outputs.append(tritongrpcclient.InferRequestedOutput(output_name)) inputs[0].set_data_from_numpy(image_data) - results = triton_client.infer(model_name=args.model_name, - inputs=inputs, - outputs=outputs) + results = triton_client.infer( + model_name=args.model_name, inputs=inputs, outputs=outputs + ) output0_data = results.as_numpy(output_name) print(output0_data) diff --git a/examples/preprocessing/model.py b/examples/preprocessing/model.py index d4117e2f..90259978 100644 --- a/examples/preprocessing/model.py +++ b/examples/preprocessing/model.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,20 +24,18 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import numpy as np -import sys -import json import io +import json + +import numpy as np +import torchvision.transforms as transforms # triton_python_backend_utils is available in every Triton Python model. You # need to use this module to create inference requests and responses. It also # contains some utility functions for extracting information from model_config # and converting Triton input/output types to numpy types. import triton_python_backend_utils as pb_utils - from PIL import Image -import torchvision.transforms as transforms -import os class TritonPythonModel: @@ -48,7 +46,7 @@ class TritonPythonModel: def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. Parameters ---------- @@ -63,15 +61,15 @@ def initialize(self, args): """ # You must parse model_config. 
JSON string is not parsed here - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) # Get OUTPUT0 configuration - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT_0") + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT_0") # Convert Triton types to numpy types self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) + output0_config["data_type"] + ) def execute(self, requests): """`execute` MUST be implemented in every Python model. `execute` @@ -105,18 +103,22 @@ def execute(self, requests): # Get INPUT0 in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT_0") - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) + normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) - loader = transforms.Compose([ - transforms.Resize([224, 224]), - transforms.CenterCrop(224), - transforms.ToTensor(), normalize - ]) + loader = transforms.Compose( + [ + transforms.Resize([224, 224]), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ] + ) def image_loader(image_name): image = loader(image_name) - #expand the dimension to nchw + # expand the dimension to nchw image = image.unsqueeze(0) return image @@ -126,8 +128,7 @@ def image_loader(image_name): img_out = image_loader(image) img_out = np.array(img_out) - out_tensor_0 = pb_utils.Tensor("OUTPUT_0", - img_out.astype(output0_dtype)) + out_tensor_0 = pb_utils.Tensor("OUTPUT_0", img_out.astype(output0_dtype)) # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. @@ -135,9 +136,10 @@ def image_loader(image_name): # response: # # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occured")) + # output_tensors=..., TritonError("An error occurred")) inference_response = pb_utils.InferenceResponse( - output_tensors=[out_tensor_0]) + output_tensors=[out_tensor_0] + ) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. Length @@ -149,4 +151,4 @@ def finalize(self): Implementing `finalize` function is OPTIONAL. This function allows the model to perform any necessary clean ups before exit. 
""" - print('Cleaning up...') + print("Cleaning up...") diff --git a/examples/preprocessing/model_repository/preprocess/config.pbtxt b/examples/preprocessing/model_repository/preprocess/config.pbtxt index 1125dea3..fcfbd93b 100644 --- a/examples/preprocessing/model_repository/preprocess/config.pbtxt +++ b/examples/preprocessing/model_repository/preprocess/config.pbtxt @@ -26,7 +26,7 @@ name: "preprocess" backend: "python" -max_batch_size: 256 +max_batch_size: 256 input [ { name: "INPUT_0" @@ -34,7 +34,7 @@ input [ dims: [ -1 ] } ] - + output [ { name: "OUTPUT_0" diff --git a/examples/preprocessing/model_repository/resnet50_trt/config.pbtxt b/examples/preprocessing/model_repository/resnet50_trt/config.pbtxt index d464d582..a4b94402 100644 --- a/examples/preprocessing/model_repository/resnet50_trt/config.pbtxt +++ b/examples/preprocessing/model_repository/resnet50_trt/config.pbtxt @@ -32,7 +32,7 @@ input [ name: "input" data_type: TYPE_FP32 dims: [3, -1, -1 ] - + } ] output[ diff --git a/examples/preprocessing/model_repository/resnet50_trt/labels.txt b/examples/preprocessing/model_repository/resnet50_trt/labels.txt index e59113f7..2376a285 100644 --- a/examples/preprocessing/model_repository/resnet50_trt/labels.txt +++ b/examples/preprocessing/model_repository/resnet50_trt/labels.txt @@ -517,7 +517,7 @@ COWBOY HAT CRADLE CRANE CRASH HELMET -CRATE +CREATE CRIB CROCK POT CROQUET BALL diff --git a/examples/preprocessing/onnx_exporter.py b/examples/preprocessing/onnx_exporter.py index 9148e4e6..3be47b57 100644 --- a/examples/preprocessing/onnx_exporter.py +++ b/examples/preprocessing/onnx_exporter.py @@ -24,11 +24,12 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import torch -import torchvision.models as models import argparse import os +import torch +import torchvision.models as models + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--save", default="model.onnx") @@ -38,23 +39,19 @@ dummy_input = torch.randn(1, 3, 224, 224) resnet50 = resnet50.eval() - torch.onnx.export(resnet50, - dummy_input, - args.save, - export_params=True, - opset_version=10, - do_constant_folding=True, - input_names=['input'], - output_names=['output'], - dynamic_axes={ - 'input': { - 0: 'batch_size', - 2: "height", - 3: 'width' - }, - 'output': { - 0: 'batch_size' - } - }) + torch.onnx.export( + resnet50, + dummy_input, + args.save, + export_params=True, + opset_version=10, + do_constant_folding=True, + input_names=["input"], + output_names=["output"], + dynamic_axes={ + "input": {0: "batch_size", 2: "height", 3: "width"}, + "output": {0: "batch_size"}, + }, + ) print("Saved {}".format(args.save)) diff --git a/examples/pytorch/client.py b/examples/pytorch/client.py index ee29b5fe..af1abd39 100644 --- a/examples/pytorch/client.py +++ b/examples/pytorch/client.py @@ -24,10 +24,11 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from tritonclient.utils import * -import tritonclient.http as httpclient import sys + import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import * model_name = "pytorch" shape = [4] @@ -36,10 +37,12 @@ input0_data = np.random.rand(*shape).astype(np.float32) input1_data = np.random.rand(*shape).astype(np.float32) inputs = [ - httpclient.InferInput("INPUT0", input0_data.shape, - np_to_triton_dtype(input0_data.dtype)), - httpclient.InferInput("INPUT1", input1_data.shape, - np_to_triton_dtype(input1_data.dtype)), + httpclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + httpclient.InferInput( + "INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype) + ), ] inputs[0].set_data_from_numpy(input0_data) @@ -50,19 +53,22 @@ httpclient.InferRequestedOutput("OUTPUT1"), ] - response = client.infer(model_name, - inputs, - request_id=str(1), - outputs=outputs) + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) result = response.get_response() output0_data = response.as_numpy("OUTPUT0") output1_data = response.as_numpy("OUTPUT1") - print("INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( - input0_data, input1_data, output0_data)) - print("INPUT0 ({}) - INPUT1 ({}) = OUTPUT0 ({})".format( - input0_data, input1_data, output1_data)) + print( + "INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output0_data + ) + ) + print( + "INPUT0 ({}) - INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output1_data + ) + ) if not np.allclose(input0_data + input1_data, output0_data): print("pytorch example error: incorrect sum") @@ -72,5 +78,5 @@ print("pytorch example error: incorrect difference") sys.exit(1) - print('PASS: pytorch') + print("PASS: pytorch") sys.exit(0) diff --git a/examples/pytorch/model.py b/examples/pytorch/model.py index 3383acc0..89b0c8a2 100644 --- a/examples/pytorch/model.py +++ b/examples/pytorch/model.py @@ -25,13 +25,13 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import json -from torch import nn # triton_python_backend_utils is available in every Triton Python model. You # need to use this module to create inference requests and responses. It also # contains some utility functions for extracting information from model_config # and converting Triton input/output types to numpy types. import triton_python_backend_utils as pb_utils +from torch import nn class AddSubNet(nn.Module): @@ -55,7 +55,7 @@ class TritonPythonModel: def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. Parameters ---------- @@ -70,21 +70,21 @@ def initialize(self, args): """ # You must parse model_config. 
JSON string is not parsed here - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) # Get OUTPUT0 configuration - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT0") + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") # Get OUTPUT1 configuration - output1_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT1") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") # Convert Triton types to numpy types self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) + output0_config["data_type"] + ) self.output1_dtype = pb_utils.triton_string_to_numpy( - output1_config['data_type']) + output1_config["data_type"] + ) # Instantiate the PyTorch model self.add_sub_model = AddSubNet() @@ -128,10 +128,8 @@ def execute(self, requests): # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. - out_tensor_0 = pb_utils.Tensor("OUTPUT0", - out_0.astype(output0_dtype)) - out_tensor_1 = pb_utils.Tensor("OUTPUT1", - out_1.astype(output1_dtype)) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. @@ -139,9 +137,10 @@ def execute(self, requests): # response: # # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occured")) + # output_tensors=..., TritonError("An error occurred")) inference_response = pb_utils.InferenceResponse( - output_tensors=[out_tensor_0, out_tensor_1]) + output_tensors=[out_tensor_0, out_tensor_1] + ) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. Length @@ -153,4 +152,4 @@ def finalize(self): Implementing `finalize` function is optional. This function allows the model to perform any necessary clean ups before exit. """ - print('Cleaning up...') + print("Cleaning up...") diff --git a/inferentia/README.md b/inferentia/README.md index 77a00d2c..6a90740d 100644 --- a/inferentia/README.md +++ b/inferentia/README.md @@ -29,7 +29,7 @@ # Using Triton with Inferentia 1 Starting from 21.11 release, Triton supports -[AWS Inferentia](https://aws.amazon.com/machine-learning/inferentia/) +[AWS Inferentia](https://aws.amazon.com/machine-learning/inferentia/) and the [Neuron Runtime](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-intro/get-started.html). ## Table of Contents @@ -65,7 +65,7 @@ Clone this repo with Github to home repo `/home/ubuntu`. ``` Then, start the Triton instance with: -``` +``` $docker run --device /dev/neuron0 -v /home/ubuntu/python_backend:/home/ubuntu/python_backend -v /lib/udev:/mylib/udev --shm-size=1g --ulimit memlock=-1 -p 8000:8000 -p 8001:8001 -p 8002:8002 --ulimit stack=67108864 -ti nvcr.io/nvidia/tritonserver:-py3 ``` Note 1: The user would need to list any neuron device to run during container initialization. @@ -73,11 +73,11 @@ For example, to use 4 neuron devices on an instance, the user would need to run ``` $docker run --device /dev/neuron0 --device /dev/neuron1 --device /dev/neuron2 --device /dev/neuron3 ...` ``` -Note 2: `/mylib/udev` is used for Neuron parameter passing. +Note 2: `/mylib/udev` is used for Neuron parameter passing. 
-Note 3: For Triton container version xx.yy, please refer to +Note 3: For Triton container version xx.yy, please refer to [Triton Inference Server Container Release Notes](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/index.html). - The current build script has been tested with container version `21.10`. + The current build script has been tested with container version `21.10`. After starting the Triton container, go into the `python_backend` folder and run the setup script. ``` @@ -88,17 +88,17 @@ This script will: 2. Install [neuron-cc](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-cc/index.html), the Neuron compiler. 3. Install neuron framework packages as per your preference e.g., either pytorch, or tensorflow or both. -There are user configurable options available for the script as well. +There are user configurable options available for the script as well. Please use the `-h` or `--help` options to learn about more configurable options. ## Setting up the Inferentia model Currently, we only support [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html) and [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/tensorflow-neuron/index.html) -workflows for execution on inferentia. +workflows for execution on inferentia. -The user is required to create their own `*.pt` (for pytorch) or `*.savedmodels` -(for tensorflow) models. This is a critical step since Inferentia will need +The user is required to create their own `*.pt` (for pytorch) or `*.savedmodels` +(for tensorflow) models. This is a critical step since Inferentia will need the underlying `.NEFF` graph to execute the inference request. Please refer to: - [Neuron compiler CLI Reference Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-cc/command-line-reference.html) @@ -228,13 +228,13 @@ their need. ### Using Triton's Dynamic Batching To enable dynamic batching, `--enable_dynamic_batching` -flag needs to be specified. `gen_triton_model.py` supports following three +flag needs to be specified. `gen_triton_model.py` supports following three options for configuring [Triton's dynamic batching](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md): 1. `--preferred_batch_size`: Please refer to [model configuration documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#preferred-batch-sizes) for details on preferred batch size. To optimize performance, this is recommended to be multiples of engaged neuron cores. For example, if each instance is using 2 neuron cores, `preferred_batch_size` - could be 2, 4 or 6. + could be 2, 4 or 6. 2. `--max_queue_delay_microseconds`: Please refer to [model configuration documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#delayed-batching) for details. 3. `--disable_batch_requests_to_neuron`: Enable the non-default way for Triton to @@ -256,7 +256,7 @@ requires an instance with more than 8 inferentia cores to run, eg:`inf1.6xlarge` start the test, run ``` $source /python_backend/inferentia/qa/setup_test_enviroment_and_test.sh -``` +``` where `` is usually `/home/ubuntu`/. This script will pull the [server repo](https://github.com/triton-inference-server/server) that contains the tests for inferentia. 
It will then build the most recent @@ -266,7 +266,7 @@ Note: If you would need to change some of the tests in the server repo, you would need to run ``` $export TRITON_SERVER_REPO_TAG= -``` +``` before running the script. # Using Triton with Inferentia 2, or Trn1 @@ -291,11 +291,11 @@ python3 inferentia/scripts/gen_triton_model.py --inf2 --model_type pytorch --tri ``` 4. **Note**: When using the `--inf2` option, the `--compiled_model` path should be provided relative to the triton model directory. The `initialize()` function in model.py will derive the full path by concatenating the model path within the repository and the relative `--compiled_model` path. ## transformers-neuronx -To use inf2/trn1 instances with transformers-neuronx packages for serving models, generate a `pytorch` model as per above instructions. The transformers-neuronx currently supports the models listed [here](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/transformers-neuronx/readme.html#currently-supported-models). +To use inf2/trn1 instances with transformers-neuronx packages for serving models, generate a `pytorch` model as per above instructions. The transformers-neuronx currently supports the models listed [here](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/transformers-neuronx/readme.html#currently-supported-models). -As prescribed on the neuronx documentation page, while the neuronx load API differs per model, it follows the same pattern. +As prescribed on the neuronx documentation page, while the neuronx load API differs per model, it follows the same pattern. -1. To serve transformers-neuronx models, first trace the model using `save_pretrained_split()` API on an inf2 instance (recommed inf2.24xl for Large Language Models). Following that, package the folder as the '--compiled_model' when using `gen_triton_model.py` file. +1. To serve transformers-neuronx models, first trace the model using `save_pretrained_split()` API on an inf2 instance (recommend inf2.24xl for Large Language Models). Following that, package the folder as the '--compiled_model' when using `gen_triton_model.py` file. 2. The following tree shows a sample model structure for OPT model: ``` opt/ diff --git a/inferentia/qa/setup_test_enviroment_and_test.sh b/inferentia/qa/setup_test_enviroment_and_test.sh old mode 100644 new mode 100755 index 7972dae7..cf6057ac --- a/inferentia/qa/setup_test_enviroment_and_test.sh +++ b/inferentia/qa/setup_test_enviroment_and_test.sh @@ -51,7 +51,7 @@ UPSTREAM_CONTAINER_VERSION="" USAGE=" usage: setup_test_enviroment_and_test.sh [options]. These setting will override exported variables -Setup enviroment for testing on Inferentia chips and run perf analyzer tests. +Setup environment for testing on Inferentia chips and run perf analyzer tests. -h|--help Shows usage -d|--default-repo-tag DEFAULT_REPO_TAG for building the test container. Default is main -s|--server-repo-tag TRITON_SERVER_REPO_TAG for building test container. 
Default same DEFAULT_REPO_TAG @@ -113,7 +113,7 @@ cd ${TRITON_PATH}/server git clone --single-branch --depth=1 -b ${TRITON_CLIENT_REPO_TAG} \ https://github.com/triton-inference-server/client.git clientrepo -# First set up inferentia and run in detatched mode +# First set up inferentia and run in detached mode cd ${TRITON_PATH}/python_backend chmod 777 ${TRITON_PATH}/python_backend/inferentia/scripts/setup-pre-container.sh sudo ${TRITON_PATH}/python_backend/inferentia/scripts/setup-pre-container.sh @@ -131,7 +131,7 @@ if [ "${UPSTREAM_CONTAINER_VERSION}" = "" ]; then echo "found upstream container version: ${UPSTREAM_CONTAINER_VERSION} from build.py" fi -# Build container with only python backend +# Build container with only python backend cd ${TRITON_PATH}/server pip3 install docker ./build.py --container-version=${CONTAINER_VERSION} \ diff --git a/inferentia/scripts/gen_triton_model.py b/inferentia/scripts/gen_triton_model.py index 75f0425b..caa2450c 100644 --- a/inferentia/scripts/gen_triton_model.py +++ b/inferentia/scripts/gen_triton_model.py @@ -30,6 +30,7 @@ def tf_to_triton_dtype(dtype): import tensorflow as tf + if dtype == tf.float16: return "FP16" elif dtype == tf.float32: @@ -62,12 +63,13 @@ def tf_to_triton_dtype(dtype): def parse_tf_tensors(saved_model_dir, tag_set, signature_def_key): from tensorflow.python.tools import saved_model_utils - meta_graph_def = saved_model_utils.get_meta_graph_def( - saved_model_dir, tag_set) + + meta_graph_def = saved_model_utils.get_meta_graph_def(saved_model_dir, tag_set) input_dict = {} input_signatures = list( - meta_graph_def.signature_def[signature_def_key].inputs.values()) + meta_graph_def.signature_def[signature_def_key].inputs.values() + ) for input_signature in input_signatures: datatype = tf_to_triton_dtype(input_signature.dtype) shape = [] @@ -77,7 +79,8 @@ def parse_tf_tensors(saved_model_dir, tag_set, signature_def_key): output_dict = {} output_signatures = list( - meta_graph_def.signature_def[signature_def_key].outputs.values()) + meta_graph_def.signature_def[signature_def_key].outputs.values() + ) for output_signature in output_signatures: datatype = tf_to_triton_dtype(output_signature.dtype) shape = [] @@ -98,61 +101,81 @@ def parse_io_tensors(tensors): def get_parameter_spec(key1, value): - param_spec = "parameters: {{key: \"{}\", value: {{string_value: \"{}\"}}}} \n".format( - key1, value) + param_spec = 'parameters: {{key: "{}", value: {{string_value: "{}"}}}} \n'.format( + key1, value + ) return param_spec -def create_modelconfig(model_name, max_batch_size, inputs, outputs, - compiled_model_path, nc_start_idx, nc_end_idx, - threads_per_core, instance_count, - enable_dynamic_batching, preferred_batch_size, - max_queue_delay_microseconds): - config = "name: \"{}\"\n".format(model_name) - config += "backend: \"python\"\n" +def create_modelconfig( + model_name, + max_batch_size, + inputs, + outputs, + compiled_model_path, + nc_start_idx, + nc_end_idx, + threads_per_core, + instance_count, + enable_dynamic_batching, + preferred_batch_size, + max_queue_delay_microseconds, +): + config = 'name: "{}"\n'.format(model_name) + config += 'backend: "python"\n' config += "max_batch_size: {}\n".format(max_batch_size) if enable_dynamic_batching: - config += ''' + config += """ dynamic_batching { -''' +""" if preferred_batch_size is not None: - config += ''' + config += """ preferred_batch_size: {} -'''.format(preferred_batch_size) +""".format( + preferred_batch_size + ) if max_queue_delay_microseconds is not None: - config += ''' + config 
+= """ max_queue_delay_microseconds: {} -'''.format(max_queue_delay_microseconds) - config += ''' -}\n''' +""".format( + max_queue_delay_microseconds + ) + config += """ +}\n""" for input_name in inputs.keys(): data_type, shape = inputs[input_name] - config += ''' + config += """ input [ {{ name: \"{}\" data_type: {} dims: {} }} -]\n'''.format(input_name, "TYPE_" + data_type, shape) +]\n""".format( + input_name, "TYPE_" + data_type, shape + ) for output_name in outputs.keys(): data_type, shape = outputs[output_name] - config += ''' + config += """ output [ {{ name: \"{}\" data_type: {} dims: {} }} -]\n'''.format(output_name, "TYPE_" + data_type, shape) - config += ''' +]\n""".format( + output_name, "TYPE_" + data_type, shape + ) + config += """ instance_group [ {{ kind: KIND_MODEL count: {} }} -]\n'''.format(instance_count) +]\n""".format( + instance_count + ) config += get_parameter_spec("COMPILED_MODEL", compiled_model_path) config += get_parameter_spec("NEURON_CORE_START_INDEX", nc_start_idx) config += get_parameter_spec("NEURON_CORE_END_INDEX", nc_end_idx) @@ -161,7 +184,7 @@ def create_modelconfig(model_name, max_batch_size, inputs, outputs, def get_model_license(): - lic = '''# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + lic = """# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -186,7 +209,7 @@ def get_model_license(): # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - ''' + """ return lic @@ -195,7 +218,7 @@ def get_common_initialize_impl(): def initialize(self, args): """`initialize` is called only once when the model is being loaded. Implementing `initialize` function is optional. This function allows - the model to intialize any state associated with this model. + the model to initialize any state associated with this model. 
Parameters ---------- @@ -258,7 +281,7 @@ def initialize(self, args): def get_tensorflow_initialize_impl(is_inf2=False): init_impl = get_common_initialize_impl() - init_impl += ''' + init_impl += """ self.input_list = [] for config_input in model_config['input']: self.input_list.append( @@ -272,27 +295,27 @@ def get_tensorflow_initialize_impl(is_inf2=False): config_output['dims'])) os.environ["NEURON_RT_NUM_CORES"] = str(cores_per_instance) -''' +""" if is_inf2: - init_impl += ''' + init_impl += """ compiled_model = os.path.join(args['model_repository'], compiled_model) self.pred_list = [ tf.keras.models.load_model(compiled_model) for _ in range(cores_per_instance) - ] * threads_per_core -''' + ] * threads_per_core +""" else: - init_impl += ''' + init_impl += """ self.pred_list = [ tf.contrib.predictor.from_saved_model(compiled_model) for _ in range(cores_per_instance) ] * threads_per_core -''' +""" return init_impl def get_pytorch_initialize_impl(is_inf2=False): - init_impl = ''' + init_impl = """ def _validate_and_get_index(self, name): parts = name.split('__') if len(parts) != 2: @@ -318,9 +341,9 @@ def _validate_output_dict(self, expected_count): if i not in self.output_dict: raise pb_utils.TritonModelException( "output corresponding to index {} not found".format(i)) -''' +""" init_impl += get_common_initialize_impl() - init_impl += ''' + init_impl += """ self.input_dict = {} expected_input_count = 0 for config_input in model_config['input']: @@ -348,20 +371,20 @@ def _validate_output_dict(self, expected_count): os.environ["NEURON_RT_VISIBLE_CORES"] = cores_range consumed_cores_list = [i for i in range(cores_per_instance)] -''' +""" if is_inf2: - init_impl += ''' + init_impl += """ compiled_model = os.path.join(args['model_repository'], compiled_model) self.model_neuron = torch.jit.load(compiled_model) -''' +""" else: - init_impl += ''' + init_impl += """ self.model_neuron = torch.neuron.DataParallel( - torch.jit.load(compiled_model), device_ids=consumed_cores_list) -''' - init_impl += ''' + torch.jit.load(compiled_model), device_ids=consumed_cores_list) +""" + init_impl += """ self.model_neuron.num_workers = num_threads -''' +""" return init_impl @@ -394,7 +417,7 @@ def execute(self, requests): """ ''' if disable_batch_requests_to_neuron: - exec_impl += ''' + exec_impl += """ responses = [] num_threads = len(self.pred_list) model_feed_dict_list = [{} for _ in range(num_threads)] @@ -436,9 +459,9 @@ def execute(self, requests): output_tensors=output_tensors) responses.append(inference_response) return responses -''' +""" else: - exec_impl += ''' + exec_impl += """ responses = [] num_threads = len(self.pred_list) model_feed_dict_list = [{} for _ in range(num_threads)] @@ -483,7 +506,7 @@ def execute(self, requests): full_tensor = np.concatenate( (full_tensor, out_list[idx + 1]), axis=0) chuncky_tensors.append(np.split(full_tensor, request_batch_sizes, axis=0)) - + for i in range(num_requests): output_tensors = [] for j in range(len(self.output_list)): @@ -498,7 +521,7 @@ def execute(self, requests): responses.append(inference_response) return responses -''' +""" return exec_impl @@ -527,7 +550,7 @@ def execute(self, requests): """ ''' if disable_batch_requests_to_neuron: - exec_impl += ''' + exec_impl += """ responses = [] for request in requests: inputs = [] @@ -549,9 +572,9 @@ def execute(self, requests): output_tensors=output_tensors) responses.append(inference_response) return responses -''' +""" else: - exec_impl += ''' + exec_impl += """ responses = [] inputs = [] num_requests = 
len(requests) @@ -588,7 +611,7 @@ def execute(self, requests): responses.append(inference_response) return responses -''' +""" return exec_impl @@ -605,9 +628,9 @@ def finalize(self): return finalize_impl -def get_triton_python_model_impl(using_tensorflow_model, - disable_batch_requests_to_neuron, - is_inf2=False): +def get_triton_python_model_impl( + using_tensorflow_model, disable_batch_requests_to_neuron, is_inf2=False +): triton_pmi = ''' class TritonPythonModel: """Your Python model must use the same class name. Every Python model @@ -617,8 +640,7 @@ class TritonPythonModel: if using_tensorflow_model: triton_pmi += get_tensorflow_initialize_impl(is_inf2) - triton_pmi += get_tensorflow_execute_impl( - disable_batch_requests_to_neuron) + triton_pmi += get_tensorflow_execute_impl(disable_batch_requests_to_neuron) else: triton_pmi += get_pytorch_initialize_impl(is_inf2) triton_pmi += get_pytorch_execute_impl(disable_batch_requests_to_neuron) @@ -628,141 +650,154 @@ class TritonPythonModel: return triton_pmi -def create_model_file(using_tensorflow_model, - disable_batch_requests_to_neuron, - is_inf2=False): +def create_model_file( + using_tensorflow_model, disable_batch_requests_to_neuron, is_inf2=False +): triton_model = get_model_license() - triton_model += ''' + triton_model += """ import json import numpy as np import os import sys import triton_python_backend_utils as pb_utils -''' +""" if using_tensorflow_model: - triton_model += ''' + triton_model += """ import tensorflow as tf from concurrent import futures -''' +""" else: - triton_model += ''' + triton_model += """ import torch - ''' + """ if not is_inf2: - triton_model += ''' + triton_model += """ import torch.neuron - ''' + """ else: - triton_model += ''' + triton_model += """ import torch_neuronx -''' +""" triton_model += get_triton_python_model_impl( - using_tensorflow_model, disable_batch_requests_to_neuron, is_inf2) + using_tensorflow_model, disable_batch_requests_to_neuron, is_inf2 + ) return triton_model -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - '--inf2', + "--inf2", required=False, default=False, - action='http://github.com/store_true', - help= - "Specify whether the model should be generate for inf2 or inf1, default is inf1" + action="http://github.com/store_true", + help="Specify whether the model should be generate for inf2 or inf1, default is inf1", + ) + parser.add_argument( + "--model_type", + type=str, + required=True, + choices=["pytorch", "tensorflow"], + help="""The type of the compiled model. Currently, + only supports \"pytorch\" and \"tensorflow\".""", + ) + parser.add_argument( + "--model_version", type=int, default=1, help="The version of the model" ) - parser.add_argument('--model_type', - type=str, - required=True, - choices=['pytorch', 'tensorflow'], - help='''The type of the compiled model. Currently, - only supports \"pytorch\" and \"tensorflow\".''') - parser.add_argument('--model_version', - type=int, - default=1, - help='The version of the model') parser.add_argument( - '--enable_dynamic_batching', + "--enable_dynamic_batching", action="http://github.com/store_true", - help='''Enable dynamic batching. Please see model configuration - documentation for details: - https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#dynamic-batcher''' + help="""Enable dynamic batching. 
Please see model configuration + documentation for details: + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#dynamic-batcher""", ) parser.add_argument( - '--max_batch_size', + "--max_batch_size", type=int, default=0, - help='''The maximum batch size for the model being generated. - Please see model configuration documentation for details: - https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#maximum-batch-size''' + help="""The maximum batch size for the model being generated. + Please see model configuration documentation for details: + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#maximum-batch-size""", ) - parser.add_argument('--preferred_batch_size', - type=int, - help='''The preferred batch size. Should be multiples + parser.add_argument( + "--preferred_batch_size", + type=int, + help="""The preferred batch size. Should be multiples of cores available to ensure proper utilization of - neuron cores. - This flag is ignored if --enable_dynamic_batching is - not specified. Please see model configuration - documentation for details: - https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#preferred-batch-sizes''' - ) - parser.add_argument('--max_queue_delay_microseconds', - type=int, - help='''Max queue delay time(ms) for dynamic batching. - This flag is ignored if --enable_dynamic_batching is not specified. - Please see model configuration documentation for details: - https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#delayed-batching''' - ) + neuron cores. + This flag is ignored if --enable_dynamic_batching is + not specified. Please see model configuration + documentation for details: + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#preferred-batch-sizes""", + ) + parser.add_argument( + "--max_queue_delay_microseconds", + type=int, + help="""Max queue delay time(ms) for dynamic batching. + This flag is ignored if --enable_dynamic_batching is not specified. + Please see model configuration documentation for details: + https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#delayed-batching""", + ) parser.add_argument( - '--disable_batch_requests_to_neuron', + "--disable_batch_requests_to_neuron", action="http://github.com/store_true", - help='''Send each request separately to neuron if enabled. - If not specified, then requests are combined and sent to - neuron as a single batch''') - parser.add_argument('--tag_set', - type=str, - default="serve", - help='''The tag set to use for the TF model. + help="""Send each request separately to neuron if enabled. + If not specified, then requests are combined and sent to + neuron as a single batch""", + ) + parser.add_argument( + "--tag_set", + type=str, + default="serve", + help="""The tag set to use for the TF model. This option is ignored if `--model_type` is - not \"tensorflow\". Default value is \'serve\'.''') - parser.add_argument('--signature_def_key', - type=str, - default="serving_default", - help='''The signature def key to use for the TF + not \"tensorflow\". Default value is \'serve\'.""", + ) + parser.add_argument( + "--signature_def_key", + type=str, + default="serving_default", + help="""The signature def key to use for the TF model. This option is ignored if `--model_type` is not \"tensorflow\". 
Default value - is \'serving_default\'.''') - parser.add_argument('--compiled_model', - type=str, - required=True, - help='Fullpath to the compiled model') + is \'serving_default\'.""", + ) + parser.add_argument( + "--compiled_model", + type=str, + required=True, + help="Fullpath to the compiled model", + ) parser.add_argument( - '--triton_input', + "--triton_input", type=str, - action='http://github.com/append', + action="http://github.com/append", nargs="*", - help='''The name, datatype and shape of the model input in + help="""The name, datatype and shape of the model input in format ,,. This option can be provided multiple times for multiple inputs. For example, to provide a FP16 input with shape [1,384] specify the following: INPUT0,FP16,1x384. - This option is not required when using tensorflow model''') + This option is not required when using tensorflow model""", + ) parser.add_argument( - '--triton_output', + "--triton_output", type=str, - action='http://github.com/append', + action="http://github.com/append", nargs="*", - help='''The name, datatype and shape of the model output in + help="""The name, datatype and shape of the model output in format ,,. This option can be provided multiple times for multiple outputs. For example, to provide a FP16 output with shape [1,384] specify the following: OUTPUT0,FP16,1x384. - This option is not required when using tensorflow model''') - parser.add_argument('--neuron_core_range', - type=str, - required=True, - help='''The range of neuron core indices + This option is not required when using tensorflow model""", + ) + parser.add_argument( + "--neuron_core_range", + type=str, + required=True, + help="""The range of neuron core indices where the model needs to be loaded. The range should be specified in format :. For example to @@ -774,49 +809,61 @@ def create_model_file(using_tensorflow_model, loaded on cores 0:1, Instance1 will get loaded on cores 2:3, Instance2 will get loaded on cores 4:5 and Instance 3 will get loaded on - cores 6:7''') - parser.add_argument('--threads_per_core', - type=int, - default=1, - help='The number of threads per neuron core.') - parser.add_argument('--triton_model_instance_count', - type=int, - default=1, - help='The number of triton model instances.') - parser.add_argument('--triton_model_dir', - type=str, - required=True, - help='''Path to the triton model + cores 6:7""", + ) + parser.add_argument( + "--threads_per_core", + type=int, + default=1, + help="The number of threads per neuron core.", + ) + parser.add_argument( + "--triton_model_instance_count", + type=int, + default=1, + help="The number of triton model instances.", + ) + parser.add_argument( + "--triton_model_dir", + type=str, + required=True, + help="""Path to the triton model directory where script will generate - config.pbtxt and model.py''') + config.pbtxt and model.py""", + ) FLAGS, unparsed = parser.parse_known_args() if len(unparsed) > 0: raise Exception("Unrecognized options: {}".format(unparsed)) - if FLAGS.model_type == 'tensorflow': + if FLAGS.model_type == "tensorflow": is_tensorflow_model = True - elif FLAGS.model_type == 'pytorch': + elif FLAGS.model_type == "pytorch": is_tensorflow_model = False - print('''Triton Dynamic Batching is enabled: {}, - preferred_batch_size: {} and max_batch_size: {} - with max_queue_delay_microseconds: {}. 
- Batch requests to neruon are disabled: {}'''.format( - FLAGS.enable_dynamic_batching, FLAGS.preferred_batch_size, - FLAGS.max_batch_size, FLAGS.max_queue_delay_microseconds, - FLAGS.disable_batch_requests_to_neuron)) + print( + """Triton Dynamic Batching is enabled: {}, + preferred_batch_size: {} and max_batch_size: {} + with max_queue_delay_microseconds: {}. + Batch requests to neruon are disabled: {}""".format( + FLAGS.enable_dynamic_batching, + FLAGS.preferred_batch_size, + FLAGS.max_batch_size, + FLAGS.max_queue_delay_microseconds, + FLAGS.disable_batch_requests_to_neuron, + ) + ) - if not is_tensorflow_model or (FLAGS.triton_input != None and - FLAGS.triton_output != None): + if not is_tensorflow_model or ( + FLAGS.triton_input != None and FLAGS.triton_output != None + ): inputs = parse_io_tensors(FLAGS.triton_input) outputs = parse_io_tensors(FLAGS.triton_output) else: - inputs, outputs = parse_tf_tensors(FLAGS.compiled_model, FLAGS.tag_set, - FLAGS.signature_def_key) + inputs, outputs = parse_tf_tensors( + FLAGS.compiled_model, FLAGS.tag_set, FLAGS.signature_def_key + ) - nc_start_idx, nc_end_idx = [ - int(i) for i in FLAGS.neuron_core_range.split(":") - ] + nc_start_idx, nc_end_idx = [int(i) for i in FLAGS.neuron_core_range.split(":")] model_version_dir = FLAGS.triton_model_dir + "/" + str(FLAGS.model_version) try: @@ -826,16 +873,26 @@ def create_model_file(using_tensorflow_model, model_name = os.path.basename(FLAGS.triton_model_dir) mc = create_modelconfig( - model_name, FLAGS.max_batch_size, inputs, outputs, FLAGS.compiled_model, - nc_start_idx, nc_end_idx, FLAGS.threads_per_core, - FLAGS.triton_model_instance_count, FLAGS.enable_dynamic_batching, - FLAGS.preferred_batch_size, FLAGS.max_queue_delay_microseconds) + model_name, + FLAGS.max_batch_size, + inputs, + outputs, + FLAGS.compiled_model, + nc_start_idx, + nc_end_idx, + FLAGS.threads_per_core, + FLAGS.triton_model_instance_count, + FLAGS.enable_dynamic_batching, + FLAGS.preferred_batch_size, + FLAGS.max_queue_delay_microseconds, + ) with open(FLAGS.triton_model_dir + "/config.pbtxt", "w") as config_file: config_file.write(mc) is_inf2 = FLAGS.inf2 - mf = create_model_file(is_tensorflow_model, - FLAGS.disable_batch_requests_to_neuron, is_inf2) + mf = create_model_file( + is_tensorflow_model, FLAGS.disable_batch_requests_to_neuron, is_inf2 + ) with open(FLAGS.triton_model_dir + "/1/model.py", "w") as model_file: model_file.write(mf) diff --git a/inferentia/scripts/setup-pre-container.sh b/inferentia/scripts/setup-pre-container.sh index 1d3e9a43..f6f5ae16 100755 --- a/inferentia/scripts/setup-pre-container.sh +++ b/inferentia/scripts/setup-pre-container.sh @@ -85,7 +85,7 @@ then return 1 fi -if [ ${INSTALL_INF1} -eq 1 ] && [ ${INSTALL_TRN1} -eq 1 ] +if [ ${INSTALL_INF1} -eq 1 ] && [ ${INSTALL_TRN1} -eq 1 ] then echo "Error: cannot install both inf1 and trn1 dependencies. Selecting -trn1 will install inf2 dependencies and EFA." fi @@ -115,13 +115,13 @@ echo "Installation complete for inf2 runtime and tools." 
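For the `--triton_input`/`--triton_output` flags handled above, the expected string format is `<name>,<datatype>,<shape>` with `x`-separated dimensions, for example `INPUT0,FP16,1x384`. A rough, illustrative equivalent of that parsing:

```
def parse_io_spec(specs):
    # ["INPUT0,FP16,1x384"] -> {"INPUT0": ("FP16", [1, 384])}
    tensors = {}
    for spec in specs:
        name, datatype, shape = spec.split(",")
        tensors[name] = (datatype, [int(dim) for dim in shape.split("x")])
    return tensors


print(parse_io_spec(["INPUT0,FP16,1x384", "INPUT1,FP16,1x384"]))
```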
if [ ${INSTALL_TRN1} -eq 1 ] then # Install EFA Driver (only required for multi-instance training) - curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz - wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key - cat aws-efa-installer.key | gpg --fingerprint - wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig - tar -xvf aws-efa-installer-latest.tar.gz - cd aws-efa-installer && sudo bash efa_installer.sh --yes - cd + curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz + wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key + cat aws-efa-installer.key | gpg --fingerprint + wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig + tar -xvf aws-efa-installer-latest.tar.gz + cd aws-efa-installer && sudo bash efa_installer.sh --yes + cd sudo rm -rf aws-efa-installer-latest.tar.gz aws-efa-installer fi diff --git a/inferentia/scripts/setup.sh b/inferentia/scripts/setup.sh old mode 100644 new mode 100755 index 550da0ce..cc295530 --- a/inferentia/scripts/setup.sh +++ b/inferentia/scripts/setup.sh @@ -141,13 +141,13 @@ apt-get update && \ # Set Pip repository to point to the Neuron repository -# since we need to use pip to update: +# since we need to use pip to update: # https://aws.amazon.com/blogs/developer/neuron-conda-packages-eol/ pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com pip install --upgrade pip if [ ${INSTALL_INF2} -eq 1 ];then - # Install Neuron Runtime + # Install Neuron Runtime # Then install new neuron libraries . /etc/os-release tee /etc/apt/sources.list.d/neuron.list > /dev/null <= responses_.size()) { idx_ = 0; diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 3353a5e5..3d39f005 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -291,7 +291,7 @@ Stub::RunCommand() shm_pool_->Construct(); // The initialization is done in three steps. First the main process sends - // a message to the stub process asking to begin to initilize the Python + // a message to the stub process asking to begin to initialize the Python // model. After that is finished stub process sends a message to the // parent process that the initialization is finished. Finally, the // parent process sends a message to the stub process asking the stub diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc index 20d5302f..080d3ed8 100644 --- a/src/pb_tensor.cc +++ b/src/pb_tensor.cc @@ -295,7 +295,7 @@ PbTensor::ToDLPack() py::handle tensor_handle = py::cast(tensor); // Increase the reference count by one to make sure that the DLPack - // represenation doesn't become invalid when the tensor object goes out of + // representation doesn't become invalid when the tensor object goes out of // scope. tensor_handle.inc_ref(); diff --git a/src/pb_utils.cc b/src/pb_utils.cc index c6897631..c078c226 100644 --- a/src/pb_utils.cc +++ b/src/pb_utils.cc @@ -61,7 +61,7 @@ CUDAHandler::CUDAHandler() { dl_open_handle_ = dlopen("libcuda.so", RTLD_LAZY); - // If libcuda.so is succesfully opened, it must be able to find + // If libcuda.so is successfully opened, it must be able to find // "cuPointerGetAttribute" and "cuGetErrorString" symbols. 
if (dl_open_handle_ != nullptr) { void* cu_pointer_get_attribute_fn = diff --git a/src/python_be.cc b/src/python_be.cc index 08110d0a..cc07b473 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -503,7 +503,7 @@ ModelInstanceState::GetInputTensor( } else { #ifdef TRITON_ENABLE_GPU - // Retreiving GPU input tensors + // Retrieving GPU input tensors const void* buffer = nullptr; std::vector> alloc_perference; alloc_perference = {{TRITONSERVER_MEMORY_GPU, src_memory_type_id}}; @@ -1309,9 +1309,9 @@ ModelInstanceState::ProcessRequests( Stub()->ShmPool(), response_message)); // If the stub command is no longer PYTHONSTUB_InferExecRequest, it indicates - // that inference request exeuction has finished and there are no more BLS - // requests to execute. Otherwise, the Python backend will continuosly execute - // BLS requests pushed to the message queue. + // that inference request execution has finished and there are no more BLS + // requests to execute. Otherwise, the Python backend will continuously + // execute BLS requests pushed to the message queue. while (ipc_message->Command() == PYTHONSTUB_CommandType::PYTHONSTUB_InferExecRequest || ipc_message->Command() == diff --git a/src/request_executor.cc b/src/request_executor.cc index 00a9b201..2590ee37 100644 --- a/src/request_executor.cc +++ b/src/request_executor.cc @@ -153,7 +153,7 @@ InferResponseComplete( output_tensors, pb_error, true /* is_last_response */); } else { if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { - // Not the last reponse. + // Not the last response. infer_response = std::make_unique( output_tensors, pb_error, false /* is_last_response */, userp /* id */); @@ -171,7 +171,7 @@ InferResponseComplete( } else if ( (infer_payload)->IsDecoupled() && (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) { - // An empty response may be the last reponse for decoupled models. + // An empty response may be the last response for decoupled models. infer_response = std::make_unique( output_tensors, pb_error, true /* is_last_response */, userp /* id */); } else { diff --git a/src/resources/triton_python_backend_utils.py b/src/resources/triton_python_backend_utils.py index e2045429..560a3198 100644 --- a/src/resources/triton_python_backend_utils.py +++ b/src/resources/triton_python_backend_utils.py @@ -24,24 +24,25 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import numpy as np -import struct import json +import struct + +import numpy as np TRITON_STRING_TO_NUMPY = { - 'TYPE_BOOL': bool, - 'TYPE_UINT8': np.uint8, - 'TYPE_UINT16': np.uint16, - 'TYPE_UINT32': np.uint32, - 'TYPE_UINT64': np.uint64, - 'TYPE_INT8': np.int8, - 'TYPE_INT16': np.int16, - 'TYPE_INT32': np.int32, - 'TYPE_INT64': np.int64, - 'TYPE_FP16': np.float16, - 'TYPE_FP32': np.float32, - 'TYPE_FP64': np.float64, - 'TYPE_STRING': np.object_ + "TYPE_BOOL": bool, + "TYPE_UINT8": np.uint8, + "TYPE_UINT16": np.uint16, + "TYPE_UINT32": np.uint32, + "TYPE_UINT64": np.uint64, + "TYPE_INT8": np.int8, + "TYPE_INT16": np.int16, + "TYPE_INT32": np.int32, + "TYPE_INT64": np.int64, + "TYPE_FP16": np.float16, + "TYPE_FP32": np.float32, + "TYPE_FP64": np.float64, + "TYPE_STRING": np.object_, } @@ -71,10 +72,9 @@ def serialize_byte_tensor(input_tensor): # If the input is a tensor of string/bytes objects, then must flatten those # into a 1-dimensional array containing the 4-byte byte size followed by the # actual element bytes. 
All elements are concatenated together in "C" order. - if (input_tensor.dtype == np.object_) or (input_tensor.dtype.type - == np.bytes_): + if (input_tensor.dtype == np.object_) or (input_tensor.dtype.type == np.bytes_): flattened_ls = [] - for obj in np.nditer(input_tensor, flags=["refs_ok"], order='C'): + for obj in np.nditer(input_tensor, flags=["refs_ok"], order="C"): # If directly passing bytes to BYTES type, # don't convert it to str as Python will encode the # bytes which may distort the meaning @@ -82,12 +82,12 @@ def serialize_byte_tensor(input_tensor): if type(obj.item()) == bytes: s = obj.item() else: - s = str(obj.item()).encode('utf-8') + s = str(obj.item()).encode("utf-8") else: s = obj.item() flattened_ls.append(struct.pack(" max_batch_size: raise ValueError( - "configuration specified max_batch_size " + - str(self._model_config["max_batch_size"]) + - ", but in auto-complete-config function for model '" + - self._model_config["name"] + "' specified max_batch_size " + - str(max_batch_size)) + "configuration specified max_batch_size " + + str(self._model_config["max_batch_size"]) + + ", but in auto-complete-config function for model '" + + self._model_config["name"] + + "' specified max_batch_size " + + str(max_batch_size) + ) else: self._model_config["max_batch_size"] = max_batch_size def set_dynamic_batching(self): - """Set dynamic_batching as the scheduler for the model if no scheduler - is set. If dynamic_batching is set in the model configuration, then no + """Set dynamic_batching as the scheduler for the model if no scheduler + is set. If dynamic_batching is set in the model configuration, then no action is taken and return success. Raises ------ ValueError - If the 'sequence_batching' or 'ensemble_scheduling' scheduler is + If the 'sequence_batching' or 'ensemble_scheduling' scheduler is set for this model configuration. """ found_scheduler = None @@ -359,10 +361,13 @@ def set_dynamic_batching(self): if found_scheduler != None: raise ValueError( - "Configuration specified scheduling_choice as '" \ - + found_scheduler + "', but auto-complete-config " \ - "function for model '" + self._model_config["name"] - + "' tries to set scheduling_choice as 'dynamic_batching'") + "Configuration specified scheduling_choice as '" + + found_scheduler + + "', but auto-complete-config " + "function for model '" + + self._model_config["name"] + + "' tries to set scheduling_choice as 'dynamic_batching'" + ) if "dynamic_batching" not in self._model_config: self._model_config["dynamic_batching"] = {} @@ -381,53 +386,70 @@ def add_input(self, input): input with the same name already exists in the configuration but has different data_type or dims property """ - valid_properties = ['name', 'data_type', 'dims'] + valid_properties = ["name", "data_type", "dims"] for current_property in input: if current_property not in valid_properties: raise ValueError( - "input '" + input['name'] + - "' in auto-complete-config function for model '" + - self._model_config["name"] + - "' contains property other than 'name', 'data_type' and 'dims'." + "input '" + + input["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' contains property other than 'name', 'data_type' and 'dims'." ) - if 'name' not in input: + if "name" not in input: + raise ValueError( + "input in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'name' property." 
+ ) + elif "data_type" not in input: raise ValueError( - "input in auto-complete-config function for model '" + - self._model_config["name"] + "' is missing 'name' property.") - elif 'data_type' not in input: - raise ValueError("input '" + input['name'] + - "' in auto-complete-config function for model '" + - self._model_config["name"] + - "' is missing 'data_type' property.") - elif 'dims' not in input: - raise ValueError("input '" + input['name'] + - "' in auto-complete-config function for model '" + - self._model_config["name"] + - "' is missing 'dims' property.") + "input '" + + input["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'data_type' property." + ) + elif "dims" not in input: + raise ValueError( + "input '" + + input["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'dims' property." + ) for current_input in self._model_config["input"]: - if input['name'] == current_input['name']: - if current_input[ - 'data_type'] != "TYPE_INVALID" and current_input[ - 'data_type'] != input['data_type']: - raise ValueError("unable to load model '" + - self._model_config["name"] + - "', configuration expects datatype " + - current_input['data_type'] + - " for input '" + input['name'] + - "', model provides " + input['data_type']) - elif current_input[ - 'dims'] and current_input['dims'] != input['dims']: + if input["name"] == current_input["name"]: + if ( + current_input["data_type"] != "TYPE_INVALID" + and current_input["data_type"] != input["data_type"] + ): + raise ValueError( + "unable to load model '" + + self._model_config["name"] + + "', configuration expects datatype " + + current_input["data_type"] + + " for input '" + + input["name"] + + "', model provides " + + input["data_type"] + ) + elif current_input["dims"] and current_input["dims"] != input["dims"]: raise ValueError( - "model '" + self._model_config["name"] + "', tensor '" + - input['name'] + "': the model expects dims " + - str(input['dims']) + - " but the model configuration specifies dims " + - str(current_input['dims'])) + "model '" + + self._model_config["name"] + + "', tensor '" + + input["name"] + + "': the model expects dims " + + str(input["dims"]) + + " but the model configuration specifies dims " + + str(current_input["dims"]) + ) else: - current_input['data_type'] = input['data_type'] - current_input['dims'] = input['dims'] + current_input["data_type"] = input["data_type"] + current_input["dims"] = input["dims"] return self._model_config["input"].append(input) @@ -446,53 +468,72 @@ def add_output(self, output): output with the same name already exists in the configuration but has different data_type or dims property """ - valid_properties = ['name', 'data_type', 'dims'] + valid_properties = ["name", "data_type", "dims"] for current_property in output: if current_property not in valid_properties: raise ValueError( - "output '" + output['name'] + - "' in auto-complete-config function for model '" + - self._model_config["name"] + - "' contains property other than 'name', 'data_type' and 'dims'." + "output '" + + output["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' contains property other than 'name', 'data_type' and 'dims'." ) - if 'name' not in output: + if "name" not in output: + raise ValueError( + "output in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'name' property." 
+ ) + elif "data_type" not in output: raise ValueError( - "output in auto-complete-config function for model '" + - self._model_config["name"] + "' is missing 'name' property.") - elif 'data_type' not in output: - raise ValueError("output '" + output['name'] + - "' in auto-complete-config function for model '" + - self._model_config["name"] + - "' is missing 'data_type' property.") - elif 'dims' not in output: - raise ValueError("output '" + output['name'] + - "' in auto-complete-config function for model '" + - self._model_config["name"] + - "' is missing 'dims' property.") + "output '" + + output["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'data_type' property." + ) + elif "dims" not in output: + raise ValueError( + "output '" + + output["name"] + + "' in auto-complete-config function for model '" + + self._model_config["name"] + + "' is missing 'dims' property." + ) for current_output in self._model_config["output"]: - if output['name'] == current_output['name']: - if current_output[ - 'data_type'] != "TYPE_INVALID" and current_output[ - 'data_type'] != output['data_type']: - raise ValueError("unable to load model '" + - self._model_config["name"] + - "', configuration expects datatype " + - current_output['data_type'] + - " for output '" + output['name'] + - "', model provides " + output['data_type']) - elif current_output[ - 'dims'] and current_output['dims'] != output['dims']: + if output["name"] == current_output["name"]: + if ( + current_output["data_type"] != "TYPE_INVALID" + and current_output["data_type"] != output["data_type"] + ): + raise ValueError( + "unable to load model '" + + self._model_config["name"] + + "', configuration expects datatype " + + current_output["data_type"] + + " for output '" + + output["name"] + + "', model provides " + + output["data_type"] + ) + elif ( + current_output["dims"] and current_output["dims"] != output["dims"] + ): raise ValueError( - "model '" + self._model_config["name"] + "', tensor '" + - output['name'] + "': the model expects dims " + - str(output['dims']) + - " but the model configuration specifies dims " + - str(current_output['dims'])) + "model '" + + self._model_config["name"] + + "', tensor '" + + output["name"] + + "': the model expects dims " + + str(output["dims"]) + + " but the model configuration specifies dims " + + str(current_output["dims"]) + ) else: - current_output['data_type'] = output['data_type'] - current_output['dims'] = output['dims'] + current_output["data_type"] = output["data_type"] + current_output["dims"] = output["dims"] return self._model_config["output"].append(output) diff --git a/src/shm_manager.h b/src/shm_manager.h index ef2e5cb1..bd462403 100644 --- a/src/shm_manager.h +++ b/src/shm_manager.h @@ -84,7 +84,7 @@ class SharedMemoryManager { bi::managed_external_buffer::handle_t handle = 0; { - bi::scoped_lock gaurd{*shm_mutex_}; + bi::scoped_lock guard{*shm_mutex_}; std::size_t requested_bytes = sizeof(T) * count + sizeof(AllocatedShmOwnership); GrowIfNeeded(0); @@ -121,7 +121,7 @@ class SharedMemoryManager { AllocatedShmOwnership* shm_ownership_data; { - bi::scoped_lock gaurd{*shm_mutex_}; + bi::scoped_lock guard{*shm_mutex_}; GrowIfNeeded(0); shm_ownership_data = reinterpret_cast( managed_buffer_->get_address_from_handle(handle)); @@ -140,7 +140,7 @@ class SharedMemoryManager { void Deallocate(bi::managed_external_buffer::handle_t handle) { - bi::scoped_lock gaurd{*shm_mutex_}; + bi::scoped_lock guard{*shm_mutex_}; GrowIfNeeded(0); void* 
ptr = managed_buffer_->get_address_from_handle(handle); managed_buffer_->deallocate(ptr); @@ -181,7 +181,7 @@ class SharedMemoryManager { std::function deleter = [this, handle, shm_ownership_data](T* memory) { bool destroy = false; - bi::scoped_lock gaurd{*shm_mutex_}; + bi::scoped_lock guard{*shm_mutex_}; // Before using any shared memory function you need to make sure that you // are using the correct mapping. For example, shared memory growth may // happen between the time an object was created and the time the object diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index 79863fb6..fc3bacd4 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -291,7 +291,7 @@ StubLauncher::Launch() if (pid == 0) { // Replace this child process with the new stub process. execvp("bash", (char**)stub_args); - // execvp() never return if succeeded. Otherwise, an error has occured. + // execvp() never return if succeeded. Otherwise, an error has occurred. std::stringstream ss; ss << "Failed to run python backend stub. Errno = " << errno << '\n' << "Python backend stub path: " << python_backend_stub << '\n' From 611a298dfee601c2552e9152524aeb2e6d2886c6 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Thu, 29 Jun 2023 13:57:16 -0400 Subject: [PATCH 119/216] Improve CUDA context management (#263) * Improve CUDA context management * Review edits * Fix spelling --- src/pb_memory.cc | 5 +- src/pb_tensor.cc | 15 +----- src/pb_utils.cc | 125 +++++++++++++++++++++++++---------------------- src/pb_utils.h | 26 ++++++++++ 4 files changed, 95 insertions(+), 76 deletions(-) diff --git a/src/pb_memory.cc b/src/pb_memory.cc index beecb3d9..c18bf912 100644 --- a/src/pb_memory.cc +++ b/src/pb_memory.cc @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -189,8 +189,7 @@ PbMemory::FillShmData( #ifdef TRITON_ENABLE_GPU if (data != nullptr) { if (copy_gpu) { - // [FIXME] Restore the previous device - THROW_IF_CUDA_ERROR(cudaSetDevice(memory_type_id)); + ScopedSetDevice scoped_set_device(memory_type_id); THROW_IF_CUDA_ERROR(cudaIpcGetMemHandle( reinterpret_cast(memory_data_shm), data)); } diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc index 080d3ed8..4011faad 100644 --- a/src/pb_tensor.cc +++ b/src/pb_tensor.cc @@ -358,15 +358,9 @@ PbTensor::FromDLPack(const std::string& name, const py::object& tensor) if (err != cudaSuccess) { throw PythonBackendException("Failed to get current CUDA device id."); } + ScopedSetDevice scoped_set_device(capsule_device_info.second); bool overridden = (current_device != capsule_device_info.second); - err = overridden ? cudaSetDevice(capsule_device_info.second) : cudaSuccess; - if (err != cudaSuccess) { - throw PythonBackendException( - "Failed to set CUDA device to device with id " + - std::to_string(capsule_device_info.second)); - } - cudaStream_t proxy_stream = stub->GetProxyStream(current_device); // Array API requirements for the stream argument: @@ -394,13 +388,6 @@ PbTensor::FromDLPack(const std::string& name, const py::object& tensor) overridden ? capsule_device_info.second : current_device)); } - err = overridden ? 
cudaSetDevice(current_device) : cudaSuccess; - if (err != cudaSuccess) { - throw PythonBackendException( - "Failed to set CUDA device back to initial compute device " - "with id " + - std::to_string(current_device)); - } return ptr_to_tensor; #else throw PythonBackendException( diff --git a/src/pb_utils.cc b/src/pb_utils.cc index c078c226..523f4fed 100644 --- a/src/pb_utils.cc +++ b/src/pb_utils.cc @@ -62,7 +62,8 @@ CUDAHandler::CUDAHandler() dl_open_handle_ = dlopen("libcuda.so", RTLD_LAZY); // If libcuda.so is successfully opened, it must be able to find - // "cuPointerGetAttribute" and "cuGetErrorString" symbols. + // "cuPointerGetAttribute", "cuGetErrorString", and + // "cuDevicePrimaryCtxGetState" symbols. if (dl_open_handle_ != nullptr) { void* cu_pointer_get_attribute_fn = dlsym(dl_open_handle_, "cuPointerGetAttribute"); @@ -88,6 +89,16 @@ CUDAHandler::CUDAHandler() } *((void**)&cu_init_fn_) = cu_init_fn; + void* cu_device_primary_ctx_get_state_fn = + dlsym(dl_open_handle_, "cuDevicePrimaryCtxGetState"); + if (cu_device_primary_ctx_get_state_fn == nullptr) { + throw PythonBackendException( + std::string("Failed to dlsym 'cuDevicePrimaryCtxGetState'. Error: ") + + dlerror()); + } + *((void**)&cu_device_primary_ctx_get_state_fn_) = + cu_device_primary_ctx_get_state_fn; + // Initialize the driver API. CUresult cuda_err = (*cu_init_fn_)(0 /* flags */); if (cuda_err != CUDA_SUCCESS) { @@ -132,41 +143,9 @@ CUDAHandler::OpenCudaHandle( void** data_ptr) { std::lock_guard guard{mu_}; - int current_device; - - // Save the previous device - cudaError_t err = cudaGetDevice(¤t_device); - if (err != cudaSuccess) { - throw PythonBackendException( - std::string("Failed to get the current CUDA device. error: ") + - cudaGetErrorString(err)); - } - - bool overridden = (current_device != memory_type_id); + ScopedSetDevice scoped_set_device(memory_type_id); - // Restore the previous device before returning from the function. - ScopedDefer _(std::bind([&overridden, ¤t_device] { - if (overridden) { - cudaError_t err = cudaSetDevice(current_device); - if (err != cudaSuccess) { - throw PythonBackendException( - "Failed to set the CUDA device to " + - std::to_string(current_device) + - ". error: " + cudaGetErrorString(err)); - } - } - })); - - if (overridden) { - err = cudaSetDevice(memory_type_id); - if (err != cudaSuccess) { - throw PythonBackendException( - "Failed to set the CUDA device to " + std::to_string(memory_type_id) + - ". error: " + cudaGetErrorString(err)); - } - } - - err = cudaIpcOpenMemHandle( + cudaError_t err = cudaIpcOpenMemHandle( data_ptr, *cuda_mem_handle, cudaIpcMemLazyEnablePeerAccess); if (err != cudaSuccess) { throw PythonBackendException( @@ -189,31 +168,8 @@ CUDAHandler::CloseCudaHandle(int64_t memory_type_id, void* data_ptr) cudaGetErrorString(err)); } - bool overridden = (current_device != memory_type_id); - // Restore the previous device before returning from the function. - ScopedDefer _(std::bind([&overridden, ¤t_device] { - if (overridden) { - cudaError_t err = cudaSetDevice(current_device); - if (err != cudaSuccess) { - throw PythonBackendException( - "Failed to set the CUDA device to " + - std::to_string(current_device) + - ". error: " + cudaGetErrorString(err)); - } - } - })); - - if (overridden) { - err = cudaSetDevice(memory_type_id); - if (err != cudaSuccess) { - throw PythonBackendException( - std::string("Failed to set the CUDA device to ") + - std::to_string(memory_type_id) + - ". 
error: " + cudaGetErrorString(err)); - } - } - + ScopedSetDevice scoped_set_device(memory_type_id); err = cudaIpcCloseMemHandle(data_ptr); if (err != cudaSuccess) { throw PythonBackendException( @@ -222,6 +178,39 @@ CUDAHandler::CloseCudaHandle(int64_t memory_type_id, void* data_ptr) } } +bool +CUDAHandler::HasPrimaryContext(int device) +{ + unsigned int ctx_flags; + int ctx_is_active = 0; + CUresult cuda_err = (*cu_device_primary_ctx_get_state_fn_)( + device, &ctx_flags, &ctx_is_active); + if (cuda_err != CUDA_SUCCESS) { + const char* error_string; + (*cu_get_error_string_fn_)(cuda_err, &error_string); + throw PythonBackendException( + std::string( + "failed to get primary context state: " + std::string(error_string)) + .c_str()); + } + + return ctx_is_active == 1; +} + +void +CUDAHandler::MaybeSetDevice(int device) +{ + if (HasPrimaryContext(device)) { + cudaError_t err = cudaSetDevice(device); + if (err != cudaSuccess) { + throw PythonBackendException( + std::string("Failed to set the CUDA device to ") + + std::to_string(device) + ". error: " + cudaGetErrorString(err)); + } + } +} + + CUDAHandler::~CUDAHandler() noexcept(false) { if (dl_open_handle_ != nullptr) { @@ -231,6 +220,24 @@ CUDAHandler::~CUDAHandler() noexcept(false) } } } + +ScopedSetDevice::ScopedSetDevice(int device) +{ + device_ = device; + THROW_IF_CUDA_ERROR(cudaGetDevice(¤t_device_)); + + if (current_device_ != device_) { + THROW_IF_CUDA_ERROR(cudaSetDevice(device_)); + } +} + +ScopedSetDevice::~ScopedSetDevice() +{ + if (current_device_ != device_) { + CUDAHandler& cuda_handler = CUDAHandler::getInstance(); + cuda_handler.MaybeSetDevice(current_device_); + } +} #endif #ifndef TRITON_PB_STUB diff --git a/src/pb_utils.h b/src/pb_utils.h index a46aa8fa..9645085b 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -243,7 +243,12 @@ class CUDAHandler { CUdeviceptr*, CUpointer_attribute, CUdeviceptr) = nullptr; CUresult (*cu_get_error_string_fn_)(CUresult, const char**) = nullptr; CUresult (*cu_init_fn_)(unsigned int) = nullptr; + CUresult (*cu_device_primary_ctx_get_state_fn_)( + CUdevice, unsigned int*, int*) = nullptr; CUDAHandler(); + + /// Check if a primary context has already been created for a device. + bool HasPrimaryContext(int device); ~CUDAHandler() noexcept(false); public: @@ -257,7 +262,28 @@ class CUDAHandler { int64_t memory_type_id, cudaIpcMemHandle_t* cuda_mem_handle, void** data_ptr); void CloseCudaHandle(int64_t memory_type_id, void* data_ptr); + + /// Set the device only if the primary context has already been created for + /// this device. Inspired from PyTorch's MaybeSetDevice. + /// \param device The cuda device index. + void MaybeSetDevice(int device); }; + + +/// A helper class to change the current device and restore the old context. The +/// old context will be restored only if the primary context for that device is +/// already created, otherwise the CUDA context will remain as the primary +/// context of 'device'. 
+class ScopedSetDevice { + public: + ScopedSetDevice(int device); + ~ScopedSetDevice(); + + private: + int device_; + int current_device_; +}; + #endif // TRITON_ENABLE_GPU #ifndef TRITON_PB_STUB From c48fd3791603780960a847073dd1df77bef954cb Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Thu, 29 Jun 2023 13:51:01 -0700 Subject: [PATCH 120/216] Model loading API for Python BLS (#262) * Use template functions for custom metrics * Add model loading API for Python BLS * Fix up and remove rebasing artifacts * Add documentation * Formatting * Use py::none as initializer * Update the documentation * Fix the lifetime of CustomMetricsMessage and ModelLoaderMessage * Manage the lifetime of 'AllocatedSharedMemory' from the caller function * Address comment * Address comment --- CMakeLists.txt | 2 + README.md | 66 +++++++++++ src/ipc_message.h | 5 +- src/metric.cc | 33 +++--- src/metric_family.cc | 25 ++-- src/metric_family.h | 2 +- src/model_loader.cc | 267 +++++++++++++++++++++++++++++++++++++++++++ src/model_loader.h | 165 ++++++++++++++++++++++++++ src/pb_stub.cc | 91 ++++----------- src/pb_stub.h | 81 +++++++++++-- src/pb_utils.h | 8 ++ src/python_be.cc | 83 ++++++++++---- src/python_be.h | 16 ++- 13 files changed, 716 insertions(+), 128 deletions(-) create mode 100644 src/model_loader.cc create mode 100644 src/model_loader.h diff --git a/CMakeLists.txt b/CMakeLists.txt index a9f070d2..917400a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -166,6 +166,8 @@ set( src/metric_family.cc src/gpu_buffers.cc src/gpu_buffers.h + src/model_loader.h + src/model_loader.cc ) set( diff --git a/README.md b/README.md index de29a257..91b34b9b 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,9 @@ any C++ code. - [Multiple Model Instance Support](#multiple-model-instance-support) - [Running Multiple Instances of Triton Server](#running-multiple-instances-of-triton-server) - [Business Logic Scripting](#business-logic-scripting) + - [Using BLS with Decoupled Models](#using-bls-with-decoupled-models) - [Using BLS with Stateful Models](#using-bls-with-stateful-models) + - [Model Loading API](#model-loading-api) - [Limitation](#limitation) - [Interoperability and GPU Support](#interoperability-and-gpu-support) - [`pb_utils.Tensor.to_dlpack() -> PyCapsule`](#pb_utilstensorto_dlpack---pycapsule) @@ -994,6 +996,8 @@ class TritonPythonModel: A complete example for sync and async BLS in Python backend is included in the [Examples](#examples) section. +## Using BLS with Decoupled Models + Starting from 23.03 release, you can execute inference requests on decoupled models in both [default mode](#default-mode) and [decoupled mode](#decoupled-mode). By setting the `decoupled` parameter to @@ -1148,6 +1152,68 @@ shared memory error. Note: Async BLS is not supported on Python 3.6 or lower due to the `async` keyword and `asyncio.run` being introduced in Python 3.7. +## Model Loading API + +Starting from 23.07 release, you can use the model loading API to load models +required by your BLS model. The model loading API is equivalent to the Triton C +API for loading models which are documented in +[tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h). +Below is an example of how to use the model loading API: + +```python +import triton_python_backend_utils as pb_utils + +class TritonPythonModel: + def initialize(self, args): + self.model_name="onnx_model" + # Check if the model is ready, and load the model if it is not ready. 
+ # You can specify the model version in string format. The version is + # optional, and if not provided, the server will choose a version based + # on the model and internal policy. + if not pb_utils.is_model_ready(model_name=self.model_name, + model_version="1"): + # Load the model from the model repository + pb_utils.load_model(model_name=self.model_name) + + # Load the model with an optional override model config in JSON + # representation. If provided, this config will be used for + # loading the model. + config = "{\"backend\":\"onnxruntime\", \"version_policy\":{\"specific\":{\"versions\":[1]}}}" + pb_utils.load_model(model_name=self.model_name, config=config) + + # Load the mode with optional override files. The override files are + # specified as a dictionary where the key is the file path (with + # "file:" prefix) and the value is the file content as bytes. The + # files will form the model directory that the model will be loaded + # from. If specified, 'config' must be provided to be the model + # configuration of the override model directory. + with open('models/onnx_int32_int32_int32/1/model.onnx', 'rb') as file: + data = file.read() + files = {"file:1/model.onnx": data} + pb_utils.load_model(model_name=self.model_name, + config=config, files=files) + + def execute(self, requests): + # Execute the model + ... + # If the model is no longer needed, you can unload it. You can also + # specify whether the dependents of the model should also be unloaded by + # setting the 'unload_dependents' parameter to True. The default value + # is False. + pb_utils.unload_model(model_name=self.model_name, + unload_dependents=True) + +``` + +Note that the model loading API is only supported if the server is running in +[explicit model control mode](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_management.md#model-control-mode-explicit). +Additionally, the model loading API should only be used after the server has +been running, which means that the BLS model should not be loaded during server +startup. You can use different +[client endpoints](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_model_repository.md) +to load the model after the server has been started. The model loading API is +currently not supported during the `finalize` phase. + ## Using BLS with Stateful Models [Stateful models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#stateful-models) diff --git a/src/ipc_message.h b/src/ipc_message.h index 04268d93..7040f2b4 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -59,7 +59,10 @@ typedef enum PYTHONSTUB_commandtype_enum { PYTHONSTUB_MetricRequestDelete, PYTHONSTUB_MetricRequestValue, PYTHONSTUB_MetricRequestIncrement, - PYTHONSTUB_MetricRequestSet + PYTHONSTUB_MetricRequestSet, + PYTHONSTUB_LoadModelRequest, + PYTHONSTUB_UnloadModelRequest, + PYTHONSTUB_ModelReadinessRequest } PYTHONSTUB_CommandType; /// diff --git a/src/metric.cc b/src/metric.cc index cabf8352..f67c55bf 100644 --- a/src/metric.cc +++ b/src/metric.cc @@ -65,7 +65,7 @@ Metric::SaveToSharedMemory(std::unique_ptr& shm_pool) // Save the references to shared memory. 
custom_metric_shm_ = std::move(custom_metric_shm); labels_shm_ = std::move(labels_shm); - shm_handle_ = custom_metric_shm.handle_; + shm_handle_ = custom_metric_shm_.handle_; } std::unique_ptr @@ -110,14 +110,17 @@ Metric::SendCreateMetricRequest() std::unique_ptr& stub = Stub::GetOrCreateInstance(); SaveToSharedMemory(stub->ShmPool()); CustomMetricsMessage* custom_metrics_msg = nullptr; + AllocatedSharedMemory custom_metrics_shm; try { - stub->SendCustomMetricsMessage( - &custom_metrics_msg, PYTHONSTUB_MetricRequestNew, shm_handle_); + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestNew, shm_handle_); } catch (const PythonBackendException& pb_exception) { throw PythonBackendException( "Error when creating Metric: " + std::string(pb_exception.what())); } + + custom_metrics_msg = custom_metrics_shm.data_.get(); metric_address_ = custom_metrics_msg->address; } @@ -129,9 +132,9 @@ Metric::SendIncrementRequest(const double& value) std::unique_ptr& stub = Stub::GetOrCreateInstance(); operation_value_ = value; SaveToSharedMemory(stub->ShmPool()); - CustomMetricsMessage* custom_metrics_msg = nullptr; - stub->SendCustomMetricsMessage( - &custom_metrics_msg, PYTHONSTUB_MetricRequestIncrement, shm_handle_); + AllocatedSharedMemory custom_metrics_shm; + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestIncrement, shm_handle_); } catch (const PythonBackendException& pb_exception) { throw PythonBackendException( @@ -148,9 +151,9 @@ Metric::SendSetValueRequest(const double& value) std::unique_ptr& stub = Stub::GetOrCreateInstance(); operation_value_ = value; SaveToSharedMemory(stub->ShmPool()); - CustomMetricsMessage* custom_metrics_msg = nullptr; - stub->SendCustomMetricsMessage( - &custom_metrics_msg, PYTHONSTUB_MetricRequestSet, shm_handle_); + AllocatedSharedMemory custom_metrics_shm; + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestSet, shm_handle_); } catch (const PythonBackendException& pb_exception) { throw PythonBackendException( @@ -162,18 +165,20 @@ double Metric::SendGetValueRequest() { CustomMetricsMessage* custom_metrics_msg = nullptr; + AllocatedSharedMemory custom_metrics_shm; try { CheckIfCleared(); std::unique_ptr& stub = Stub::GetOrCreateInstance(); SaveToSharedMemory(stub->ShmPool()); - stub->SendCustomMetricsMessage( - &custom_metrics_msg, PYTHONSTUB_MetricRequestValue, shm_handle_); + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestValue, shm_handle_); } catch (const PythonBackendException& pb_exception) { throw PythonBackendException( "Failed to get metric value: " + std::string(pb_exception.what())); } + custom_metrics_msg = custom_metrics_shm.data_.get(); return custom_metrics_msg->value; } @@ -188,10 +193,10 @@ Metric::Clear() is_cleared_ = true; std::unique_ptr& stub = Stub::GetOrCreateInstance(); SaveToSharedMemory(stub->ShmPool()); - CustomMetricsMessage* custom_metrics_msg = nullptr; + AllocatedSharedMemory custom_metrics_shm; try { - stub->SendCustomMetricsMessage( - &custom_metrics_msg, PYTHONSTUB_MetricRequestDelete, shm_handle_); + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestDelete, shm_handle_); } catch (const PythonBackendException& pb_exception) { std::cerr << "Error when deleting Metric: " << pb_exception.what() diff --git a/src/metric_family.cc b/src/metric_family.cc index 195e9828..fb0fb93a 100644 --- a/src/metric_family.cc +++ b/src/metric_family.cc @@ -57,10 +57,10 @@ MetricFamily::~MetricFamily() // Send the request to delete the MetricFamily to the parent process 
std::unique_ptr& stub = Stub::GetOrCreateInstance(); SaveToSharedMemory(stub->ShmPool()); - CustomMetricsMessage* custom_metrics_msg = nullptr; + AllocatedSharedMemory custom_metrics_shm; try { - stub->SendCustomMetricsMessage( - &custom_metrics_msg, PYTHONSTUB_MetricFamilyRequestDelete, shm_handle_); + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricFamilyRequestDelete, shm_handle_); } catch (const PythonBackendException& pb_exception) { std::cerr << "Error when deleting MetricFamily: " << pb_exception.what() @@ -90,7 +90,7 @@ MetricFamily::SaveToSharedMemory(std::unique_ptr& shm_pool) custom_metric_family_shm_ = std::move(custom_metric_family_shm); name_shm_ = std::move(name_shm); description_shm_ = std::move(description_shm); - shm_handle_ = custom_metric_family_shm.handle_; + shm_handle_ = custom_metric_family_shm_.handle_; } std::unique_ptr @@ -150,21 +150,32 @@ MetricFamily::SendCreateMetricFamilyRequest() std::unique_ptr& stub = Stub::GetOrCreateInstance(); SaveToSharedMemory(stub->ShmPool()); CustomMetricsMessage* custom_metrics_msg = nullptr; + AllocatedSharedMemory custom_metrics_shm; try { - stub->SendCustomMetricsMessage( - &custom_metrics_msg, PYTHONSTUB_MetricFamilyRequestNew, shm_handle_); + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricFamilyRequestNew, shm_handle_); } catch (const PythonBackendException& pb_exception) { throw PythonBackendException( "Error when creating MetricFamily: " + std::string(pb_exception.what())); } + + custom_metrics_msg = custom_metrics_shm.data_.get(); metric_family_address_ = custom_metrics_msg->address; } std::shared_ptr -MetricFamily::CreateMetric(py::dict labels) +MetricFamily::CreateMetric(const py::object& labels) { + if (!labels.is_none()) { + if (!py::isinstance(labels)) { + throw PythonBackendException( + "Failed to create metric. Labels must be a " + "dictionary."); + } + } + py::module json = py::module_::import("json"); std::string labels_str = std::string(py::str(json.attr("dumps")(labels))); auto metric = std::make_shared(labels_str, metric_family_address_); diff --git a/src/metric_family.h b/src/metric_family.h index 54574892..04374a68 100644 --- a/src/metric_family.h +++ b/src/metric_family.h @@ -98,7 +98,7 @@ class MetricFamily { /// Create a metric from the metric family and store it in the metric map. /// \param labels The labels of the metric. /// \return Returns the shared pointer to the created metric. - std::shared_ptr CreateMetric(py::dict labels); + std::shared_ptr CreateMetric(const py::object& labels); #else /// Initialize the TRITONSERVER_MetricFamily object. /// \return Returns the address of the TRITONSERVER_MetricFamily object. diff --git a/src/model_loader.cc b/src/model_loader.cc new file mode 100644 index 00000000..0be45fa5 --- /dev/null +++ b/src/model_loader.cc @@ -0,0 +1,267 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#include "model_loader.h" + +#ifdef TRITON_PB_STUB +#include "pb_stub.h" +#endif + +namespace triton { namespace backend { namespace python { + +void +ModelLoader::SaveToSharedMemory(std::unique_ptr& shm_pool) +{ + AllocatedSharedMemory model_loader_req_shm = + shm_pool->Construct(); + model_loader_req_shm_ptr_ = model_loader_req_shm.data_.get(); + + std::unique_ptr name_shm = PbString::Create(shm_pool, name_); + std::unique_ptr version_shm = PbString::Create(shm_pool, version_); + std::unique_ptr config_shm = PbString::Create(shm_pool, config_); + std::unique_ptr files_shm = PbMap::Create(shm_pool, files_); + + model_loader_req_shm_ptr_->name_shm_handle = name_shm->ShmHandle(); + model_loader_req_shm_ptr_->version_shm_handle = version_shm->ShmHandle(); + model_loader_req_shm_ptr_->config_shm_handle = config_shm->ShmHandle(); + model_loader_req_shm_ptr_->files_shm_handle = files_shm->ShmHandle(); + model_loader_req_shm_ptr_->unload_dependents = unload_dependents_; + + // Save the references to shared memory. 
+ model_loader_req_shm_ = std::move(model_loader_req_shm); + name_shm_ = std::move(name_shm); + version_shm_ = std::move(version_shm); + config_shm_ = std::move(config_shm); + files_shm_ = std::move(files_shm); + + shm_handle_ = model_loader_req_shm_.handle_; +} + +std::unique_ptr +ModelLoader::LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle) +{ + AllocatedSharedMemory model_loader_req_shm = + shm_pool->Load(handle); + ModelLoaderRequestShm* model_loader_req_shm_ptr = + model_loader_req_shm.data_.get(); + + std::unique_ptr name_shm = PbString::LoadFromSharedMemory( + shm_pool, model_loader_req_shm_ptr->name_shm_handle); + std::unique_ptr version_shm = PbString::LoadFromSharedMemory( + shm_pool, model_loader_req_shm_ptr->version_shm_handle); + std::unique_ptr config_shm = PbString::LoadFromSharedMemory( + shm_pool, model_loader_req_shm_ptr->config_shm_handle); + std::unique_ptr files_shm = PbMap::LoadFromSharedMemory( + shm_pool, model_loader_req_shm_ptr->files_shm_handle); + + return std::unique_ptr(new ModelLoader( + model_loader_req_shm, name_shm, version_shm, config_shm, files_shm)); +} + +ModelLoader::ModelLoader( + AllocatedSharedMemory& model_loader_req_shm, + std::unique_ptr& name_shm, std::unique_ptr& version_shm, + std::unique_ptr& config_shm, std::unique_ptr& files_shm) + : model_loader_req_shm_(std::move(model_loader_req_shm)), + name_shm_(std::move(name_shm)), version_shm_(std::move(version_shm)), + config_shm_(std::move(config_shm)), files_shm_(std::move(files_shm)) +{ + model_loader_req_shm_ptr_ = model_loader_req_shm_.data_.get(); + name_ = name_shm_->String(); + version_ = version_shm_->String(); + config_ = config_shm_->String(); + files_ = files_shm_->UnorderedMap(); + unload_dependents_ = model_loader_req_shm_ptr_->unload_dependents; +} +#ifdef TRITON_PB_STUB +void +ModelLoader::SendLoadModelRequest() +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory model_loader_msg_shm; + + try { + stub->SendMessage( + model_loader_msg_shm, PYTHONSTUB_LoadModelRequest, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to load model: " + std::string(pb_exception.what())); + } +} + +void +ModelLoader::SendUnloadModelRequest() +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory model_loader_msg_shm; + try { + stub->SendMessage( + model_loader_msg_shm, PYTHONSTUB_UnloadModelRequest, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to unload model: " + std::string(pb_exception.what())); + } +} + +bool +ModelLoader::SendModelReadinessRequest() +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + SaveToSharedMemory(stub->ShmPool()); + ModelLoaderMessage* model_loader_msg = nullptr; + AllocatedSharedMemory model_loader_msg_shm; + try { + stub->SendMessage( + model_loader_msg_shm, PYTHONSTUB_ModelReadinessRequest, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to check model readiness: " + std::string(pb_exception.what())); + } + + model_loader_msg = model_loader_msg_shm.data_.get(); + return model_loader_msg->is_model_ready; +} + +void +LoadModel( + const std::string& name, const std::string& config, const py::object& files) +{ + std::unordered_map files_map; + + if (!files.is_none()) { + if 
(!py::isinstance(files)) { + throw PythonBackendException( + "failed to load model '" + name + + "', files should be a dictionary of file paths and file contents"); + } + + py::dict files_dict = py::cast(files); + for (const auto& item : files_dict) { + std::string key = py::cast(item.first); + py::bytes value = py::cast(item.second); + std::string content(value); + files_map[key] = content; + } + } + + ModelLoader model_loader(name, config, files_map); + model_loader.SendLoadModelRequest(); +} + +void +UnloadModel(const std::string& name, const bool unload_dependents) +{ + ModelLoader model_loader(name, unload_dependents); + model_loader.SendUnloadModelRequest(); +} + +bool +IsModelReady(const std::string& name, const std::string& version) +{ + ModelLoader model_loader(name, version); + return model_loader.SendModelReadinessRequest(); +} +#else +void +ModelLoader::LoadModel(TRITONSERVER_Server* server) +{ + std::string path = ""; + std::string file_content = ""; + std::vector const_params; + if (!config_.empty()) { + const_params.emplace_back(TRITONSERVER_ParameterNew( + "config", TRITONSERVER_PARAMETER_STRING, config_.c_str())); + } + if (!files_.empty()) { + for (auto& file : files_) { + path = file.first; + file_content = file.second; + const_params.emplace_back(TRITONSERVER_ParameterBytesNew( + path.c_str(), file_content.data(), file_content.size())); + } + } + + THROW_IF_TRITON_ERROR(TRITONSERVER_ServerLoadModelWithParameters( + server, name_.c_str(), const_params.data(), const_params.size())); + + for (const auto param : const_params) { + TRITONSERVER_ParameterDelete(const_cast(param)); + } +} + +void +ModelLoader::UnloadModel(TRITONSERVER_Server* server) +{ + if (unload_dependents_) { + THROW_IF_TRITON_ERROR( + TRITONSERVER_ServerUnloadModelAndDependents(server, name_.c_str())); + } else { + THROW_IF_TRITON_ERROR( + TRITONSERVER_ServerUnloadModel(server, name_.c_str())); + } +} + +bool +ModelLoader::IsModelReady(TRITONSERVER_Server* server) +{ + bool is_ready = false; + int64_t model_version = GetModelVersionFromString(version_); + THROW_IF_TRITON_ERROR(TRITONSERVER_ServerModelIsReady( + server, name_.c_str(), model_version, &is_ready)); + return is_ready; +} + +int64_t +ModelLoader::GetModelVersionFromString(const std::string& version_string) +{ + int64_t version = -1; + if (!version_string.empty()) { + try { + version = std::stol(version_string); + } + catch (std::exception& e) { + throw PythonBackendException( + "failed to get model version from specified version string '" + + version_string + "' (details: " + e.what() + + "), version should be an integral value > 0"); + } + + if (version < 0) { + throw PythonBackendException( + "failed to get model version from specified version string '" + + version_string + "', version should be an integral value > 0"); + } + } + return version; +} +#endif +}}} // namespace triton::backend::python diff --git a/src/model_loader.h b/src/model_loader.h new file mode 100644 index 00000000..e4fe9fd6 --- /dev/null +++ b/src/model_loader.h @@ -0,0 +1,165 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include + +#include "ipc_message.h" +#include "pb_map.h" +#include "pb_string.h" +#include "pb_utils.h" + +#ifdef TRITON_PB_STUB +#include +namespace py = pybind11; +#else +#include "triton/core/tritonserver.h" +#endif + +namespace triton { namespace backend { namespace python { + +// The 'ModelLoaderRequestShm' struct is utilized by the 'ModelLoader' class for +// saving the essential data to shared memory and for loading the data from +// shared memory in order to reconstruct the 'ModelLoader' object. +struct ModelLoaderRequestShm { + // The shared memory handle of the model name in PbString format. + bi::managed_external_buffer::handle_t name_shm_handle; + // The shared memory handle of the model version in PbString format. + bi::managed_external_buffer::handle_t version_shm_handle; + // The flag to unload the dependent models. + bool unload_dependents; + // The shared memory handle of the config in PbString format. + bi::managed_external_buffer::handle_t config_shm_handle; + // The shared memory handle of the files in PbMap format. + bi::managed_external_buffer::handle_t files_shm_handle; +}; + +class ModelLoader { + public: + ModelLoader( + const std::string& name, const std::string& config, + const std::unordered_map& files) + : name_(name), version_(""), config_(config), files_(files), + unload_dependents_(false) + { + } + + ModelLoader(const std::string& name, const bool unload_dependents) + : name_(name), version_(""), config_(""), files_({}), + unload_dependents_(unload_dependents) + { + } + + ModelLoader(const std::string& name, const std::string& version) + : name_(name), version_(version), config_(""), files_({}), + unload_dependents_(false) + { + } + + /// Save ModelLoader object to shared memory. + /// \param shm_pool Shared memory pool to save the ModelLoader object. + void SaveToSharedMemory(std::unique_ptr& shm_pool); + + /// Create a ModelLoader object from shared memory. + /// \param shm_pool Shared memory pool + /// \param handle Shared memory handle of the ModelLoader. + /// \return Returns the ModelLoaders in the specified request_handle + /// location. + static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, + bi::managed_external_buffer::handle_t handle); +#ifdef TRITON_PB_STUB + /// Send a request to load the model. 
+ void SendLoadModelRequest(); + + /// Send a request to unload the model. + void SendUnloadModelRequest(); + + /// Send a request to check if the model is ready. + bool SendModelReadinessRequest(); +#else + /// Use Triton C API to load the model. + /// \param server The Triton server object. + void LoadModel(TRITONSERVER_Server* server); + + /// Use Triton C API to unload the model. + /// \param server The Triton server object. + void UnloadModel(TRITONSERVER_Server* server); + + /// Use Triton C API to check if the model is ready. + /// \param server The Triton server object. + /// \return Returns true if the model is ready. + bool IsModelReady(TRITONSERVER_Server* server); + + /// Get the model version from the version string. + /// \param version_string The version string. + /// \return Returns the model version in uint64_t. + int64_t GetModelVersionFromString(const std::string& version_string); +#endif + /// Disallow copying the ModelLoader object. + DISALLOW_COPY_AND_ASSIGN(ModelLoader); + + private: + // The private constructor for creating a Metric object from shared memory. + ModelLoader( + AllocatedSharedMemory& model_loader_req_shm, + std::unique_ptr& name_shm, + std::unique_ptr& version_shm, + std::unique_ptr& config_shm, std::unique_ptr& files_shm); + + // The name of the model. + std::string name_; + // The version of the model. + std::string version_; + // The configuration of the model. + std::string config_; + // The files of the model. + std::unordered_map files_; + // The flag to unload the dependent models. + bool unload_dependents_; + + // // Shared Memory Data Structures + AllocatedSharedMemory model_loader_req_shm_; + ModelLoaderRequestShm* model_loader_req_shm_ptr_; + bi::managed_external_buffer::handle_t shm_handle_; + std::unique_ptr name_shm_; + std::unique_ptr version_shm_; + std::unique_ptr config_shm_; + std::unique_ptr files_shm_; +}; + +#ifdef TRITON_PB_STUB +// The binding functions for the Python stub. 
+void LoadModel( + const std::string& name, const std::string& config, + const py::object& files = py::none()); +void UnloadModel(const std::string& name, const bool unload_dependents); +bool IsModelReady(const std::string& name, const std::string& version); +#endif + +}}}; // namespace triton::backend::python diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 3d39f005..4f62a5bb 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -43,6 +43,7 @@ #include #include +#include "model_loader.h" #include "pb_error.h" #include "pb_map.h" #include "pb_preferred_memory.h" @@ -434,6 +435,15 @@ Stub::StubSetup() py::setattr( python_backend_utils, "MetricFamily", c_python_backend_utils.attr("MetricFamily")); + py::setattr( + python_backend_utils, "load_model", + c_python_backend_utils.attr("load_model")); + py::setattr( + python_backend_utils, "unload_model", + c_python_backend_utils.attr("unload_model")); + py::setattr( + python_backend_utils, "is_model_ready", + c_python_backend_utils.attr("is_model_ready")); c_python_backend_utils.attr("shared_memory") = py::cast(shm_pool_.get()); @@ -1203,74 +1213,6 @@ Stub::EnqueueUtilsMessage( stub_to_parent_message_cv_.notify_one(); } -void -Stub::PrepareCustomMetricsMessage( - AllocatedSharedMemory& custom_metrics_msg_shm, - CustomMetricsMessage** custom_metrics_msg) -{ - custom_metrics_msg_shm = shm_pool_->Construct(); - *custom_metrics_msg = custom_metrics_msg_shm.data_.get(); - new (&((*custom_metrics_msg)->mu)) bi::interprocess_mutex; - new (&((*custom_metrics_msg)->cv)) bi::interprocess_condition; - (*custom_metrics_msg)->waiting_on_stub = false; - (*custom_metrics_msg)->is_error_set = false; - (*custom_metrics_msg)->has_error = false; -} - -void -Stub::SendCustomMetricsMessage( - CustomMetricsMessage** custom_metrics_msg, - PYTHONSTUB_CommandType command_type, - bi::managed_external_buffer::handle_t handle) -{ - AllocatedSharedMemory custom_metrics_msg_shm; - PrepareCustomMetricsMessage(custom_metrics_msg_shm, custom_metrics_msg); - - (*custom_metrics_msg)->message = handle; - - std::unique_ptr ipc_message = - IPCMessage::Create(shm_pool_, false /* inline_response */); - ipc_message->Command() = command_type; - ipc_message->Args() = custom_metrics_msg_shm.handle_; - - std::unique_lock guard{stub_to_parent_message_mu_}; - { - ScopedDefer _([&ipc_message, custom_metrics_msg] { - { - bi::scoped_lock guard{ - (*custom_metrics_msg)->mu}; - (*custom_metrics_msg)->waiting_on_stub = false; - (*custom_metrics_msg)->cv.notify_all(); - } - }); - - { - bi::scoped_lock guard{(*custom_metrics_msg)->mu}; - SendIPCUtilsMessage(ipc_message); - while (!(*custom_metrics_msg)->waiting_on_stub) { - (*custom_metrics_msg)->cv.wait(guard); - } - } - } - if ((*custom_metrics_msg)->has_error) { - if ((*custom_metrics_msg)->is_error_set) { - std::unique_ptr pb_string = PbString::LoadFromSharedMemory( - shm_pool_, (*custom_metrics_msg)->error); - std::string err_message = - std::string( - "Failed to process the custom metrics request for model '" + - name_ + "', message: ") + - pb_string->String(); - throw PythonBackendException(err_message); - } else { - std::string err_message = std::string( - "Failed to process the custom metrics request for model '" + name_ + - "'."); - throw PythonBackendException(err_message); - } - } -} - cudaStream_t Stub::GetProxyStream(const int& device_id) { @@ -1600,10 +1542,21 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::arg("kind").none(false)) .def( "Metric", &MetricFamily::CreateMetric, - py::arg("labels").none(false) = 
py::dict()); + py::arg("labels").none(true) = py::none()); module.attr("MetricFamily").attr("COUNTER") = MetricKind::COUNTER; module.attr("MetricFamily").attr("GAUGE") = MetricKind::GAUGE; + module.def( + "load_model", &LoadModel, py::arg("model_name").none(false), + py::arg("config").none(false) = "", + py::arg("files").none(true) = py::none()); + module.def( + "unload_model", &UnloadModel, py::arg("model_name").none(false), + py::arg("unload_dependents").none(false) = false); + module.def( + "is_model_ready", &IsModelReady, py::arg("model_name").none(false), + py::arg("model_version").none(false) = ""); + // This class is not part of the public API for Python backend. This is only // used for internal testing purposes. py::class_(module, "SharedMemory") diff --git a/src/pb_stub.h b/src/pb_stub.h index f5af89c9..031a058f 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -292,16 +292,18 @@ class Stub { void EnqueueUtilsMessage( std::unique_ptr utils_msg_payload); - /// Send the custom metrics message to the python backend - void SendCustomMetricsMessage( - CustomMetricsMessage** custom_metrics_msg, + /// Send the message to the python backend. MessageType should be either + // 'MetricFamilyMessage', 'MetricMessage' or 'ModelLoaderMessage'. + template + void SendMessage( + AllocatedSharedMemory& msg_shm, PYTHONSTUB_CommandType command_type, bi::managed_external_buffer::handle_t handle); - /// Helper function to prepare the custom metrics message - void PrepareCustomMetricsMessage( - AllocatedSharedMemory& custom_metrics_msg_shm, - CustomMetricsMessage** custom_metrics_msg); + /// Helper function to prepare the message. MessageType should be either + // 'MetricFamilyMessage', 'MetricMessage' or 'ModelLoaderMessage'. + template + void PrepareMessage(AllocatedSharedMemory& msg_shm); /// Helper function to retrieve a proxy stream for dlpack synchronization /// for provided device @@ -348,4 +350,69 @@ class Stub { std::mutex dlpack_proxy_stream_pool_mu_; std::unordered_map dlpack_proxy_stream_pool_; }; + +template +void +Stub::PrepareMessage(AllocatedSharedMemory& msg_shm) +{ + msg_shm = shm_pool_->Construct(); + MessageType* msg = msg_shm.data_.get(); + new (&(msg->mu)) bi::interprocess_mutex; + new (&(msg->cv)) bi::interprocess_condition; + msg->waiting_on_stub = false; + msg->is_error_set = false; + msg->has_error = false; +} + +template +void +Stub::SendMessage( + AllocatedSharedMemory& msg_shm, + PYTHONSTUB_CommandType command_type, + bi::managed_external_buffer::handle_t handle) +{ + PrepareMessage(msg_shm); + MessageType* msg = msg_shm.data_.get(); + msg->message = handle; + + std::unique_ptr ipc_message = + IPCMessage::Create(shm_pool_, false /* inline_response */); + ipc_message->Command() = command_type; + ipc_message->Args() = msg_shm.handle_; + + std::unique_lock guard{stub_to_parent_message_mu_}; + { + ScopedDefer _([&ipc_message, msg] { + { + bi::scoped_lock guard{msg->mu}; + msg->waiting_on_stub = false; + msg->cv.notify_all(); + } + }); + + { + bi::scoped_lock guard{msg->mu}; + SendIPCUtilsMessage(ipc_message); + while (!msg->waiting_on_stub) { + msg->cv.wait(guard); + } + } + } + if (msg->has_error) { + if (msg->is_error_set) { + std::unique_ptr pb_string = + PbString::LoadFromSharedMemory(shm_pool_, msg->error); + std::string err_message = + std::string( + "Failed to process the request for model '" + name_ + + "', message: ") + + pb_string->String(); + throw PythonBackendException(err_message); + } else { + std::string err_message = std::string( + "Failed to process the 
request for model '" + name_ + "'."); + throw PythonBackendException(err_message); + } + } +} }}} // namespace triton::backend::python diff --git a/src/pb_utils.h b/src/pb_utils.h index 9645085b..06d4e4ea 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -200,6 +200,14 @@ struct CustomMetricsMessage : SendMessageBase { void* address; }; +struct ModelLoaderMessage : SendMessageBase { + bi::managed_external_buffer::handle_t message; + bool has_error; + bool is_error_set; + bi::managed_external_buffer::handle_t error; + bool is_model_ready; +}; + struct ResponseSenderBase { bi::interprocess_mutex mu; bi::interprocess_condition cv; diff --git a/src/python_be.cc b/src/python_be.cc index cc07b473..b72e1b35 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -27,6 +27,7 @@ #include "gpu_buffers.h" #include "infer_payload.h" +#include "model_loader.h" #include "pb_log.h" namespace triton { namespace backend { namespace python { @@ -811,6 +812,12 @@ ModelInstanceState::StubToParentMQMonitor() ProcessMetricRequest(message); break; } + case PYTHONSTUB_ModelReadinessRequest: + case PYTHONSTUB_LoadModelRequest: + case PYTHONSTUB_UnloadModelRequest: { + ProcessModelControlRequest(message); + break; + } default: { LOG_MESSAGE( TRITONSERVER_LOG_ERROR, "Unexpected message type received."); @@ -893,47 +900,46 @@ ModelInstanceState::ProcessBLSCleanupRequest( } } -template +template void -ModelInstanceState::ProcessCustomMetricsRequest( - const std::unique_ptr& message, - std::function&, CustomMetricsMessage*)> - request_handler) +ModelInstanceState::ProcessMessage( + const std::unique_ptr& ipc_message, + std::function&, MessageType*)> request_handler) { - AllocatedSharedMemory metrics_message = - Stub()->ShmPool()->Load(message->Args()); - CustomMetricsMessage* metrics_message_ptr = - reinterpret_cast(metrics_message.data_.get()); + AllocatedSharedMemory message = + Stub()->ShmPool()->Load(ipc_message->Args()); + MessageType* message_ptr = + reinterpret_cast(message.data_.get()); std::unique_ptr pb_error_message; PythonBackendException pb_exception(std::string{}); - std::unique_ptr metrics_object = - T::LoadFromSharedMemory(Stub()->ShmPool(), metrics_message_ptr->message); + std::unique_ptr object = + T::LoadFromSharedMemory(Stub()->ShmPool(), message_ptr->message); - ScopedDefer _([metrics_message_ptr] { + ScopedDefer _([message_ptr] { { - bi::scoped_lock guard{metrics_message_ptr->mu}; - metrics_message_ptr->waiting_on_stub = true; - metrics_message_ptr->cv.notify_all(); - while (metrics_message_ptr->waiting_on_stub) { - metrics_message_ptr->cv.wait(guard); + bi::scoped_lock guard{message_ptr->mu}; + message_ptr->waiting_on_stub = true; + message_ptr->cv.notify_all(); + while (message_ptr->waiting_on_stub) { + message_ptr->cv.wait(guard); } } }); try { - request_handler(metrics_object, metrics_message_ptr); + request_handler(object, message_ptr); } catch (const PythonBackendException& exception) { pb_exception = exception; } if (pb_exception.what() != std::string{}) { - metrics_message_ptr->has_error = true; + message_ptr->has_error = true; LOG_IF_EXCEPTION( pb_error_message = PbString::Create(Stub()->ShmPool(), pb_exception.what())); - metrics_message_ptr->error = pb_error_message->ShmHandle(); - metrics_message_ptr->is_error_set = true; + message_ptr->error = pb_error_message->ShmHandle(); + message_ptr->is_error_set = true; } } @@ -942,7 +948,7 @@ ModelInstanceState::ProcessMetricFamilyRequest( const std::unique_ptr& message) { auto command = message->Command(); - ProcessCustomMetricsRequest( + 
ProcessMessage( message, [this, command]( std::unique_ptr& metric_family, CustomMetricsMessage* metrics_message_ptr) { @@ -968,7 +974,7 @@ ModelInstanceState::ProcessMetricRequest( const std::unique_ptr& message) { auto command = message->Command(); - ProcessCustomMetricsRequest( + ProcessMessage( message, [this, command]( std::unique_ptr& metric, CustomMetricsMessage* metrics_message_ptr) { @@ -999,6 +1005,37 @@ ModelInstanceState::ProcessMetricRequest( }); } +void +ModelInstanceState::ProcessModelControlRequest( + const std::unique_ptr& message) +{ + auto command = message->Command(); + ModelState* model_state = reinterpret_cast(Model()); + ProcessMessage( + message, [this, command, model_state]( + std::unique_ptr& model_loader, + ModelLoaderMessage* model_loader_msg_ptr) { + switch (command) { + case PYTHONSTUB_LoadModelRequest: { + model_loader->LoadModel(model_state->TritonServer()); + break; + } + case PYTHONSTUB_UnloadModelRequest: { + model_loader->UnloadModel(model_state->TritonServer()); + break; + } + case PYTHONSTUB_ModelReadinessRequest: { + model_loader_msg_ptr->is_model_ready = + model_loader->IsModelReady(model_state->TritonServer()); + break; + } + default: { + throw PythonBackendException("Unknown model loader request kind"); + } + } + }); +} + void ModelInstanceState::StartMonitor() { diff --git a/src/python_be.h b/src/python_be.h index b1a44b23..a3b8d303 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -390,18 +390,22 @@ class ModelInstanceState : public BackendModelInstance { // Process the bls decoupled cleanup request void ProcessBLSCleanupRequest(const std::unique_ptr& message); - // Process a custom metrics request. The function 'request_handler' is invoked - // to handle the request. T should be either 'MetricFamily' or 'Metric'. - template - void ProcessCustomMetricsRequest( + // Process a message. The function 'request_handler' is invoked + // to handle the request. T should be either 'MetricFamily', 'Metric' or + // 'ModelLoader', and MessageType should be either 'MetricFamilyMessage', + // 'MetricMessage' or 'ModelLoaderMessage'. + template + void ProcessMessage( const std::unique_ptr& message, - std::function&, CustomMetricsMessage*)> - request_handler); + std::function&, MessageType*)> request_handler); // Process a metric family request void ProcessMetricFamilyRequest(const std::unique_ptr& message); // Process a metric request void ProcessMetricRequest(const std::unique_ptr& message); + + // Process a model control request + void ProcessModelControlRequest(const std::unique_ptr& message); }; }}} // namespace triton::backend::python From 240714bf083f58e93b067172878118a8b36e2a75 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Thu, 6 Jul 2023 09:58:42 -0700 Subject: [PATCH 121/216] Update the documentation to mention that the InferenceResponse object should not be reused for multiple requests (#268) --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 91b34b9b..5703e726 100644 --- a/README.md +++ b/README.md @@ -337,11 +337,12 @@ class TritonPythonModel: responses = [] # Every Python backend must iterate through list of requests and create - # an instance of pb_utils.InferenceResponse class for each of them. You - # should avoid storing any of the input Tensors in the class attributes - # as they will be overridden in subsequent inference requests. You can - # make a copy of the underlying NumPy array and store it if it is - # required. 
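A minimal sketch of the pattern this comment block describes is shown below: build a fresh `pb_utils.InferenceResponse` for every request and copy any input data that must outlive the call. The model and the tensor names `INPUT0`/`OUTPUT0` are hypothetical; only `pb_utils` calls that already appear elsewhere in this backend are assumed.

```python
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        responses = []
        for request in requests:
            in0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            data = in0.as_numpy()
            # If the data must persist beyond this call, keep a copy of the
            # NumPy array rather than the input tensor itself; the input
            # buffers are reused by subsequent inference requests.
            self.last_input = data.copy()
            # Create a brand-new response object for every request instead of
            # reusing one across requests.
            out0 = pb_utils.Tensor("OUTPUT0", data)
            responses.append(pb_utils.InferenceResponse(output_tensors=[out0]))
        return responses
```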
+ # an instance of pb_utils.InferenceResponse class for each of them. + # Reusing the same pb_utils.InferenceResponse object for multiple + # requests may result in segmentation faults. You should avoid storing + # any of the input Tensors in the class attributes as they will be + # overridden in subsequent inference requests. You can make a copy of + # the underlying NumPy array and store it if it is required. for request in requests: # Perform inference on the request and append it to responses # list... From 34cc89f0a943e2abb532c2109f63c282d8a2fe1a Mon Sep 17 00:00:00 2001 From: tanmayv25 Date: Fri, 9 Jun 2023 12:40:25 -0700 Subject: [PATCH 122/216] Interfacing with platform models, support TF model serving --- CMakeLists.txt | 7 + src/pb_stub.cc | 147 +++-- src/pb_stub.h | 33 +- src/python_be.cc | 10 + src/python_be.h | 4 + .../tensorflow_savedmodel/model.py | 536 ++++++++++++++++++ src/stub_launcher.cc | 20 +- src/stub_launcher.h | 1 + 8 files changed, 709 insertions(+), 49 deletions(-) create mode 100644 src/resources/platform_handlers/tensorflow_savedmodel/model.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 917400a8..93a7ae60 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -307,6 +307,13 @@ install( ${INSTALL_CONFIGDIR} ) +install( + DIRECTORY + src/resources/platform_handlers + DESTINATION + ${CMAKE_INSTALL_PREFIX}/backends/python +) + install( FILES src/resources/triton_python_backend_utils.py diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 4f62a5bb..eb561dec 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -82,11 +82,9 @@ Stub::Instantiate( const std::string& shm_region_name, const std::string& model_path, const std::string& model_version, const std::string& triton_install_path, bi::managed_external_buffer::handle_t ipc_control_handle, - const std::string& name) + const std::string& name, const std::string& platform) { - model_path_ = model_path; - model_version_ = model_version; - triton_install_path_ = triton_install_path; + model_context_.Init(model_path, platform, triton_install_path, model_version); name_ = name; health_mutex_ = nullptr; initialized_ = false; @@ -378,30 +376,7 @@ Stub::StubSetup() { py::module sys = py::module_::import("sys"); - std::string model_name = - model_path_.substr(model_path_.find_last_of("/") + 1); - - // Model name without the .py extension - auto dotpy_pos = model_name.find_last_of(".py"); - if (dotpy_pos == std::string::npos || dotpy_pos != model_name.size() - 1) { - throw PythonBackendException( - "Model name must end with '.py'. Model name is \"" + model_name + - "\"."); - } - - // The position of last character of the string that is searched for is - // returned by 'find_last_of'. Need to manually adjust the position. - std::string model_name_trimmed = model_name.substr(0, dotpy_pos - 2); - std::string model_path_parent = - model_path_.substr(0, model_path_.find_last_of("/")); - std::string model_path_parent_parent = - model_path_parent.substr(0, model_path_parent.find_last_of("/")); - std::string python_backend_folder = triton_install_path_; - sys.attr("path").attr("append")(model_path_parent); - sys.attr("path").attr("append")(model_path_parent_parent); - sys.attr("path").attr("append")(python_backend_folder); - sys = py::module_::import( - (std::string(model_version_) + "." 
+ model_name_trimmed).c_str()); + model_context_.StubSetup(sys); py::module python_backend_utils = py::module_::import("triton_python_backend_utils"); @@ -467,6 +442,13 @@ Stub::AutoCompleteModelConfig( py::module_::import("triton_python_backend_utils"); py::object model_config = python_backend_utils.attr("ModelConfig")(pb_string_shm->String()); + python_backend_utils.def( + "get_model_dir", + []() { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + return stub->GetModelDir(); + }, + py::return_value_policy::reference); if (py::hasattr(sys.attr("TritonPythonModel"), "auto_complete_config")) { model_config = sys.attr("TritonPythonModel") @@ -511,6 +493,13 @@ Stub::Initialize(bi::managed_external_buffer::handle_t map_handle) py::object TritonPythonModel = sys.attr("TritonPythonModel"); deserialize_bytes_ = python_backend_utils.attr("deserialize_bytes_tensor"); serialize_bytes_ = python_backend_utils.attr("serialize_byte_tensor"); + python_backend_utils.def( + "get_model_dir", + []() { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + return stub->GetModelDir(); + }, + py::return_value_policy::reference); model_instance_ = TritonPythonModel(); std::unordered_map map; @@ -648,7 +637,7 @@ Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr) response_batch_shm_ptr->is_error_set = false; if (!py::hasattr(model_instance_, "execute")) { - std::string message = "Python model " + model_path_ + + std::string message = "Python model " + model_context_.PythonModelPath() + " does not implement `execute` method."; throw PythonBackendException(message); } @@ -735,7 +724,7 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) LoadRequestsFromSharedMemory(request_batch_shm_ptr); if (!py::hasattr(model_instance_, "execute")) { - std::string message = "Python model " + model_path_ + + std::string message = "Python model " + model_context_.PythonModelPath() + " does not implement `execute` method."; throw PythonBackendException(message); } @@ -1566,6 +1555,99 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) module, "TritonModelException"); } + +void +ModelContext::Init( + const std::string& model_path, const std::string& platform, + const std::string& triton_install_path, const std::string& model_version) +{ + bool python_model_found = false; + std::string platform_model_path; + + if (platform != "NONE") { + platform_model_path = + triton_install_path + "/platform_handlers/" + platform + "/model.py"; + // Check if model file exists in the path. + struct stat buffer; + if (stat(platform_model_path.c_str(), &buffer) == 0) { + // Use the Platform model for serving the model. + python_model_found = true; + type_ = ModelType::PLATFORM; + python_model_path_ = platform_model_path; + // Trimming the model name from the model path, the platform model + // will populate the expected default model file name into model_path_. + model_dir_ = model_path.substr(0, model_path.find_last_of("\\/")); + } else { + LOG_WARN << "Unable to find model(handler) \'" << platform_model_path + << "\' for platform field \'" << platform << "\'"; + } + } + + if (!python_model_found) { + python_model_path_ = model_path; + // Check if model file exists in this path. + struct stat buffer; + if (stat(python_model_path_.c_str(), &buffer) == 0) { + python_model_found = true; + type_ = ModelType::DEFAULT; + } + // Initializing here for consistency with platform model case. 
+ model_dir_ = model_path.substr(0, model_path.find_last_of("\\/")); + } + + if (!python_model_found) { + if (platform != "NONE") { + throw PythonBackendException( + ("Python model file not found in neither \'" + platform_model_path + + "\' nor \'" + model_path + "\'")); + } else { + throw PythonBackendException( + ("Python model file not found in \'" + model_path + "\'")); + } + } + + python_backend_folder_ = triton_install_path; + model_version_ = model_version; + platform_ = platform; +} + +void +ModelContext::StubSetup(py::module& sys) +{ + std::string model_name = + python_model_path_.substr(python_model_path_.find_last_of("/") + 1); + + // Model name without the .py extension + auto dotpy_pos = model_name.find_last_of(".py"); + if (dotpy_pos == std::string::npos || dotpy_pos != model_name.size() - 1) { + throw PythonBackendException( + "Model name must end with '.py'. Model name is \"" + model_name + + "\"."); + } + // The position of last character of the string that is searched for is + // returned by 'find_last_of'. Need to manually adjust the position. + std::string model_name_trimmed = model_name.substr(0, dotpy_pos - 2); + + if (type_ == ModelType::DEFAULT) { + std::string model_path_parent = + python_model_path_.substr(0, python_model_path_.find_last_of("/")); + std::string model_path_parent_parent = + model_path_parent.substr(0, model_path_parent.find_last_of("/")); + sys.attr("path").attr("append")(model_path_parent); + sys.attr("path").attr("append")(model_path_parent_parent); + sys.attr("path").attr("append")(python_backend_folder_); + sys = py::module_::import( + (std::string(model_version_) + "." + model_name_trimmed).c_str()); + } else { + std::string platform_model_dir( + python_backend_folder_ + "/platform_handlers/" + platform_ + "/"); + sys.attr("path").attr("append")(platform_model_dir); + sys.attr("path").attr("append")(python_backend_folder_); + sys = py::module_::import(model_name_trimmed.c_str()); + } +} + + extern "C" { int @@ -1580,7 +1662,7 @@ main(int argc, char** argv) signal(SIGINT, SignalHandler); signal(SIGTERM, SignalHandler); - // Path to model.py + // Path to model std::string model_path = argv[1]; std::string shm_region_name = argv[2]; int64_t shm_default_size = std::stol(argv[3]); @@ -1608,13 +1690,14 @@ main(int argc, char** argv) int64_t shm_growth_size = std::stol(argv[4]); std::string triton_install_path = argv[6]; std::string name = argv[8]; + std::string platform = argv[9]; std::unique_ptr& stub = Stub::GetOrCreateInstance(); try { stub->Instantiate( shm_growth_size, shm_default_size, shm_region_name, model_path, model_version, argv[6] /* triton install path */, - std::stoi(argv[7]) /* IPCControl handle */, name); + std::stoi(argv[7]) /* IPCControl handle */, name, platform); } catch (const PythonBackendException& pb_exception) { LOG_INFO << "Failed to preinitialize Python stub: " << pb_exception.what(); diff --git a/src/pb_stub.h b/src/pb_stub.h index 031a058f..6d047d29 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -161,6 +161,30 @@ class LogMessage { #define LOG_FL(FN, LN, LVL) LogMessage((char*)(FN), LN, LVL).stream() + +class ModelContext { + public: + // Scans and establishes path for serving the python model. + void Init( + const std::string& model_path, const std::string& platform, + const std::string& triton_install_path, const std::string& model_version); + // Sets up the python stub with appropriate paths. 
+ void StubSetup(py::module& sys); + + std::string& PythonModelPath() { return python_model_path_; } + std::string& ModelDir() { return model_dir_; } + + private: + std::string python_model_path_; + std::string model_dir_; + std::string model_version_; + std::string python_backend_folder_; + std::string platform_; + + enum ModelType { DEFAULT, PLATFORM }; + ModelType type_; +}; + // The payload for the stub_to_parent message queue. This struct serves as a // wrapper for different types of messages so that they can be sent through the // same buffer. @@ -185,7 +209,7 @@ class Stub { const std::string& shm_region_name, const std::string& model_path, const std::string& model_version, const std::string& triton_install_path, bi::managed_external_buffer::handle_t ipc_control_handle, - const std::string& model_instance_name); + const std::string& model_instance_name, const std::string& platform); /// Get the health of the stub process. bool& Health(); @@ -199,6 +223,9 @@ class Stub { /// Setup for the stub process py::module StubSetup(); + /// Return the path to the model + py::str GetModelDir() { return model_context_.ModelDir(); } + /// Set the model configuration for auto-complete void AutoCompleteModelConfig( bi::managed_external_buffer::handle_t string_handle, @@ -315,10 +342,8 @@ class Stub { bi::interprocess_mutex* parent_mutex_; bi::interprocess_condition* parent_cond_; bi::interprocess_mutex* health_mutex_; - std::string model_path_; - std::string model_version_; + ModelContext model_context_; std::string name_; - std::string triton_install_path_; IPCControlShm* ipc_control_; std::unique_ptr shm_pool_; py::object model_instance_; diff --git a/src/python_be.cc b/src/python_be.cc index b72e1b35..793998e8 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -1715,6 +1715,7 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) python_execution_env_ = ""; force_cpu_only_input_tensors_ = true; decoupled_ = false; + platform_ = ""; void* bstate; THROW_IF_BACKEND_MODEL_ERROR(TRITONBACKEND_BackendState(backend, &bstate)); @@ -1755,6 +1756,14 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) } } + triton::common::TritonJson::Value platform; + if (model_config_.Find("platform", &platform)) { + auto error = platform.AsString(&platform_); + if (error != nullptr) { + throw BackendModelException(error); + } + } + // Skip the FORCE_CPU_ONLY_INPUT_TENSORS variable if it doesn't exits. std::string force_cpu_only_input_tensor; error = nullptr; @@ -1830,6 +1839,7 @@ ModelState::ValidateModelConfig() return nullptr; } + extern "C" { TRITONSERVER_Error* diff --git a/src/python_be.h b/src/python_be.h index a3b8d303..825c45de 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -237,6 +237,9 @@ class ModelState : public BackendModel { // Is decoupled API being used. bool IsDecoupled() { return decoupled_; } + // Returns the value in the platform field + std::string Platform() { return platform_; } + // Launch auto-complete stub process. 
TRITONSERVER_Error* LaunchAutoCompleteStubProcess(); @@ -252,6 +255,7 @@ class ModelState : public BackendModel { std::string python_execution_env_; bool force_cpu_only_input_tensors_; bool decoupled_; + std::string platform_; std::unique_ptr auto_complete_stub_; }; diff --git a/src/resources/platform_handlers/tensorflow_savedmodel/model.py b/src/resources/platform_handlers/tensorflow_savedmodel/model.py new file mode 100644 index 00000000..24b95472 --- /dev/null +++ b/src/resources/platform_handlers/tensorflow_savedmodel/model.py @@ -0,0 +1,536 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import os + +try: + import tensorflow as tf + from tensorflow.core.framework import types_pb2 + from tensorflow.python.client import session + from tensorflow.python.saved_model import loader, signature_constants + from tensorflow.python.tools import saved_model_utils +except ModuleNotFoundError as error: + raise RuntimeError( + "Missing/Incomplete tensorflow package installation..." + ) from error + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. 
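+
+# At a high level, this handler works in three stages: `auto_complete_config`
+# reads the SavedModel signature_def and fills in input/output tensors (and,
+# when the signature allows, batching settings) that are missing from the
+# model config; `initialize` validates the config against that signature and
+# loads the SavedModel into a TensorFlow v1 session; `execute` feeds each
+# request's inputs to the session and wraps the session outputs in
+# pb_utils.InferenceResponse objects.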
+import triton_python_backend_utils as pb_utils + +TF_STRING_TO_TRITON = { + "DT_BOOL": "TYPE_BOOL", + "DT_UINT8": "TYPE_UINT8", + "DT_UINT16": "TYPE_UINT16", + "DT_UINT32": "TYPE_UINT32", + "DT_UINT64": "TYPE_UINT64", + "DT_INT8": "TYPE_INT8", + "DT_INT16": "TYPE_INT16", + "DT_INT32": "TYPE_INT32", + "DT_INT64": "TYPE_INT64", + "DT_HALF": "TYPE_FP16", + "DT_FLOAT": "TYPE_FP32", + "DT_DOUBLE": "TYPE_FP64", + "DT_STRING": "TYPE_STRING", +} + +_DEFAULT_ARTIFACT_NAME = "model.savedmodel" + + +def _get_savedmodel_path(config): + artifact_name = config["default_model_filename"] + if not artifact_name: + artifact_name = _DEFAULT_ARTIFACT_NAME + + savedmodel_path = os.path.join(pb_utils.get_model_dir(), artifact_name) + if not os.path.exists(savedmodel_path): + raise pb_utils.TritonModelException( + f"No savedmodel dir found in " + savedmodel_path + ) + + return savedmodel_path + + +def _parse_signature_def(config): + if config["parameters"]: + if "TF_SIGNATURE_DEF" in config["parameters"].keys(): + return config["parameters"]["TF_SIGNATURE_DEF"]["string_value"] + return None + + +def _parse_graph_tag(config): + if config["parameters"]: + if "TF_GRAPH_TAG" in config["parameters"].keys(): + return config["parameters"]["TF_GRAPH_TAG"]["string_value"] + return None + + +def _parse_num_intra_threads(config): + if config["parameters"]: + if "TF_NUM_INTRA_THREADS" in config["parameters"].keys(): + return int(config["parameters"]["TF_NUM_INTRA_THREADS"]["string_value"]) + return None + + +def _parse_num_inter_threads(config): + if config["parameters"]: + if "TF_NUM_INTER_THREADS" in config["parameters"].keys(): + return int(config["parameters"]["TF_NUM_INTER_THREADS"]["string_value"]) + return None + + +def _get_truth_value(string_value): + val = string_value.casefold() + if val == "yes" or val == "1" or val == "on" or val == "true": + return True + else: + return False + + +def _parse_use_per_session_thread(config): + if config["parameters"]: + if "USE_PER_SESSION_THREAD" in config["parameters"].keys(): + val = config["parameters"]["USE_PER_SESSION_THREAD"]["string_value"] + return _get_truth_value(val) + return False + + +def _get_signature_def(savedmodel_path, config): + tag_sets = saved_model_utils.get_saved_model_tag_sets(savedmodel_path) + graph_tag = _parse_graph_tag(config) + if graph_tag is None: + if "serve" in tag_sets[0]: + graph_tag = "serve" + else: + graph_tag = tag_sets[0][0] + + meta_graph_def = saved_model_utils.get_meta_graph_def(savedmodel_path, graph_tag) + signature_def_map = meta_graph_def.signature_def + signature_def_k = _parse_signature_def(config) + if signature_def_k is None: + serving_default = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY + if serving_default in signature_def_map.keys(): + signature_def_k = serving_default + else: + signature_def_k = signature_def_map.keys()[0] + + if signature_def_k not in signature_def_map.keys(): + raise pb_utils.TritonModelException( + f" The model does not include the signature_def '" + signature_def_k + "'" + ) + + return graph_tag, signature_def_map[signature_def_k] + + +def _has_batch_dim(tensor_info): + if tensor_info.tensor_shape.unknown_rank: + return True + elif tensor_info.tensor_shape.dim[0].size == -1: + return True + else: + return False + + +def _get_batching_hint_from_signature(signature_def): + for input_info in signature_def.inputs.values(): + if not _has_batch_dim(input_info): + return False + + for output_info in signature_def.outputs.values(): + if not _has_batch_dim(output_info): + return False + + return 
True + + +def _convert_proto_to_dict_tensor(name, tensor_proto, batching_enabled): + tensor_dict = {} + tensor_dict["name"] = name + dtype_dict = {value: key for (key, value) in types_pb2.DataType.items()} + tensor_dict["data_type"] = TF_STRING_TO_TRITON[dtype_dict[tensor_proto.dtype]] + if tensor_proto.tensor_shape.unknown_rank: + # FIXME: Fix the handling of unknown rank + dims = [-1] + else: + dims = [dim.size for dim in tensor_proto.tensor_shape.dim] + if batching_enabled: + tensor_dict["dims"] = dims[1:] + else: + tensor_dict["dims"] = dims + + return tensor_dict + + +def _validate_datatype(tf_dtype, triton_datatype, tensor_name): + dtype_dict = {value: key for (key, value) in types_pb2.DataType.items()} + if triton_datatype != TF_STRING_TO_TRITON[dtype_dict[tf_dtype]]: + raise pb_utils.TritonModelException( + f" Mismatch between datatype for tensor '" + + tensor_name + + "', expected '" + + TF_STRING_TO_TRITON[dtype_dict[tf_dtype]] + + "', got '" + + triton_datatype + ) + + +def _validate_dims(tf_shape, triton_dims, batching_enabled, tensor_name): + if tf_shape.unknown_rank: + return + + index = 0 + offset = 1 if batching_enabled else 0 + if len(tf_shape.dim) != (offset + len(triton_dims)): + raise pb_utils.TritonModelException( + f" Mismatch in the number of dimension with the model for tensor '" + + tensor_name + + "', expected " + + str(len(tf_shape.dim) - offset) + + ", got " + + str(len(triton_dims)) + ) + + for dim in tf_shape.dim: + if index == 0 and batching_enabled: + if dim.size != -1: + raise pb_utils.TritonModelException( + f" The first dimension of a batching model should be dynamic, " + "however, got shape of first dimension in model for tensor '" + + tensor_name + + "' as " + + str(dim.size) + ) + else: + if dim.size != triton_dims[index - offset]: + raise pb_utils.TritonModelException( + f" Mismatch in " + + str(index - offset) + + "th dimension for tensor '" + + tensor_name + + "', expected " + + str(dim.size) + + ", got " + + str(triton_dims[index - offset]) + ) + index = index + 1 + + +def _validate_model_config(model_config, signature_def): + signature_supports_batching = _get_batching_hint_from_signature(signature_def) + if (not signature_supports_batching) and (model_config["max_batch_size"] != 0): + raise pb_utils.TritonModelException( + f" The model signature does not support batching, yet model config" + " has max_batch_size set to '" + str(model_config["max_batch_size"]) + "'" + ) + + batching_enabled = model_config["max_batch_size"] != 0 + + if model_config["platform"] != "tensorflow_savedmodel": + raise pb_utils.TritonModelException( + f"[INTERNAL]: The platform field for using this model should be set to" + " 'tensorflow_savedmodel' in model config, got '" + + model_config["platform"] + + "'" + ) + if model_config["batch_input"]: + raise pb_utils.TritonModelException( + f"The platform model '" + + model_config["platform"] + + "' does not support model with batch_input" + ) + if model_config["batch_output"]: + raise pb_utils.TritonModelException( + f"The platform model '" + + model_config["platform"] + + "' does not support model with batch_output" + ) + + # Validate input tensors + input_tensor_info = signature_def.inputs + config_input_names = [input["name"] for input in model_config["input"]] + for input_name in input_tensor_info.keys(): + if input_name not in config_input_names: + raise pb_utils.TritonModelException( + f" Missing input tensor configuration for tensor '" + input_name + "'" + ) + for input in model_config["input"]: + config_input_name = 
input["name"] + if config_input_name not in input_tensor_info.keys(): + supported_names = "" + for valid_name in input_tensor_info.keys(): + supported_names = supported_names + ";" + valid_name + raise pb_utils.TritonModelException( + f" No input tensor with name '" + + config_input_name + + "', only supported input names are " + + supported_names + ) + _validate_datatype( + input_tensor_info[config_input_name].dtype, + input["data_type"], + config_input_name, + ) + _validate_dims( + input_tensor_info[config_input_name].tensor_shape, + input["dims"], + batching_enabled, + config_input_name, + ) + + # Validate output tensors + output_tensor_info = signature_def.outputs + for output in model_config["output"]: + config_output_name = output["name"] + if config_output_name not in output_tensor_info.keys(): + supported_names = "" + for valid_name in output_tensor_info.keys(): + supported_names = supported_names + ";" + valid_name + raise pb_utils.TritonModelException( + f" No output tensor with name '" + + config_output_name + + "', only supported output names are " + + supported_names + ) + + _validate_datatype( + output_tensor_info[config_output_name].dtype, + output["data_type"], + config_output_name, + ) + _validate_dims( + output_tensor_info[config_output_name].tensor_shape, + output["dims"], + batching_enabled, + config_output_name, + ) + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + @staticmethod + def auto_complete_config(auto_complete_model_config): + config = auto_complete_model_config.as_dict() + + if config["platform"] != "tensorflow_savedmodel": + raise pb_utils.TritonModelException( + f"[INTERNAL]: The platform field for using this model should be set to" + " 'tensorflow_savedmodel' in model config, got '" + + config["platform"] + + "'" + ) + if config["batch_input"]: + raise pb_utils.TritonModelException( + f"The platform model '" + + config["platform"] + + "' does not support model with batch_input" + ) + if config["batch_output"]: + raise pb_utils.TritonModelException( + f"The platform model '" + + config["platform"] + + "' does not support model with batch_output" + ) + + savedmodel_path = _get_savedmodel_path(config) + + if savedmodel_path is None: + raise pb_utils.TritonModelException( + f"[INTERNAL]: The path to the framework model should be" " provided" + ) + + batching_enabled = False + if config["max_batch_size"] != 0: + batching_enabled = True + + _, signature_def = _get_signature_def(savedmodel_path, config) + + input_tensor_info = signature_def.inputs + output_tensor_info = signature_def.outputs + + batching_hint = False + if not batching_enabled: + batching_hint = _get_batching_hint_from_signature(signature_def) + + # FIXME: Currently the presence of dynamic batch dimension is + # being treated as sufficient proof for enabling batching. + # Need to visit the tensors that are already provided in config + # to confirm the hint + batching_enabled = batching_hint + + config_input_names = [input["name"] for input in config["input"]] + config_output_names = [output["name"] for output in config["output"]] + + # TODO: Add auto-completion of partial tensor specification. 
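+ # Note: the loops below only auto-complete tensors that are entirely
+ # missing from the config. Tensors that are partially specified (for
+ # example, listed by name but with incomplete fields) are left as-is,
+ # which is what the TODO above refers to.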
+ for input_name in input_tensor_info.keys(): + if input_name not in config_input_names: + auto_complete_model_config.add_input( + _convert_proto_to_dict_tensor( + input_name, input_tensor_info[input_name], batching_enabled + ) + ) + + for output_name in output_tensor_info.keys(): + if output_name not in config_output_names: + auto_complete_model_config.add_output( + _convert_proto_to_dict_tensor( + output_name, output_tensor_info[output_name], batching_enabled + ) + ) + + if batching_enabled: + if config["max_batch_size"] == 0: + auto_complete_model_config.set_max_batch_size(4) + auto_complete_model_config.set_dynamic_batching() + + return auto_complete_model_config + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # You must parse model_config. JSON string is not parsed here + self.model_config = model_config = json.loads(args["model_config"]) + + savedmodel_path = _get_savedmodel_path(model_config) + + self.model_name = args["model_name"] + self.logger = pb_utils.Logger + self.logger.log_info("Initializing model for " + self.model_name) + + if args["model_instance_kind"] != "CPU": + self.logger.log_warn( + "GPU instances are not supported by this backend. Falling back to KIND_CPU for " + + self.model_name + ) + + tag_set, signature_def = _get_signature_def(savedmodel_path, model_config) + _validate_model_config(model_config, signature_def) + + self.signature_def = signature_def + self.input_tensor_info = self.signature_def.inputs + output_tensor_info = self.signature_def.outputs + + # Get the input output names from model config + self.input_names = [input["name"] for input in model_config["input"]] + self.output_names = [output["name"] for output in model_config["output"]] + + # Get the output tensor names + self.output_tensor_names = [ + output_tensor_info[output_name].name for output_name in self.output_names + ] + + # load the session model + # FIXME Add more configuration options for the model. + sess_config = tf.compat.v1.ConfigProto( + inter_op_parallelism_threads=_parse_num_inter_threads(model_config), + intra_op_parallelism_threads=_parse_num_intra_threads(model_config), + use_per_session_threads=_parse_use_per_session_thread(model_config), + ) + self.tf_session = session.Session(graph=tf.Graph(), config=sess_config) + loader.load(self.tf_session, [tag_set], savedmodel_path) + + # Hoding the input dict for caching input tensor data for + # better inference performance + self.input_feed_dict = {} + + def execute(self, requests): + """`execute` MUST be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. 
Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + # FIXME: Instead of iterating through each request, run + # the inference as a single batch. + for request in requests: + # Prepare the input feed for the model. + for input_name in self.input_names: + self.input_feed_dict[ + self.input_tensor_info[input_name].name + ] = pb_utils.get_input_tensor_by_name(request, input_name).as_numpy() + + # FIXME: Add GPU Tensor handling. DLpack should be utilized + # for better performance + outputs = self.tf_session.run( + self.output_tensor_names, feed_dict=self.input_feed_dict + ) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. + output_tensors = [] + for i, output in enumerate(outputs): + output_tensors.append(pb_utils.Tensor(self.output_names[i], output)) + + inference_response = pb_utils.InferenceResponse( + output_tensors=output_tensors + ) + responses.append(inference_response) + + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. + """ + if self.tf_session is not None: + self.tf_session.close + self.logger.log_info("Removed model instance for " + self.model_name) diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index fc3bacd4..de4dd46c 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -62,6 +62,10 @@ StubLauncher::Initialize(ModelState* model_state) model_state->ModelConfig().Write(&model_config_buffer_); is_decoupled_ = model_state->IsDecoupled(); model_repository_path_ = model_state->RepositoryPath(); + platform_ = model_state->Platform(); + if (platform_.empty()) { + platform_ = "NONE"; + } // Atomically increase and read the stub process count to avoid shared memory // region name collision @@ -73,11 +77,10 @@ StubLauncher::Initialize(ModelState* model_state) model_version_ = model_state->Version(); std::stringstream ss; + ss << model_repository_path_ << "/" << model_version_ << "/"; std::string artifact_name; RETURN_IF_ERROR(model_state->ModelConfig().MemberAsString( "default_model_filename", &artifact_name)); - ss << model_repository_path_ << "/" << model_version_ << "/"; - if (artifact_name.size() > 0) { ss << artifact_name; } else { @@ -86,15 +89,6 @@ StubLauncher::Initialize(ModelState* model_state) } model_path_ = ss.str(); - struct stat buffer; - - // Check if model.py exists - if (stat(model_path_.c_str(), &buffer) != 0) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("model.py does not exist in the model repository path: " + model_path_) - .c_str()); - } // Path to the extracted Python env std::string python_execution_env = ""; @@ -244,7 +238,7 @@ StubLauncher::Launch() << ":$LD_LIBRARY_PATH " << python_backend_stub << " " << model_path_ << " " << shm_region_name_ << " " << shm_default_byte_size_ << " " << shm_growth_byte_size_ << " " << parent_pid_ << " " << python_lib_ - << " " << ipc_control_handle_ << " " << stub_name; + << " " << ipc_control_handle_ << " " << stub_name << " " << platform_; 
ipc_control_->uses_env = true; bash_argument = ss.str(); } else { @@ -252,7 +246,7 @@ StubLauncher::Launch() ss << " exec " << python_backend_stub << " " << model_path_ << " " << shm_region_name_ << " " << shm_default_byte_size_ << " " << shm_growth_byte_size_ << " " << parent_pid_ << " " << python_lib_ - << " " << ipc_control_handle_ << " " << stub_name; + << " " << ipc_control_handle_ << " " << stub_name << " " << platform_; bash_argument = ss.str(); } LOG_MESSAGE( diff --git a/src/stub_launcher.h b/src/stub_launcher.h index fc5b6578..89f35422 100644 --- a/src/stub_launcher.h +++ b/src/stub_launcher.h @@ -161,6 +161,7 @@ class StubLauncher { std::string shm_region_name_; std::string model_repository_path_; std::string model_path_; + std::string platform_; const std::string stub_process_kind_; std::string model_name_; const std::string model_instance_name_; From 34bb9d81317ef95283ff10f672f5bf85c2e26f72 Mon Sep 17 00:00:00 2001 From: Katherine Yang <80359429+jbkyang-nvi@users.noreply.github.com> Date: Fri, 7 Jul 2023 14:36:27 -0700 Subject: [PATCH 123/216] Allow uncompressed conda execution enviroments (#266) Allow uncompressed conda execution environments and add documentation for custom execution environments. --- README.md | 23 ++++++++++++++++++----- src/pb_env.cc | 16 ++++++++++++++++ 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 5703e726..a5ff153e 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ any C++ code. - [`initialize`](#initialize) - [`execute`](#execute) - [Default Mode](#default-mode) - - [Error Handling](#error-handling) + - [Error Handling](#error-handling) - [Decoupled mode](#decoupled-mode) - [Use Cases](#use-cases) - [Known Issues](#known-issues) @@ -62,8 +62,8 @@ any C++ code. - [Running Multiple Instances of Triton Server](#running-multiple-instances-of-triton-server) - [Business Logic Scripting](#business-logic-scripting) - [Using BLS with Decoupled Models](#using-bls-with-decoupled-models) - - [Using BLS with Stateful Models](#using-bls-with-stateful-models) - [Model Loading API](#model-loading-api) + - [Using BLS with Stateful Models](#using-bls-with-stateful-models) - [Limitation](#limitation) - [Interoperability and GPU Support](#interoperability-and-gpu-support) - [`pb_utils.Tensor.to_dlpack() -> PyCapsule`](#pb_utilstensorto_dlpack---pycapsule) @@ -72,7 +72,9 @@ any C++ code. - [Input Tensor Device Placement](#input-tensor-device-placement) - [Frameworks](#frameworks) - [PyTorch](#pytorch) + - [PyTorch Determinism](#pytorch-determinism) - [TensorFlow](#tensorflow) + - [TensorFlow Determinism](#tensorflow-determinism) - [Custom Metrics](#custom-metrics) - [Examples](#examples) - [AddSub in NumPy](#addsub-in-numpy) @@ -81,7 +83,8 @@ any C++ code. - [Business Logic Scripting](#business-logic-scripting-1) - [Preprocessing](#preprocessing) - [Decoupled Models](#decoupled-models) - - [Auto-complete Config](#auto-complete-config) + - [Model Instance Kind](#model-instance-kind) + - [Auto-complete config](#auto-complete-config) - [Custom Metrics](#custom-metrics-1) - [Running with Inferentia](#running-with-inferentia) - [Logging](#logging) @@ -678,7 +681,7 @@ above. If you want to create a tar file that contains all your Python dependencies or you want to use different Python environments for each Python model you need to create a *Custom Execution Environment* in Python backend. 
-Currently, Python backend only supports +Currently, Python backend supports [conda-pack](https://conda.github.io/conda-pack/) for this purpose. [conda-pack](https://conda.github.io/conda-pack/) ensures that your conda environment is portable. You can create a tar file for your conda environment @@ -704,7 +707,17 @@ If this variable is not exported and similar packages are installed outside your conda environment, your tar file may not contain all the dependencies required for an isolated Python environment. -After creating the tar file from the conda environment, you need to tell Python +Alternatively, Python backend also supports unpacked conda execution +environments, given it points to an activation script to setup the conda +environment. To do this, the execution environment can be first packed using +[conda-pack](https://conda.github.io/conda-pack/) and then unpacked, or created +using [conda create -p](https://docs.conda.io/projects/conda/en/latest/commands/create.html). +In this case, the conda activation script is located in: +```$path_to_conda_pack/lib/python/site-packages/conda_pack/scripts/posix/activate``` +This speeds up the server loading time for models. + +After creating the packed file from the conda environment or creating a conda +environment with a custom activation script, you need to tell Python backend to use that environment for your model. You can do this by adding the lines below to the `config.pbtxt` file: diff --git a/src/pb_env.cc b/src/pb_env.cc index a4278102..0b6eb9ec 100644 --- a/src/pb_env.cc +++ b/src/pb_env.cc @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -253,6 +254,21 @@ EnvironmentManager::ExtractIfNotExtracted(std::string env_path) bool env_extracted = false; bool re_extraction = false; + + // If the path is not a conda-packed file, then bypass the extraction process + struct stat info; + if (stat(canonical_env_path, &info) != 0) { + throw PythonBackendException( + std::string("stat() of : ") + canonical_env_path + " returned error."); + } else if (S_ISDIR(info.st_mode)) { + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("Returning canonical path since EXECUTION_ENV_PATH does " + "not contain compressed path. 
Path: ") + + canonical_env_path) + .c_str()); + return canonical_env_path; + } const auto env_itr = env_map_.find(canonical_env_path); if (env_itr != env_map_.end()) { // Check if the environment has been modified and would From d448340a52b5cbee054dbe9d356ee43745cba87e Mon Sep 17 00:00:00 2001 From: Tanmay Verma Date: Fri, 7 Jul 2023 17:30:16 -0700 Subject: [PATCH 124/216] Fallback to CPU tensors if cuInit call fails (#270) * Fallback to CPU tensors if cuInit fails * Update src/python_be.cc Co-authored-by: Iman Tabrizian --------- Co-authored-by: Iman Tabrizian --- src/pb_utils.cc | 11 ++++++----- src/pb_utils.h | 3 +++ src/python_be.cc | 10 +++++++++- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/pb_utils.cc b/src/pb_utils.cc index 523f4fed..089f4cf0 100644 --- a/src/pb_utils.cc +++ b/src/pb_utils.cc @@ -104,11 +104,12 @@ CUDAHandler::CUDAHandler() if (cuda_err != CUDA_SUCCESS) { const char* error_string; (*cu_get_error_string_fn_)(cuda_err, &error_string); - throw PythonBackendException( - std::string( - "failed to get cuda pointer device attribute: " + - std::string(error_string)) - .c_str()); + error_str_ = std::string("failed to call cuInit: ") + error_string; + int status = dlclose(dl_open_handle_); + if (status != 0) { + throw PythonBackendException("Failed to close the libcuda handle."); + } + dl_open_handle_ = nullptr; } } } diff --git a/src/pb_utils.h b/src/pb_utils.h index 06d4e4ea..1d651f3f 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -247,6 +247,7 @@ class CUDAHandler { private: std::mutex mu_; void* dl_open_handle_ = nullptr; + std::string error_str_; CUresult (*cu_pointer_get_attribute_fn_)( CUdeviceptr*, CUpointer_attribute, CUdeviceptr) = nullptr; CUresult (*cu_get_error_string_fn_)(CUresult, const char**) = nullptr; @@ -263,6 +264,8 @@ class CUDAHandler { CUDAHandler(CUDAHandler const&) = delete; void operator=(CUDAHandler const&) = delete; bool IsAvailable(); + const std::string& GetErrorString() const { return error_str_; } + void ClearErrorString() { return error_str_.clear(); } void PointerGetAttribute( CUdeviceptr* start_address, CUpointer_attribute attr, CUdeviceptr device_ptr); diff --git a/src/python_be.cc b/src/python_be.cc index 793998e8..6f25e024 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -462,7 +462,15 @@ ModelInstanceState::GetInputTensor( CUDAHandler& cuda_handler = CUDAHandler::getInstance(); // If CUDA driver API is not available, the input tensors will be moved to // CPU. - if (!cuda_handler.IsAvailable()) { + if (!cuda_handler.IsAvailable() && !cpu_only_tensors) { + if (!cuda_handler.GetErrorString().empty()) { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, (std::string( + "Forcing CPU only input tensors: " + + cuda_handler.GetErrorString())) + .c_str()); + } + cuda_handler.ClearErrorString(); cpu_only_tensors = true; } #endif From 1f7f6942fd59f3fb1a7f9bb8f5a8ae6197ef577b Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Mon, 10 Jul 2023 13:19:37 -0700 Subject: [PATCH 125/216] Update the documentation about the model unloading issue (#271) * Update the doc to mention the model unloading issue * Address comment --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a5ff153e..e7eb107f 100644 --- a/README.md +++ b/README.md @@ -1213,7 +1213,8 @@ class TritonPythonModel: # If the model is no longer needed, you can unload it. You can also # specify whether the dependents of the model should also be unloaded by # setting the 'unload_dependents' parameter to True. 
The default value - # is False. + # is False. Need to be careful when unloading the model as it can affect + # other model instances or other models that depend on it. pb_utils.unload_model(model_name=self.model_name, unload_dependents=True) From 8c93267e7cae79e828678359b08d528018546e13 Mon Sep 17 00:00:00 2001 From: dyastremsky <58150256+dyastremsky@users.noreply.github.com> Date: Mon, 10 Jul 2023 19:35:49 -0700 Subject: [PATCH 126/216] Update iSort to use Black style (#267) * Add Black profile for isort * Remove clang-format * Restore clang-format * Fix yaml spacing * Normalize spacing * Normalize config indentation * Update line limit in clang-format to 80 chars * Update workflows to run on every PR --- .clang-format | 3 +- .github/workflows/codeql.yml | 6 ---- .github/workflows/pre-commit.yml | 2 -- .pre-commit-config.yaml | 48 ++++++++++++++++---------------- pyproject.toml | 1 + 5 files changed, 27 insertions(+), 33 deletions(-) diff --git a/.clang-format b/.clang-format index 98c64973..bf96a593 100644 --- a/.clang-format +++ b/.clang-format @@ -2,6 +2,7 @@ BasedOnStyle: Google IndentWidth: 2 +ColumnLimit: 80 ContinuationIndentWidth: 4 UseTab: Never MaxEmptyLinesToKeep: 2 @@ -34,4 +35,4 @@ BinPackArguments: true BinPackParameters: true ConstructorInitializerAllOnOneLineOrOnePerLine: false -IndentCaseLabels: true \ No newline at end of file +IndentCaseLabels: true diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 4f3f98cc..745a3373 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -27,13 +27,7 @@ name: "CodeQL" on: - push: - branches: [ 'main' ] pull_request: - # The branches below must be a subset of the branches above - branches: [ 'main' ] - schedule: - - cron: '0 1 * * 1-6' jobs: analyze: diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 40cbd972..ab4bd951 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -28,8 +28,6 @@ name: pre-commit on: pull_request: - push: - branches: [main] jobs: pre-commit: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9c0fff8a..298baab6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,41 +33,41 @@ repos: - repo: https://github.com/psf/black rev: 23.1.0 hooks: - - id: black - types_or: [python, cython] + - id: black + types_or: [python, cython] - repo: https://github.com/PyCQA/flake8 rev: 5.0.4 hooks: - - id: flake8 - args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501] - types_or: [python, cython] + - id: flake8 + args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501] + types_or: [python, cython] - repo: https://github.com/pre-commit/mirrors-clang-format rev: v16.0.5 hooks: - - id: clang-format - types_or: [c, c++, cuda, proto, textproto, java] - args: ["-fallback-style=none", "-style=file", "-i"] + - id: clang-format + types_or: [c, c++, cuda, proto, textproto, java] + args: ["-fallback-style=none", "-style=file", "-i"] - repo: https://github.com/codespell-project/codespell rev: v2.2.4 hooks: - - id: codespell - additional_dependencies: [tomli] - args: ["--toml", "pyproject.toml"] - exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$) + - id: codespell + additional_dependencies: [tomli] + args: ["--toml", "pyproject.toml"] + exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$) # More details about these pre-commit hooks here: # https://pre-commit.com/hooks.html - repo: 
https://github.com/pre-commit/pre-commit-hooks rev: v4.4.0 hooks: - - id: check-case-conflict - - id: check-executables-have-shebangs - - id: check-merge-conflict - - id: check-json - - id: check-toml - - id: check-yaml - - id: check-shebang-scripts-are-executable - - id: end-of-file-fixer - types_or: [c, c++, cuda, proto, textproto, java, python] - - id: mixed-line-ending - - id: requirements-txt-fixer - - id: trailing-whitespace + - id: check-case-conflict + - id: check-executables-have-shebangs + - id: check-merge-conflict + - id: check-json + - id: check-toml + - id: check-yaml + - id: check-shebang-scripts-are-executable + - id: end-of-file-fixer + types_or: [c, c++, cuda, proto, textproto, java, python] + - id: mixed-line-ending + - id: requirements-txt-fixer + - id: trailing-whitespace diff --git a/pyproject.toml b/pyproject.toml index d51b9f62..5e8749f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ builtin = "clear" quiet-level = 3 [tool.isort] +profile = "black" use_parentheses = true multi_line_output = 3 include_trailing_comma = true From 30b19c7bdad2c63babff6befd400ce9cfb5c420c Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Tue, 11 Jul 2023 11:47:20 -0700 Subject: [PATCH 127/216] Update documentation: clarify model loading API in auto_complete_config (#273) --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e7eb107f..03b45d85 100644 --- a/README.md +++ b/README.md @@ -1227,7 +1227,8 @@ been running, which means that the BLS model should not be loaded during server startup. You can use different [client endpoints](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_model_repository.md) to load the model after the server has been started. The model loading API is -currently not supported during the `finalize` phase. +currently not supported during the `auto_complete_config` and `finalize` +functions. ## Using BLS with Stateful Models From 23d1a215cb2d639c37adb8bf77e345c2ef90f4d7 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Tue, 11 Jul 2023 14:44:50 -0700 Subject: [PATCH 128/216] Update documentation: Make the known issue clearer (#275) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 03b45d85..eb4e22a9 100644 --- a/README.md +++ b/README.md @@ -549,7 +549,7 @@ for more details on how to host a decoupled model. ##### Known Issues -* Currently, async BLS requests are not supported in decoupled mode. +* Currently, decoupled Python models can not make async infer requests. 
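A minimal sketch of the model loading API (`pb_utils.load_model`, `pb_utils.is_model_ready`, `pb_utils.unload_model`) follows. The dependent model name `"onnx_model"` is hypothetical, and the calls are kept inside `initialize` and `execute` because the API is not supported in `auto_complete_config` or `finalize`.

```python
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def initialize(self, args):
        # Hypothetical dependent model; it must exist in the model repository.
        self.dependent_model = "onnx_model"
        pb_utils.load_model(model_name=self.dependent_model)
        if not pb_utils.is_model_ready(model_name=self.dependent_model):
            raise pb_utils.TritonModelException(
                f"'{self.dependent_model}' did not load successfully")

    def execute(self, requests):
        # BLS calls against self.dependent_model would go here. If the model
        # is no longer needed it can also be unloaded, for example:
        #   pb_utils.unload_model(model_name=self.dependent_model)
        return [
            pb_utils.InferenceResponse(output_tensors=[]) for _ in requests
        ]
```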
### `finalize`

From c4b3e639729a2d239d85f736aa5df955bd9234c9 Mon Sep 17 00:00:00 2001
From: Tanmay Verma 
Date: Fri, 14 Jul 2023 16:28:37 -0700
Subject: [PATCH 129/216] Document TF platform handler (#276)

* Document TF platform handler

* Move the documentation on TF platform handler

* Update src/resources/platform_handlers/tensorflow_savedmodel/README.md

Co-authored-by: Kris Hung 

* Update src/resources/platform_handlers/tensorflow_savedmodel/README.md

Co-authored-by: Kris Hung 

* Address review comments

* Fix

* Add a disclaimer note

* Update src/resources/platform_handlers/tensorflow_savedmodel/README.md

Co-authored-by: Neelay Shah 

---------

Co-authored-by: Kris Hung 
Co-authored-by: Neelay Shah 
---
 .../tensorflow_savedmodel/README.md           | 87 +++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 src/resources/platform_handlers/tensorflow_savedmodel/README.md

diff --git a/src/resources/platform_handlers/tensorflow_savedmodel/README.md b/src/resources/platform_handlers/tensorflow_savedmodel/README.md
new file mode 100644
index 00000000..23199e7b
--- /dev/null
+++ b/src/resources/platform_handlers/tensorflow_savedmodel/README.md
@@ -0,0 +1,87 @@
+
+
+# Serving TensorFlow SavedModels using Python Backend \[Experimental\]
+
+*NOTE*: This feature is subject to change and removal, and should not
+be used in production.
+
+Starting from 23.07, we are adding experimental support for loading
+and serving of models in [TensorFlow SavedModel](https://www.tensorflow.org/guide/saved_model)
+format via Python backend. The `model.savedmodel` can be provided within
+the Triton server model repository without a `model.py`, and the backend will
+automatically use a pre-built Python model ([`model.py`](model.py)) to load
+and serve the provided TF SavedModel. The handler can [auto-complete](../../../../README.md#auto_complete_config)
+the missing model configuration.
+
+The model repository structure can look like:
+
+```
+model_repository/
+`-- resnet_v1_50_savedmodel
+    |-- 1
+    |   `-- model.savedmodel
+    |       |-- saved_model.pb
+    |       `-- variables
+    |-- config.pbtxt
+    `-- resnet50_labels.txt
+```
+
+In order to use this feature, make sure that the [TensorFlow pip package](https://pypi.org/project/tensorflow/2.13.0/)
+is available in the same Python environment.
+
+```
+pip install tensorflow==2.13.0
+```
+
+Alternatively, you can create a
+[Python Execution Environment](#using-custom-python-execution-environments)
+with the TensorFlow dependency.
+
+By default, Triton will use the [TensorFlow backend](https://github.com/triton-inference-server/tensorflow_backend)
+to load and serve the saved model. In order to use the Python backend with
+TensorFlow SavedModel, the [model configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md)
+should explicitly provide the following settings:
+
+```
+backend: "python"
+platform: "tensorflow_savedmodel"
+```
+
+It has been observed that certain DL frameworks like TensorFlow do not release the entire
+memory allocated for loading a model back to the system when the model gets
+unloaded. This can be problematic when working with a large number of models and
+dynamically loading/unloading them. Using the Python backend for TF SavedModel serving
+allows the models to be loaded in a separate process, which ensures that the entire
+memory allocated within the process is released to the system upon a model
+unload.
+
+Following are a few known limitations of this feature:
+- GPU execution is not supported.
+- List of requests received in model [`execute`](../../../../README.md#execute) function are +not run in a single batch but one after the other. From b507a1a516e5c43eef959d365d7ca3e54e4540bb Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Wed, 19 Jul 2023 11:45:51 -0700 Subject: [PATCH 130/216] Add a note on the GLIBCXX_3.4.30 not found issue when using custom execution env (#280) --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index eb4e22a9..6c445d86 100644 --- a/README.md +++ b/README.md @@ -800,6 +800,15 @@ may use dependencies that are not available in the Triton container that you are using for deployment. For example, compiling the Python backend stub on an OS other than Ubuntu 22.04 can lead to unexpected errors. +7. If you encounter the "GLIBCXX_3.4.30 not found" error during runtime, we +recommend upgrading your conda version and installing `libstdcxx-ng=12` by +running `conda install -c conda-forge libstdcxx-ng=12 -y`. If this solution does +not resolve the issue, please feel free to open an issue on the +[GitHub issue page](https://github.com/triton-inference-server/server/issues) +following the provided +[instructions](https://github.com/triton-inference-server/server#reporting-problems-asking-questions). + + ## Error Handling If there is an error that affects the `initialize`, `execute`, or `finalize` From bae80b1e49759745208fd8c0aaf0bfece0261256 Mon Sep 17 00:00:00 2001 From: dsgibbons Date: Thu, 20 Jul 2023 05:37:48 +0930 Subject: [PATCH 131/216] Fix typo in client.py (#279) --- examples/add_sub/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/add_sub/client.py b/examples/add_sub/client.py index 1c08ae74..80fc4133 100644 --- a/examples/add_sub/client.py +++ b/examples/add_sub/client.py @@ -65,7 +65,7 @@ ) ) print( - "INPUT0 ({}) - INPUT1 ({}) = OUTPUT0 ({})".format( + "INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 ({})".format( input0_data, input1_data, output1_data ) ) From d9de83e7a6fb660dfa4af7773e07f9ff871075d8 Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Thu, 27 Jul 2023 16:53:52 -0400 Subject: [PATCH 132/216] Improving instance kind example (#283) --- examples/instance_kind/README.md | 4 ++-- examples/instance_kind/model.py | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/instance_kind/README.md b/examples/instance_kind/README.md index 67b5e2a3..360f72a6 100644 --- a/examples/instance_kind/README.md +++ b/examples/instance_kind/README.md @@ -84,11 +84,11 @@ folder is located), run the following command: docker run --gpus all --shm-size 1G -it --rm -p 8000:8000 -v `pwd`:/instance_kind nvcr.io/nvidia/tritonserver:-py3 /bin/bash ``` -Inside the container, we need to install `torch` and `pillow` to run +Inside the container, we need to install `torch`, `torchvision` and `pillow` to run this example. 
We recommend to use `pip` method for the installation: ``` -pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html pillow +pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html torchvision==0.14.0+cu117 pillow ``` Finally, we need to start the Triton Server: diff --git a/examples/instance_kind/model.py b/examples/instance_kind/model.py index 801a8593..baff8e7b 100644 --- a/examples/instance_kind/model.py +++ b/examples/instance_kind/model.py @@ -43,7 +43,12 @@ def initialize(self, args): not specified in the config file, then models will be loaded onto the default device of the framework. """ - self.device = "cuda" if args["model_instance_kind"] == "GPU" else "cpu" + # Here we set up the device onto which our model will beloaded, + # based on specified `model_instance_kind` and `model_instance_device_id` + # fields. + device = "cuda" if args["model_instance_kind"] == "GPU" else "cpu" + device_id = args["model_instance_device_id"] + self.device = f"{device}:{device_id}" # This example is configured to work with torch=1.13 # and torchvision=0.14. Thus, we need to provide a proper tag `0.14.1` # to make sure loaded Resnet50 is compatible with From 823f628d721f2600bc5f0edd7616c90618e7d84f Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 1 Aug 2023 14:03:00 -0700 Subject: [PATCH 133/216] Enable parallel instance loading backend attribute (#284) --- src/python_be.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/python_be.cc b/src/python_be.cc index 6f25e024..458e651a 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -2274,6 +2274,11 @@ TRITONBACKEND_GetBackendAttribute( backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_CPU, 0, nullptr, 0)); #endif + // This backend can safely handle parallel calls to + // TRITONBACKEND_ModelInstanceInitialize (thread-safe). + RETURN_IF_ERROR(TRITONBACKEND_BackendAttributeSetParallelModelInstanceLoading( + backend_attributes, true)); + return nullptr; } From a0ddfa911cd6cf80bf8c4ff4c1764e7f317e5f7d Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Fri, 4 Aug 2023 06:27:18 +0800 Subject: [PATCH 134/216] Add example to use unpacked conda execution environments (#285) Signed-off-by: Xiaodong Ye --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index 6c445d86..af5ef8f7 100644 --- a/README.md +++ b/README.md @@ -764,6 +764,23 @@ models In the example above, `$$TRITON_MODEL_DIRECTORY` is resolved to `$pwd/models/model_a`. +To accelerate the loading time of `model_a`, you can follow the steps below to +unpack the conda environment in the model folder: + +```bash +mkdir -p $pwd/models/model_a/python3.6 +tar -xvf $pwd/models/model_a/python3.6.tar.gz -C $pwd/models/model_a/python3.6 +``` + +Then you can change the `EXECUTION_ENV_PATH` to point to the unpacked directory: + +``` +parameters: { + key: "EXECUTION_ENV_PATH", + value: {string_value: "$$TRITON_MODEL_DIRECTORY/python3.6"} +} +``` + This is useful if you want to use S3, GCS, or Azure and you do not have access to the absolute path of the execution env that is stored in the cloud object storage service. 
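When pointing `EXECUTION_ENV_PATH` at a packed or unpacked environment as described above, it can help to confirm which interpreter the Python backend stub actually picked up. A minimal, illustrative `model.py` for such a sanity check might look like the sketch below; it is only an example, not part of any shipped model:

```python
import sys

import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def initialize(self, args):
        # Log the interpreter path and version seen by the stub so that a
        # misconfigured EXECUTION_ENV_PATH is easy to spot in the server log.
        pb_utils.Logger.log_info(
            "Python stub is running {} from {}".format(
                sys.version.split()[0], sys.executable
            )
        )

    def execute(self, requests):
        # Return an empty response per request; this model only exists to
        # verify the execution environment.
        return [pb_utils.InferenceResponse(output_tensors=[]) for _ in requests]
```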
From a9e6a778b3943f24bbc5c35da39ab7628f060c0f Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Mon, 7 Aug 2023 14:54:23 -0700 Subject: [PATCH 135/216] Add PyTorch platform handler (#282) * Add pytorch_libtorch platform handler * Add GPU and dlpack support * Add ability to auto find model class * Add TorchScript support * Fix inconsistent PyTorch output type * Update documentation * Add ability to disable torch.compile * Update comments * Torch tensors already have dlpack info * Fix typo * Simplify logic for checking is py class model * Update documentation * Change platform name * Add KIND_MODEL support * Enable torch infer mode * Enable setting torch parallelism parameters * Add ability to supply torch.compile parameters * Add ability to batch requests into a single tensor * Update docs * Use default value 1 for torch parallelism * Rename model.py.pt to model.pt * Rename to gather and scatter * Update documentation * Unify docs format * Fix docs format * Fix docs format * Unify gather and scatter interface * Link docs from primary Python docs * Update kind_model behavior * Simplify kind_model logic --- README.md | 4 + .../platform_handlers/pytorch/README.md | 132 +++++++ .../platform_handlers/pytorch/model.py | 323 ++++++++++++++++++ 3 files changed, 459 insertions(+) create mode 100644 src/resources/platform_handlers/pytorch/README.md create mode 100755 src/resources/platform_handlers/pytorch/model.py diff --git a/README.md b/README.md index af5ef8f7..147e1a34 100644 --- a/README.md +++ b/README.md @@ -1397,6 +1397,10 @@ this workflow. For a simple example of using PyTorch in a Python Backend model, see the [AddSubNet PyTorch example](#addsubnet-in-pytorch). +PyTorch models may be served directly without implementing the `model.py`, see +[Serving PyTorch models using Python Backend \[Experimental\]](src/resources/platform_handlers/pytorch/README.md) +for more details. + ### PyTorch Determinism When running PyTorch code, you may notice slight differences in output values diff --git a/src/resources/platform_handlers/pytorch/README.md b/src/resources/platform_handlers/pytorch/README.md new file mode 100644 index 00000000..0e9240b8 --- /dev/null +++ b/src/resources/platform_handlers/pytorch/README.md @@ -0,0 +1,132 @@ + + +# Serving PyTorch models using Python Backend \[Experimental\] + +**NOTE**: *This feature is subject to change and removal, and should not +be used in production.* + +Starting from 23.08, we are adding an experimental support for loading and +serving PyTorch models directly via Python backend. The model can be provided +within the triton server model repository, and a +[pre-built Python model](model.py) will be used to load and serve the PyTorch +model. + +## Model Layout + +The model repository should look like: + +``` +model_repository/ +`-- model_directory + |-- 1 + | |-- model.py + | `-- model.pt + `-- config.pbtxt +``` + +The `model.py` contains the class definition of the PyTorch model. The class +should extend the +[`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). +The `model.pt` may be optionally provided which contains the saved +[`state_dict`](https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-for-inference) +of the model. For serving TorchScript models, a `model.pt` TorchScript can be +provided in place of the `model.py` file. 
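To make the layout above concrete, a minimal `model.py` could look like the following sketch. The module itself is a made-up example; any single `torch.nn.Module` subclass whose `forward` matches the inputs and outputs declared in `config.pbtxt` follows the same pattern, and its weights can optionally be saved next to it with `torch.save(module.state_dict(), "model.pt")`:

```python
import torch


class AddBias(torch.nn.Module):
    # A toy module used only to illustrate the expected structure; the
    # platform handler looks for a torch.nn.Module subclass in model.py.
    def __init__(self):
        super().__init__()
        self.bias = torch.nn.Parameter(torch.zeros(1))

    def forward(self, input_tensor):
        # One input tensor in, one output tensor out, matching config.pbtxt.
        return input_tensor + self.bias
```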
+ +By default, Triton will use the +[PyTorch backend](https://github.com/triton-inference-server/pytorch_backend) to +load and serve PyTorch models. In order to serve from Python backend, +[model configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md) +should explicitly provide the following settings: + +``` +backend: "python" +platform: "pytorch" +``` + +## PyTorch Installation + +This feature will take advantage of the +[`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html#torch-compile) +optimization, make sure the +[PyTorch 2.0+ pip package](https://pypi.org/project/torch/2.0.1/) is available +in the same Python environment. + +``` +pip install torch==2.0.1 +``` +Alternatively, a +[Python Execution Environment](#using-custom-python-execution-environments) +with the PyTorch dependency may be used. + +## Customization + +The following PyTorch settings may be customized by setting parameters on the +`config.pbtxt`. + +[`torch.set_num_threads(int)`](https://pytorch.org/docs/stable/generated/torch.set_num_threads.html#torch.set_num_threads) +- Key: NUM_THREADS +- Value: The number of threads used for intraop parallelism on CPU. + +[`torch.set_num_interop_threads(int)`](https://pytorch.org/docs/stable/generated/torch.set_num_interop_threads.html#torch.set_num_interop_threads) +- Key: NUM_INTEROP_THREADS +- Value: The number of threads used for interop parallelism (e.g. in JIT +interpreter) on CPU. + +[`torch.compile()` parameters](https://pytorch.org/docs/stable/generated/torch.compile.html#torch-compile) +- Key: TORCH_COMPILE_OPTIONAL_PARAMETERS +- Value: Any of following parameter(s) encoded as a JSON object. + - fullgraph (*bool*): Whether it is ok to break model into several subgraphs. + - dynamic (*bool*): Use dynamic shape tracing. + - backend (*str*): The backend to be used. + - mode (*str*): Can be either "default", "reduce-overhead" or "max-autotune". + - options (*dict*): A dictionary of options to pass to the backend. + - disable (*bool*): Turn `torch.compile()` into a no-op for testing. + +For example: +``` +parameters: { + key: "NUM_THREADS" + value: { string_value: "4" } +} +parameters: { + key: "TORCH_COMPILE_OPTIONAL_PARAMETERS" + value: { string_value: "{\"disable\": true}" } +} +`````` + +## Limitations + +Following are few known limitations of this feature: +- Python functions optimizable by `torch.compile` may not be served directly in +the `model.py` file, they need to be enclosed by a class extending the +[`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). +- Model weights cannot be shared across multiple instances on the same GPU +device. +- When using `KIND_MODEL` as model instance kind, the default device of the +first parameter on the model is used. diff --git a/src/resources/platform_handlers/pytorch/model.py b/src/resources/platform_handlers/pytorch/model.py new file mode 100755 index 00000000..365599e0 --- /dev/null +++ b/src/resources/platform_handlers/pytorch/model.py @@ -0,0 +1,323 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import importlib +import json +import os + +try: + import torch +except ModuleNotFoundError as error: + raise RuntimeError( + "Missing/Incomplete PyTorch package installation... (Did you install PyTorch?)" + ) from error + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. +import triton_python_backend_utils as pb_utils + + +def _get_model_path(config): + filenames = ["model.py", "model.pt"] + if config["default_model_filename"]: + filenames.insert(0, config["default_model_filename"]) + for filename in filenames: + model_path = os.path.join(pb_utils.get_model_dir(), filename) + if os.path.exists(model_path): + return model_path + raise pb_utils.TritonModelException( + "No model found in " + pb_utils.get_model_dir() + "/" + str(filenames) + ) + + +def _get_model_data_path(model_path): + data_path_extensions = [".pt"] + model_path_no_extension = model_path[: -(len(model_path.split(".")[-1]) + 1)] + for extension in data_path_extensions: + data_path = model_path_no_extension + extension + if os.path.exists(data_path): + return data_path + # data file not provided + return "" + + +def _is_py_class_model(model_path): + return model_path[-3:] == ".py" + + +def _import_module_from_path(module_name, file_path): + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def _get_model_class_from_module(module): + names = dir(module) + for name in names: + attr = getattr(module, name) + try: + if issubclass(attr, torch.nn.Module): + return attr + except TypeError: + # attr may not be a class + pass + raise pb_utils.TritonModelException("Cannot find a subclass of torch.nn.Module") + + +def _parse_io_config(io_config): + io = [] + for conf in io_config: + io.append({"name": conf["name"]}) + return io + + +def _get_device_name(kind, device_id): + if kind == "GPU": + return "cuda:" + device_id + if kind == "CPU": + return "cpu" + # unspecified device + return "" + + +def _get_device(kind, device_id, model): + device_name = _get_device_name(kind, device_id) + if device_name == "": 
+ for param in model.parameters(): + return param.device + raise pb_utils.TritonModelException("Cannot determine model device") + return torch.device(device_name) + + +def _set_torch_parallelism(config): + log_msg = "" + parallelism_settings = ["NUM_THREADS", "NUM_INTEROP_THREADS"] + for setting in parallelism_settings: + val = "1" + if setting in config["parameters"]: + val = config["parameters"][setting]["string_value"] + getattr(torch, "set_" + setting.lower())(int(val)) + log_msg += setting + " = " + val + "; " + return log_msg + + +def _get_torch_compile_params(config): + params = {} + if "TORCH_COMPILE_OPTIONAL_PARAMETERS" in config["parameters"]: + val = config["parameters"]["TORCH_COMPILE_OPTIONAL_PARAMETERS"]["string_value"] + params = json.loads(val) + if "model" in params: + raise pb_utils.TritonModelException( + "'model' is not an optional parameter for 'torch.compile'" + ) + return params + + +def _gather_torch_tensors(scatter_tensors): + gather_tensors = [] + sections = [] + for i in range(len(scatter_tensors)): + tensors = scatter_tensors[i] + for j in range(len(tensors)): + tensor = tensors[j] + if j < len(gather_tensors): + # add to existing tensor + gather_tensors[j] = torch.cat((gather_tensors[j], tensor), 0) + else: + # start a new tensor + gather_tensors.append(tensor) + # record section + section_length = tensors[0].size()[0] + sections.append(section_length) + return gather_tensors, sections + + +def _scatter_torch_tensors(gather_tensors, sections): + scatter_tensors = [] + for j in range(len(gather_tensors)): + scatter_tensor = torch.split(gather_tensors[j], sections) + for i in range(len(scatter_tensor)): + tensor = scatter_tensor[i] + if i < len(scatter_tensors): + # add to existing response + scatter_tensors[i].append(tensor) + else: + # start a new response + scatter_tensors.append([tensor]) + return scatter_tensors + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + self._model_name = args["model_name"] + for_model = "for '" + self._model_name + "'" + self._logger = pb_utils.Logger + self._logger.log_info("Initializing model instance " + for_model) + + self._model_config = json.loads(args["model_config"]) + self._kind = args["model_instance_kind"] + self._device_id = args["model_instance_device_id"] + self._support_batching = self._model_config["max_batch_size"] > 0 + self._inputs = _parse_io_config(self._model_config["input"]) + self._outputs = _parse_io_config(self._model_config["output"]) + + setting_msg = _set_torch_parallelism(self._model_config) + self._logger.log_verbose( + "Torch parallelism settings " + for_model + ": " + setting_msg + ) + + self._infer_mode = torch.inference_mode(mode=True) + self._infer_mode.__enter__() + + params = _get_torch_compile_params(self._model_config) + self._logger.log_verbose( + "'torch.compile' optional parameter(s) " + for_model + ": " + str(params) + ) + if self._support_batching: + self._gather = torch.compile(_gather_torch_tensors, **params) + self._scatter = torch.compile(_scatter_torch_tensors, **params) + + model_path = _get_model_path(self._model_config) + if not _is_py_class_model(model_path): + self._logger.log_info("Loading '" + self._model_name + "' as TorchScript") + self._model = torch.jit.load(model_path) + self._device = _get_device(self._kind, self._device_id, self._model) + self._model.to(self._device) + self._model.eval() + return + + self._model_module = _import_module_from_path(self._model_name, model_path) + self._model_class = _get_model_class_from_module(self._model_module) + self._raw_model = self._model_class() + self._device = _get_device(self._kind, self._device_id, self._raw_model) + data_path = _get_model_data_path(model_path) + if data_path != "": + self._raw_model.load_state_dict( + torch.load(data_path, map_location=self._device) + ) + else: + self._logger.log_info("Model parameter file not found " + for_model) + self._raw_model.to(self._device) + self._raw_model.eval() + self._model = torch.compile(self._raw_model, **params) + + def execute(self, requests): + """`execute` MUST be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference request is made + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. 
The length of this list must + be the same as `requests` + """ + + responses = [] + + requests_tensors = [] + for request in requests: + tensors = [] + for io in self._inputs: + tensor = pb_utils.get_input_tensor_by_name( + request, io["name"] + ).to_dlpack() + tensor = torch.from_dlpack(tensor).to(self._device) + tensors.append(tensor) + requests_tensors.append(tensors) + + sections = None + if self._support_batching: + requests_tensors, sections = self._gather(requests_tensors) + requests_tensors = [requests_tensors] + + responses_tensors = [] + for input_tensors in requests_tensors: + output_tensors = self._model(*input_tensors) + if not isinstance(output_tensors, tuple) and not isinstance( + output_tensors, list + ): + output_tensors = [output_tensors] + responses_tensors.append(output_tensors) + + if self._support_batching: + responses_tensors = self._scatter(responses_tensors[0], sections) + + for response_tensors in responses_tensors: + output_tensors = [] + for i in range(len(self._outputs)): + io = self._outputs[i] + tensor = response_tensors[i].detach() + tensor = pb_utils.Tensor.from_dlpack(io["name"], tensor) + output_tensors.append(tensor) + inference_response = pb_utils.InferenceResponse( + output_tensors=output_tensors + ) + responses.append(inference_response) + + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. + """ + self._logger.log_info("Removing model instance for '" + self._model_name + "'") + self._infer_mode.__exit__(exc_type=None, exc_value=None, traceback=None) From 3ac4eb1c0cb1feb5a826ab0de50299efce643f8d Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Mon, 7 Aug 2023 17:57:13 -0400 Subject: [PATCH 136/216] Adding the support tracing of child models invoked from a BLS model (#277) * Adding tracing for bls * Added access to trace from BLS request creation * Added tracing to decoupled * clang format * Adding InferenceTrace object --- src/infer_request.cc | 13 +++++++++++-- src/infer_request.h | 17 ++++++++++++++++- src/pb_stub.cc | 12 +++++++++--- src/python_be.cc | 11 +++++++++-- src/request_executor.cc | 10 ++++++++-- 5 files changed, 53 insertions(+), 10 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index 3ecde9e8..5fdae669 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -44,12 +44,13 @@ InferRequest::InferRequest( const std::string& model_name, const int64_t model_version, const std::string& parameters, const uint32_t flags, const int32_t timeout, const intptr_t response_factory_address, const intptr_t request_address, - const PreferredMemory& preferred_memory) + const PreferredMemory& preferred_memory, const InferenceTrace& trace) : request_id_(request_id), correlation_id_(correlation_id), inputs_(inputs), requested_output_names_(requested_output_names), model_name_(model_name), model_version_(model_version), parameters_(parameters), flags_(flags), timeout_(timeout), response_factory_address_(response_factory_address), - request_address_(request_address), preferred_memory_(preferred_memory) + request_address_(request_address), preferred_memory_(preferred_memory), + trace_(trace) { for (auto& input : inputs) { if (!input) { @@ -166,6 +167,12 @@ InferRequest::GetPreferredMemory() return preferred_memory_; } +InferenceTrace& +InferRequest::Trace() +{ + return trace_; +} + void 
InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) { @@ -191,6 +198,7 @@ InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) infer_request_shm_ptr_->is_decoupled = is_decoupled_; infer_request_shm_ptr_->timeout = timeout_; infer_request_shm_ptr_->preferred_memory = preferred_memory_; + infer_request_shm_ptr_->trace = trace_; output_names_handle_shm_ptr_ = reinterpret_cast( @@ -368,6 +376,7 @@ InferRequest::InferRequest( is_decoupled_ = infer_request_shm_ptr_->is_decoupled; timeout_ = infer_request_shm_ptr_->timeout; preferred_memory_ = infer_request_shm_ptr_->preferred_memory; + trace_ = infer_request_shm_ptr_->trace; #ifdef TRITON_PB_STUB response_sender_ = std::make_shared( diff --git a/src/infer_request.h b/src/infer_request.h index 7eb2fd88..7ef3a363 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -41,6 +41,17 @@ namespace triton { namespace backend { namespace python { class Stub; +// +// Inference Trace +// +struct InferenceTrace { +#ifndef TRITON_PB_STUB + TRITONSERVER_InferenceTrace* triton_trace_; +#else + void* triton_trace_; +#endif +}; + // // Inference Request // @@ -55,6 +66,7 @@ struct InferRequestShm { bool is_decoupled; int32_t timeout; PreferredMemory preferred_memory; + InferenceTrace trace; }; class InferRequest { @@ -68,7 +80,8 @@ class InferRequest { const int32_t timeout = 0, const intptr_t response_factory_address = 0, const intptr_t request_address = 0, const PreferredMemory& preferred_memory = - PreferredMemory(PreferredMemory::DEFAULT, 0)); + PreferredMemory(PreferredMemory::DEFAULT, 0), + const InferenceTrace& trace = {.triton_trace_ = nullptr}); const std::vector>& Inputs(); const std::string& RequestId(); @@ -84,6 +97,7 @@ class InferRequest { bool IsDecoupled(); void SetIsDecoupled(const bool is_decoupled); PreferredMemory& GetPreferredMemory(); + InferenceTrace& Trace(); #ifdef TRITON_PB_STUB std::shared_ptr Exec(const bool is_decoupled); @@ -139,6 +153,7 @@ class InferRequest { intptr_t request_address_; bool is_decoupled_; PreferredMemory preferred_memory_; + InferenceTrace trace_; // Shared Memory Data Structures AllocatedSharedMemory infer_request_shm_; diff --git a/src/pb_stub.cc b/src/pb_stub.cc index eb561dec..b7df94c6 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -1362,6 +1362,9 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) .value("TRITONSERVER_MEMORY_CPU", PreferredMemory::MemoryType::CPU) .export_values(); + py::class_>( + module, "InferenceTrace"); + py::class_>( module, "InferenceRequest") .def( @@ -1371,7 +1374,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) const std::string& model_name, const int64_t model_version, const uint32_t flags, const int32_t timeout, - const PreferredMemory& preferred_memory) { + const PreferredMemory& preferred_memory, + const InferenceTrace& trace) { std::set requested_outputs; for (auto& requested_output_name : requested_output_names) { requested_outputs.emplace(requested_output_name); @@ -1381,7 +1385,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) request_id, correlation_id, inputs, requested_outputs, model_name, model_version, "" /*parameters*/, flags, timeout, 0 /*response_factory_address*/, 0 /*request_address*/, - preferred_memory); + preferred_memory, trace); }), py::arg("request_id").none(false) = "", py::arg("correlation_id").none(false) = 0, @@ -1391,7 +1395,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::arg("model_version").none(false) = -1, py::arg("flags").none(false) = 0, py::arg("timeout").none(false) = 
0, py::arg("preferred_memory").none(false) = - PreferredMemory(PreferredMemory::DEFAULT, 0)) + PreferredMemory(PreferredMemory::DEFAULT, 0), + py::arg("trace").none(false) = nullptr) .def( "inputs", &InferRequest::Inputs, py::return_value_policy::reference_internal) @@ -1401,6 +1406,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) .def("set_flags", &InferRequest::SetFlags) .def("timeout", &InferRequest::Timeout) .def("parameters", &InferRequest::Parameters) + .def("trace", &InferRequest::Trace) .def( "exec", [](std::shared_ptr& infer_request, diff --git a/src/python_be.cc b/src/python_be.cc index 458e651a..a9cbbbd0 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -364,6 +364,11 @@ ModelInstanceState::SaveRequestsToSharedMemory( uint32_t flags; RETURN_IF_ERROR(TRITONBACKEND_RequestFlags(request, &flags)); + TRITONSERVER_InferenceTrace* triton_trace; + RETURN_IF_ERROR(TRITONBACKEND_RequestTrace(request, &triton_trace)); + + InferenceTrace trace = {triton_trace}; + std::unique_ptr infer_request; if (model_state->IsDecoupled()) { TRITONBACKEND_ResponseFactory* factory_ptr; @@ -372,13 +377,15 @@ ModelInstanceState::SaveRequestsToSharedMemory( id, correlation_id, pb_input_tensors, requested_output_names, model_state->Name(), model_state->Version(), parameters_string, flags, 0 /* BLS request timeout*/, reinterpret_cast(factory_ptr), - reinterpret_cast(request)); + reinterpret_cast(request), + PreferredMemory(PreferredMemory::DEFAULT, 0), trace); } else { infer_request = std::make_unique( id, correlation_id, pb_input_tensors, requested_output_names, model_state->Name(), model_state->Version(), parameters_string, flags, 0 /* BLS request timeout*/, 0 /* response_factory_address */, - reinterpret_cast(request)); + reinterpret_cast(request), + PreferredMemory(PreferredMemory::DEFAULT, 0), trace); } RETURN_IF_EXCEPTION(infer_request->SaveToSharedMemory(Stub()->ShmPool())); diff --git a/src/request_executor.cc b/src/request_executor.cc index 2590ee37..b54e3988 100644 --- a/src/request_executor.cc +++ b/src/request_executor.cc @@ -359,6 +359,12 @@ RequestExecutor::Infer( THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetReleaseCallback( irequest, InferRequestComplete, nullptr /* request_release_userp */)); + TRITONSERVER_InferenceTrace* trace = nullptr; + if (infer_request->Trace().triton_trace_ != nullptr) { + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceTraceSpawnChildTrace( + infer_request->Trace().triton_trace_, &trace)); + } + for (auto& infer_input : infer_request->Inputs()) { THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestAddInput( irequest, infer_input->Name().c_str(), @@ -388,8 +394,8 @@ RequestExecutor::Infer( reinterpret_cast(infer_payload->ResponseAllocUserp().get()), InferResponseComplete, reinterpret_cast(infer_payload.get()))); - THROW_IF_TRITON_ERROR(TRITONSERVER_ServerInferAsync( - server_, irequest, nullptr /* trace */)); + THROW_IF_TRITON_ERROR( + TRITONSERVER_ServerInferAsync(server_, irequest, trace)); } } catch (const PythonBackendException& pb_exception) { From 0476ee4f81d75d3de8c515cda1a1606a03fcb939 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Tue, 8 Aug 2023 15:43:54 -0700 Subject: [PATCH 137/216] Use constructor to define InferenceTrace default value (#286) --- src/infer_request.h | 7 ++++++- src/pb_stub.cc | 2 +- src/python_be.cc | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/infer_request.h b/src/infer_request.h index 7ef3a363..6652b2fb 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -47,9 +47,14 @@ class 
Stub; struct InferenceTrace { #ifndef TRITON_PB_STUB TRITONSERVER_InferenceTrace* triton_trace_; + InferenceTrace(TRITONSERVER_InferenceTrace* triton_trace) + : triton_trace_(triton_trace) + { + } #else void* triton_trace_; #endif + InferenceTrace() : triton_trace_(nullptr) {} }; // @@ -81,7 +86,7 @@ class InferRequest { const intptr_t request_address = 0, const PreferredMemory& preferred_memory = PreferredMemory(PreferredMemory::DEFAULT, 0), - const InferenceTrace& trace = {.triton_trace_ = nullptr}); + const InferenceTrace& trace = InferenceTrace()); const std::vector>& Inputs(); const std::string& RequestId(); diff --git a/src/pb_stub.cc b/src/pb_stub.cc index b7df94c6..c5c6b42e 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -1396,7 +1396,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::arg("flags").none(false) = 0, py::arg("timeout").none(false) = 0, py::arg("preferred_memory").none(false) = PreferredMemory(PreferredMemory::DEFAULT, 0), - py::arg("trace").none(false) = nullptr) + py::arg("trace").none(false) = InferenceTrace()) .def( "inputs", &InferRequest::Inputs, py::return_value_policy::reference_internal) diff --git a/src/python_be.cc b/src/python_be.cc index a9cbbbd0..df2a3235 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -367,7 +367,7 @@ ModelInstanceState::SaveRequestsToSharedMemory( TRITONSERVER_InferenceTrace* triton_trace; RETURN_IF_ERROR(TRITONBACKEND_RequestTrace(request, &triton_trace)); - InferenceTrace trace = {triton_trace}; + InferenceTrace trace = InferenceTrace(triton_trace); std::unique_ptr infer_request; if (model_state->IsDecoupled()) { From 74722ba6584a7427f176e51db84f5ef6019aeccc Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Fri, 11 Aug 2023 15:02:26 -0400 Subject: [PATCH 138/216] Add custom parameters documentation (#288) --- README.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/README.md b/README.md index 147e1a34..8fb40e68 100644 --- a/README.md +++ b/README.md @@ -1609,6 +1609,31 @@ how to adjust them dynamically, please see Triton's [logging extension](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_logging.md) documentation. +# Adding Custom Parameters in the Model Configuration + +If your model requires custom parameters in the configuration, you can specify +that in the `parameters` section of the model config. 
For example: + +``` +parameters { + key: "custom_key" + value: { + string_value: "custom_value" + } +} +``` + +Now you can access this parameter in the `args` argument of the `initialize` +function: + +```python +def initialize(self, args): + print(json.loads(args['model_config'])['parameters']) + # Should print {'custom_key': {'string_value': 'custom_value'}} +``` + + + # Reporting problems, asking questions We appreciate any feedback, questions or bug reporting regarding this From 02c9c1cde212a254b681159f660ac6080d67680b Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Tue, 15 Aug 2023 12:10:17 -0700 Subject: [PATCH 139/216] Add PyTorch platform handler example (#287) * Add PyTorch platform handler example * Refactor docs structure * Add more comments and minor refactoring * Further break down client.py * Remove exit 0 if terminated normally * Simplify comments * Improve comment * List mug.jpg paths * Docs update * Describe the source of mug.jpg --- README.md | 113 +- examples/pytorch_platform_handler/README.md | 109 ++ examples/pytorch_platform_handler/client.py | 92 ++ .../pytorch_platform_handler/config.pbtxt | 45 + examples/pytorch_platform_handler/model.py | 46 + .../resnet50_labels.txt | 1000 +++++++++++++++++ .../platform_handlers/pytorch/README.md | 132 --- 7 files changed, 1402 insertions(+), 135 deletions(-) create mode 100644 examples/pytorch_platform_handler/README.md create mode 100755 examples/pytorch_platform_handler/client.py create mode 100644 examples/pytorch_platform_handler/config.pbtxt create mode 100755 examples/pytorch_platform_handler/model.py create mode 100644 examples/pytorch_platform_handler/resnet50_labels.txt delete mode 100644 src/resources/platform_handlers/pytorch/README.md diff --git a/README.md b/README.md index 8fb40e68..49d4229b 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,7 @@ any C++ code. - [Input Tensor Device Placement](#input-tensor-device-placement) - [Frameworks](#frameworks) - [PyTorch](#pytorch) + - [PyTorch Platform \[Experimental\]](#pytorch-platform-experimental) - [PyTorch Determinism](#pytorch-determinism) - [TensorFlow](#tensorflow) - [TensorFlow Determinism](#tensorflow-determinism) @@ -1397,9 +1398,115 @@ this workflow. For a simple example of using PyTorch in a Python Backend model, see the [AddSubNet PyTorch example](#addsubnet-in-pytorch). -PyTorch models may be served directly without implementing the `model.py`, see -[Serving PyTorch models using Python Backend \[Experimental\]](src/resources/platform_handlers/pytorch/README.md) -for more details. +### PyTorch Platform \[Experimental\] + +**NOTE**: *This feature is subject to change and removal, and should not +be used in production.* + +Starting from 23.08, we are adding an experimental support for loading and +serving PyTorch models directly via Python backend. The model can be provided +within the triton server model repository, and a +[pre-built Python model](src/resources/platform_handlers/pytorch/model.py) will +be used to load and serve the PyTorch model. + +#### Model Layout + +The model repository should look like: + +``` +model_repository/ +`-- model_directory + |-- 1 + | |-- model.py + | `-- model.pt + `-- config.pbtxt +``` + +The `model.py` contains the class definition of the PyTorch model. The class +should extend the +[`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). 
+The `model.pt` may be optionally provided which contains the saved +[`state_dict`](https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-for-inference) +of the model. For serving TorchScript models, a `model.pt` TorchScript can be +provided in place of the `model.py` file. + +By default, Triton will use the +[PyTorch backend](https://github.com/triton-inference-server/pytorch_backend) to +load and serve TorchScript models. In order to serve from Python backend, +[model configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md) +should explicitly provide the following settings: + +``` +backend: "python" +platform: "pytorch" +``` + +#### PyTorch Installation + +This feature will take advantage of the +[`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html#torch-compile) +optimization, make sure the +[PyTorch 2.0+ pip package](https://pypi.org/project/torch/2.0.1/) is available +in the same Python environment. + +``` +pip install torch==2.0.1 +``` +Alternatively, a +[Python Execution Environment](#using-custom-python-execution-environments) +with the PyTorch dependency may be used. + +#### Customization + +The following PyTorch settings may be customized by setting parameters on the +`config.pbtxt`. + +[`torch.set_num_threads(int)`](https://pytorch.org/docs/stable/generated/torch.set_num_threads.html#torch.set_num_threads) +- Key: NUM_THREADS +- Value: The number of threads used for intraop parallelism on CPU. + +[`torch.set_num_interop_threads(int)`](https://pytorch.org/docs/stable/generated/torch.set_num_interop_threads.html#torch.set_num_interop_threads) +- Key: NUM_INTEROP_THREADS +- Value: The number of threads used for interop parallelism (e.g. in JIT +interpreter) on CPU. + +[`torch.compile()` parameters](https://pytorch.org/docs/stable/generated/torch.compile.html#torch-compile) +- Key: TORCH_COMPILE_OPTIONAL_PARAMETERS +- Value: Any of following parameter(s) encoded as a JSON object. + - fullgraph (*bool*): Whether it is ok to break model into several subgraphs. + - dynamic (*bool*): Use dynamic shape tracing. + - backend (*str*): The backend to be used. + - mode (*str*): Can be either "default", "reduce-overhead" or "max-autotune". + - options (*dict*): A dictionary of options to pass to the backend. + - disable (*bool*): Turn `torch.compile()` into a no-op for testing. + +For example: +``` +parameters: { + key: "NUM_THREADS" + value: { string_value: "4" } +} +parameters: { + key: "TORCH_COMPILE_OPTIONAL_PARAMETERS" + value: { string_value: "{\"disable\": true}" } +} +``` + +#### Example + +You can find the complete example instructions in +[examples/pytorch_platform_handler](examples/pytorch_platform_handler/README.md). + +#### Limitations + +Following are few known limitations of this feature: +- Python functions optimizable by `torch.compile` may not be served directly in +the `model.py` file, they need to be enclosed by a class extending the +[`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). +- Model weights cannot be shared across multiple instances on the same GPU +device. +- When using `KIND_MODEL` as model instance kind, the default device of the +first parameter on the model is used. 
### PyTorch Determinism diff --git a/examples/pytorch_platform_handler/README.md b/examples/pytorch_platform_handler/README.md new file mode 100644 index 00000000..13e32249 --- /dev/null +++ b/examples/pytorch_platform_handler/README.md @@ -0,0 +1,109 @@ + + +# PyTorch Example + +In this section, we demonstrate an end-to-end example for using the +[PyTorch Platform \[Experimental\]](../../README.md#pytorch-platform-experimental) +to serve a PyTorch model directly, **without** needing to implement the +`TritonPythonModel` class. + +## Create a ResNet50 model repository + +We will use the files that come with this example to create the model +repository. + +First, download [client.py](client.py), [config.pbtxt](config.pbtxt), +[model.py](model.py), +[mug.jpg](https://raw.githubusercontent.com/triton-inference-server/server/main/qa/images/mug.jpg) +and [resnet50_labels.txt](resnet50_labels.txt) to your local machine. + +Next, at the directory where the downloaded files are saved at, create a model +repository with the following commands: +``` +$ mkdir -p models/resnet50_pytorch/1 +$ mv model.py models/resnet50_pytorch/1 +$ mv config.pbtxt models/resnet50_pytorch +``` + +## Pull the Triton Docker images + +We need to install Docker and NVIDIA Container Toolkit before proceeding, refer +to the +[installation steps](https://github.com/triton-inference-server/server/tree/main/docs#installation). + +To pull the latest containers, run the following commands: +``` +$ docker pull nvcr.io/nvidia/tritonserver:-py3 +$ docker pull nvcr.io/nvidia/tritonserver:-py3-sdk +``` +See the installation steps above for the `` version. + +For example, if the version is `23.08`, then: +``` +$ docker pull nvcr.io/nvidia/tritonserver:23.08-py3 +$ docker pull nvcr.io/nvidia/tritonserver:23.08-py3-sdk +``` + +Be sure to replace the `` with the version pulled for all the remaining +parts of this example. + +## Start the Triton Server + +At the directory where we created the PyTorch model (at where the "models" +folder is located), run the following command: +``` +$ docker run -it --rm --gpus all --shm-size 1g -p 8000:8000 -v `pwd`:/pytorch_example nvcr.io/nvidia/tritonserver:-py3 /bin/bash +``` + +Inside the container, we need to install PyTorch, Pillow and Requests to run this example. +We recommend using `pip` method for the installations, for example: +``` +$ pip3 install torch Pillow requests +``` + +Finally, we need to start the Triton Server, run the following command: +``` +$ tritonserver --model-repository=/pytorch_example/models +``` + +To leave the container for the next step, press: `CTRL + P + Q`. + +## Test inference + +At the directory where the client.py is located, run the following command: +``` +$ docker run --rm --net=host -v `pwd`:/pytorch_example nvcr.io/nvidia/tritonserver:-py3-sdk python3 /pytorch_example/client.py +``` + +A successful inference will print the following at the end: +``` +Result: COFFEE MUG +Expected result: COFFEE MUG +PASS: PyTorch platform handler +``` diff --git a/examples/pytorch_platform_handler/client.py b/examples/pytorch_platform_handler/client.py new file mode 100755 index 00000000..ccd4624d --- /dev/null +++ b/examples/pytorch_platform_handler/client.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import sys + +import numpy as np +from PIL import Image +from tritonclient import http as httpclient +from tritonclient.utils import * + +script_directory = os.path.dirname(os.path.realpath(__file__)) + +server_url = "localhost:8000" +model_name = "resnet50_pytorch" +input_name = "INPUT" +output_name = "OUTPUT" +label_path = os.path.join(script_directory, "resnet50_labels.txt") +# The 'mug.jpg' image will be present at the script_directory if the steps on +# the provided README.md are followed. The image may also be found at +# '/workspace/images/mug.jpg' on the SDK container or +# '/opt/tritonserver/qa/images/mug.jpg' on the QA container. 
+image_path = os.path.join(script_directory, "mug.jpg") +expected_output_class = "COFFEE MUG" + + +def _load_input_image(): + raw_image = Image.open(image_path) + raw_image = raw_image.convert("RGB").resize((224, 224), Image.BILINEAR) + input_image = np.array(raw_image).astype(np.float32) + input_image = (input_image / 127.5) - 1 + input_image = np.transpose(input_image, (2, 0, 1)) + input_image = np.reshape(input_image, (1, 3, 224, 224)) + return input_image + + +def _infer(input_image): + with httpclient.InferenceServerClient(server_url) as client: + input_tensors = httpclient.InferInput(input_name, input_image.shape, "FP32") + input_tensors.set_data_from_numpy(input_image) + results = client.infer(model_name=model_name, inputs=[input_tensors]) + output_tensors = results.as_numpy(output_name) + return output_tensors + + +def _check_output(output_tensors): + with open(label_path) as f: + labels_dict = {idx: line.strip() for idx, line in enumerate(f)} + max_id = np.argmax(output_tensors, axis=1)[0] + output_class = labels_dict[max_id] + print("Result: " + output_class) + print("Expected result: " + expected_output_class) + if output_class != expected_output_class: + return False + return True + + +if __name__ == "__main__": + input_image = _load_input_image() + output_tensors = _infer(input_image) + result_valid = _check_output(output_tensors) + + if not result_valid: + print("PyTorch platform handler example error: Unexpected result") + sys.exit(1) + + print("PASS: PyTorch platform handler") diff --git a/examples/pytorch_platform_handler/config.pbtxt b/examples/pytorch_platform_handler/config.pbtxt new file mode 100644 index 00000000..70d99dad --- /dev/null +++ b/examples/pytorch_platform_handler/config.pbtxt @@ -0,0 +1,45 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "resnet50_pytorch" +backend: "python" +platform: "pytorch" + +max_batch_size: 128 + +input { + name: "INPUT" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [ 3, 224, 224 ] +} +output { + name: "OUTPUT" + data_type: TYPE_FP32 + dims: [ 1000 ] +} + +instance_group [{ kind: KIND_CPU }] diff --git a/examples/pytorch_platform_handler/model.py b/examples/pytorch_platform_handler/model.py new file mode 100755 index 00000000..7fe59597 --- /dev/null +++ b/examples/pytorch_platform_handler/model.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import torch + + +class ResNet50(torch.nn.Module): + # This is a native PyTorch model class. `TritonPythonModel` is not needed. 
+ + def __init__(self): + super().__init__() + self._model = torch.hub.load( + "pytorch/vision", + "resnet50", + weights="ResNet50_Weights.IMAGENET1K_V2", + skip_validation=True, + ) + + def forward(self, input_tensor): + output_tensor = self._model(input_tensor) + return output_tensor diff --git a/examples/pytorch_platform_handler/resnet50_labels.txt b/examples/pytorch_platform_handler/resnet50_labels.txt new file mode 100644 index 00000000..2376a285 --- /dev/null +++ b/examples/pytorch_platform_handler/resnet50_labels.txt @@ -0,0 +1,1000 @@ +TENCH +GOLDFISH +WHITE SHARK +TIGER SHARK +HAMMERHEAD SHARK +ELECTRIC RAY +STINGRAY +ROOSTER +HEN +OSTRICH +BRAMBLING +GOLDFINCH +HOUSE FINCH +SNOWBIRD +INDIGO FINCH +ROBIN +BULBUL +JAY +MAGPIE +CHICKADEE +WATER OUZEL +KITE +BALD EAGLE +VULTURE +GREAT GREY OWL +FIRE SALAMANDER +NEWT +EFT +SPOTTED SALAMANDER +AXOLOTL +BULL FROG +TREE FROG +TAILED FROG +LOGGERHEAD +LEATHERBACK TURTLE +MUD TURTLE +TERRAPIN +BOX TURTLE +BANDED GECKO +COMMON IGUANA +AMERICAN CHAMELEON +WHIPTAIL +AGAMA +FRILLED LIZARD +ALLIGATOR LIZARD +GILA MONSTER +GREEN LIZARD +AFRICAN CHAMELEON +KOMODO DRAGON +AFRICAN CROCODILE +AMERICAN ALLIGATOR +TRICERATOPS +THUNDER SNAKE +RINGNECK SNAKE +HOGNOSE SNAKE +GREEN SNAKE +KING SNAKE +GARTER SNAKE +WATER SNAKE +VINE SNAKE +NIGHT SNAKE +BOA +ROCK PYTHON +COBRA +GREEN MAMBA +SEA SNAKE +HORNED VIPER +DIAMONDBACK +SIDEWINDER +TRILOBITE +HARVESTMAN +SCORPION +GARDEN SPIDER +BARN SPIDER +GARDEN SPIDER +BLACK WIDOW +TARANTULA +WOLF SPIDER +TICK +CENTIPEDE +GROUSE +PTARMIGAN +RUFFED GROUSE +PRAIRIE CHICKEN +PEACOCK +QUAIL +PARTRIDGE +AFRICAN GREY +MACAW +COCKATOO +LORIKEET +COUCAL +BEE EATER +HORNBILL +HUMMINGBIRD +JACAMAR +TOUCAN +DRAKE +MERGANSER +GOOSE +BLACK SWAN +TUSKER +ECHIDNA +PLATYPUS +WALLABY +KOALA +WOMBAT +JELLYFISH +SEA ANEMONE +BRAIN CORAL +FLATWORM +NEMATODE +CONCH +SNAIL +SLUG +SEA SLUG +CHITON +CHAMBERED NAUTILUS +DUNGENESS CRAB +ROCK CRAB +FIDDLER CRAB +KING CRAB +AMERICAN LOBSTER +SPINY LOBSTER +CRAYFISH +HERMIT CRAB +ISOPOD +WHITE STORK +BLACK STORK +SPOONBILL +FLAMINGO +LITTLE BLUE HERON +AMERICAN EGRET +BITTERN +CRANE +LIMPKIN +EUROPEAN GALLINULE +AMERICAN COOT +BUSTARD +RUDDY TURNSTONE +RED-BACKED SANDPIPER +REDSHANK +DOWITCHER +OYSTERCATCHER +PELICAN +KING PENGUIN +ALBATROSS +GREY WHALE +KILLER WHALE +DUGONG +SEA LION +CHIHUAHUA +JAPANESE SPANIEL +MALTESE DOG +PEKINESE +SHIH-TZU +BLENHEIM SPANIEL +PAPILLON +TOY TERRIER +RHODESIAN RIDGEBACK +AFGHAN HOUND +BASSET +BEAGLE +BLOODHOUND +BLUETICK +COONHOUND +WALKER HOUND +ENGLISH FOXHOUND +REDBONE +BORZOI +IRISH WOLFHOUND +ITALIAN GREYHOUND +WHIPPET +IBIZAN HOUND +NORWEGIAN ELKHOUND +OTTERHOUND +SALUKI +SCOTTISH DEERHOUND +WEIMARANER +STAFFORDSHIRE BULLTERRIER +STAFFORDSHIRE TERRIER +BEDLINGTON TERRIER +BORDER TERRIER +KERRY BLUE TERRIER +IRISH TERRIER +NORFOLK TERRIER +NORWICH TERRIER +YORKSHIRE TERRIER +WIRE-HAIRED FOX TERRIER +LAKELAND TERRIER +SEALYHAM TERRIER +AIREDALE +CAIRN +AUSTRALIAN TERRIER +DANDIE DINMONT +BOSTON BULL +MINIATURE SCHNAUZER +GIANT SCHNAUZER +STANDARD SCHNAUZER +SCOTCH TERRIER +TIBETAN TERRIER +SILKY TERRIER +WHEATEN TERRIER +WHITE TERRIER +LHASA +RETRIEVER +CURLY-COATED RETRIEVER +GOLDEN RETRIEVER +LABRADOR RETRIEVER +CHESAPEAKE BAY RETRIEVER +SHORT-HAIRED POINTER +VISLA +ENGLISH SETTER +IRISH SETTER +GORDON SETTER +BRITTANY SPANIEL +CLUMBER +ENGLISH SPRINGER +WELSH SPRINGER SPANIEL +COCKER SPANIEL +SUSSEX SPANIEL +IRISH WATERSPANIEL +KUVASZ +SCHIPPERKE +GROENENDAEL +MALINOIS +BRIARD +KELPIE +KOMONDOR +OLD ENGLISH SHEEPDOG +SHETLAND SHEEPDOG +COLLIE 
+BORDER COLLIE +BOUVIER DES FLANDRES +ROTTWEILER +GERMAN SHEPHERD +DOBERMAN +MINIATURE PINSCHER +GREATER SWISS MOUNTAIN DOG +BERNESE MOUNTAIN DOG +APPENZELLER +ENTLEBUCHER +BOXER +BULL MASTIFF +TIBETAN MASTIFF +FRENCH BULLDOG +GREAT DANE +SAINT BERNARD +ESKIMO DOG +MALAMUTE +SIBERIAN HUSKY +DALMATIAN +AFFENPINSCHER +BASENJI +PUG +LEONBERG +NEWFOUNDLAND +GREAT PYRENEES +SAMOYED +POMERANIAN +CHOW +KEESHOND +BRABANCON GRIFFON +PEMBROKE +CARDIGAN +TOY POODLE +MINIATURE POODLE +STANDARD POODLE +MEXICAN HAIRLESS +TIMBER WOLF +WHITE WOLF +RED WOLF +COYOTE +DINGO +DHOLE +AFRICAN HUNTING DOG +HYENA +RED FOX +KIT FOX +ARCTIC FOX +GREY FOX +TABBY +TIGER CAT +PERSIAN CAT +SIAMESE CAT +EGYPTIAN CAT +COUGAR +LYNX +LEOPARD +SNOW LEOPARD +JAGUAR +LION +TIGER +CHEETAH +BROWN BEAR +AMERICAN BLACK BEAR +ICE BEAR +SLOTH BEAR +MONGOOSE +MEERKAT +TIGER BEETLE +LADYBUG +GROUND BEETLE +LONG-HORNED BEETLE +LEAF BEETLE +DUNG BEETLE +RHINOCEROS BEETLE +WEEVIL +FLY +BEE +ANT +GRASSHOPPER +CRICKET +WALKING STICK +COCKROACH +MANTIS +CICADA +LEAFHOPPER +LACEWING +DRAGONFLY +DAMSELFLY +ADMIRAL +RINGLET +MONARCH +CABBAGE BUTTERFLY +SULPHUR BUTTERFLY +LYCAENID +STARFISH +SEA URCHIN +SEA CUCUMBER +WOOD RABBIT +HARE +ANGORA +HAMSTER +PORCUPINE +FOX SQUIRREL +MARMOT +BEAVER +GUINEA PIG +SORREL +ZEBRA +HOG +WILD BOAR +WARTHOG +HIPPOPOTAMUS +OX +WATER BUFFALO +BISON +RAM +BIGHORN +IBEX +HARTEBEEST +IMPALA +GAZELLE +ARABIAN CAMEL +LLAMA +WEASEL +MINK +POLECAT +BLACK-FOOTED FERRET +OTTER +SKUNK +BADGER +ARMADILLO +THREE-TOED SLOTH +ORANGUTAN +GORILLA +CHIMPANZEE +GIBBON +SIAMANG +GUENON +PATAS +BABOON +MACAQUE +LANGUR +COLOBUS +PROBOSCIS MONKEY +MARMOSET +CAPUCHIN +HOWLER MONKEY +TITI +SPIDER MONKEY +SQUIRREL MONKEY +MADAGASCAR CAT +INDRI +INDIAN ELEPHANT +AFRICAN ELEPHANT +LESSER PANDA +GIANT PANDA +BARRACOUTA +EEL +COHO +ROCK BEAUTY +ANEMONE FISH +STURGEON +GAR +LIONFISH +PUFFER +ABACUS +ABAYA +ACADEMIC GOWN +ACCORDION +ACOUSTIC GUITAR +AIRCRAFT CARRIER +AIRLINER +AIRSHIP +ALTAR +AMBULANCE +AMPHIBIAN +ANALOG CLOCK +APIARY +APRON +ASHCAN +ASSAULT RIFLE +BACKPACK +BAKERY +BALANCE BEAM +BALLOON +BALLPOINT +BAND AID +BANJO +BANNISTER +BARBELL +BARBER CHAIR +BARBERSHOP +BARN +BAROMETER +BARREL +BARROW +BASEBALL +BASKETBALL +BASSINET +BASSOON +BATHING CAP +BATH TOWEL +BATHTUB +BEACH WAGON +BEACON +BEAKER +BEARSKIN +BEER BOTTLE +BEER GLASS +BELL COTE +BIB +BICYCLE-BUILT-FOR-TWO +BIKINI +BINDER +BINOCULARS +BIRDHOUSE +BOATHOUSE +BOBSLED +BOLO TIE +BONNET +BOOKCASE +BOOKSHOP +BOTTLECAP +BOW +BOW TIE +BRASS +BRASSIERE +BREAKWATER +BREASTPLATE +BROOM +BUCKET +BUCKLE +BULLETPROOF VEST +BULLET TRAIN +BUTCHER SHOP +CAB +CALDRON +CANDLE +CANNON +CANOE +CAN OPENER +CARDIGAN +CAR MIRROR +CAROUSEL +CARPENTERS KIT +CARTON +CAR WHEEL +CASH MACHINE +CASSETTE +CASSETTE PLAYER +CASTLE +CATAMARAN +CD PLAYER +CELLO +CELLULAR TELEPHONE +CHAIN +CHAINLINK FENCE +CHAIN MAIL +CHAIN SAW +CHEST +CHIFFONIER +CHIME +CHINA CABINET +CHRISTMAS STOCKING +CHURCH +CINEMA +CLEAVER +CLIFF DWELLING +CLOAK +CLOG +COCKTAIL SHAKER +COFFEE MUG +COFFEEPOT +COIL +COMBINATION LOCK +COMPUTER KEYBOARD +CONFECTIONERY +CONTAINER SHIP +CONVERTIBLE +CORKSCREW +CORNET +COWBOY BOOT +COWBOY HAT +CRADLE +CRANE +CRASH HELMET +CREATE +CRIB +CROCK POT +CROQUET BALL +CRUTCH +CUIRASS +DAM +DESK +DESKTOP COMPUTER +DIAL TELEPHONE +DIAPER +DIGITAL CLOCK +DIGITAL WATCH +DINING TABLE +DISHRAG +DISHWASHER +DISK BRAKE +DOCK +DOGSLED +DOME +DOORMAT +DRILLING PLATFORM +DRUM +DRUMSTICK +DUMBBELL +DUTCH OVEN +ELECTRIC FAN +ELECTRIC GUITAR +ELECTRIC LOCOMOTIVE +ENTERTAINMENT CENTER +ENVELOPE +ESPRESSO 
MAKER +FACE POWDER +FEATHER BOA +FILE +FIREBOAT +FIRE ENGINE +FIRE SCREEN +FLAGPOLE +FLUTE +FOLDING CHAIR +FOOTBALL HELMET +FORKLIFT +FOUNTAIN +FOUNTAIN PEN +FOUR-POSTER +FREIGHT CAR +FRENCH HORN +FRYING PAN +FUR COAT +GARBAGE TRUCK +GASMASK +GAS PUMP +GOBLET +GO-KART +GOLF BALL +GOLFCART +GONDOLA +GONG +GOWN +GRAND PIANO +GREENHOUSE +GRILLE +GROCERY STORE +GUILLOTINE +HAIR SLIDE +HAIR SPRAY +HALF TRACK +HAMMER +HAMPER +HAND BLOWER +HAND-HELD COMPUTER +HANDKERCHIEF +HARD DISC +HARMONICA +HARP +HARVESTER +HATCHET +HOLSTER +HOME THEATER +HONEYCOMB +HOOK +HOOPSKIRT +HORIZONTAL BAR +HORSE CART +HOURGLASS +IPOD +IRON +JACK-O-LANTERN +JEAN +JEEP +JERSEY +JIGSAW PUZZLE +JINRIKISHA +JOYSTICK +KIMONO +KNEE PAD +KNOT +LAB COAT +LADLE +LAMPSHADE +LAPTOP +LAWN MOWER +LENS CAP +LETTER OPENER +LIBRARY +LIFEBOAT +LIGHTER +LIMOUSINE +LINER +LIPSTICK +LOAFER +LOTION +LOUDSPEAKER +LOUPE +LUMBERMILL +MAGNETIC COMPASS +MAILBAG +MAILBOX +MAILLOT +MAILLOT +MANHOLE COVER +MARACA +MARIMBA +MASK +MATCHSTICK +MAYPOLE +MAZE +MEASURING CUP +MEDICINE CHEST +MEGALITH +MICROPHONE +MICROWAVE +MILITARY UNIFORM +MILK CAN +MINIBUS +MINISKIRT +MINIVAN +MISSILE +MITTEN +MIXING BOWL +MOBILE HOME +MODEL T +MODEM +MONASTERY +MONITOR +MOPED +MORTAR +MORTARBOARD +MOSQUE +MOSQUITO NET +MOTOR SCOOTER +MOUNTAIN BIKE +MOUNTAIN TENT +MOUSE +MOUSETRAP +MOVING VAN +MUZZLE +NAIL +NECK BRACE +NECKLACE +NIPPLE +NOTEBOOK +OBELISK +OBOE +OCARINA +ODOMETER +OIL FILTER +ORGAN +OSCILLOSCOPE +OVERSKIRT +OXCART +OXYGEN MASK +PACKET +PADDLE +PADDLEWHEEL +PADLOCK +PAINTBRUSH +PAJAMA +PALACE +PANPIPE +PAPER TOWEL +PARACHUTE +PARALLEL BARS +PARK BENCH +PARKING METER +PASSENGER CAR +PATIO +PAY-PHONE +PEDESTAL +PENCIL BOX +PENCIL SHARPENER +PERFUME +PETRI DISH +PHOTOCOPIER +PICK +PICKELHAUBE +PICKET FENCE +PICKUP +PIER +PIGGY BANK +PILL BOTTLE +PILLOW +PING-PONG BALL +PINWHEEL +PIRATE +PITCHER +PLANE +PLANETARIUM +PLASTIC BAG +PLATE RACK +PLOW +PLUNGER +POLAROID CAMERA +POLE +POLICE VAN +PONCHO +POOL TABLE +POP BOTTLE +POT +POTTERS WHEEL +POWER DRILL +PRAYER RUG +PRINTER +PRISON +PROJECTILE +PROJECTOR +PUCK +PUNCHING BAG +PURSE +QUILL +QUILT +RACER +RACKET +RADIATOR +RADIO +RADIO TELESCOPE +RAIN BARREL +RECREATIONAL VEHICLE +REEL +REFLEX CAMERA +REFRIGERATOR +REMOTE CONTROL +RESTAURANT +REVOLVER +RIFLE +ROCKING CHAIR +ROTISSERIE +RUBBER ERASER +RUGBY BALL +RULE +RUNNING SHOE +SAFE +SAFETY PIN +SALTSHAKER +SANDAL +SARONG +SAX +SCABBARD +SCALE +SCHOOL BUS +SCHOONER +SCOREBOARD +SCREEN +SCREW +SCREWDRIVER +SEAT BELT +SEWING MACHINE +SHIELD +SHOE SHOP +SHOJI +SHOPPING BASKET +SHOPPING CART +SHOVEL +SHOWER CAP +SHOWER CURTAIN +SKI +SKI MASK +SLEEPING BAG +SLIDE RULE +SLIDING DOOR +SLOT +SNORKEL +SNOWMOBILE +SNOWPLOW +SOAP DISPENSER +SOCCER BALL +SOCK +SOLAR DISH +SOMBRERO +SOUP BOWL +SPACE BAR +SPACE HEATER +SPACE SHUTTLE +SPATULA +SPEEDBOAT +SPIDER WEB +SPINDLE +SPORTS CAR +SPOTLIGHT +STAGE +STEAM LOCOMOTIVE +STEEL ARCH BRIDGE +STEEL DRUM +STETHOSCOPE +STOLE +STONE WALL +STOPWATCH +STOVE +STRAINER +STREETCAR +STRETCHER +STUDIO COUCH +STUPA +SUBMARINE +SUIT +SUNDIAL +SUNGLASS +SUNGLASSES +SUNSCREEN +SUSPENSION BRIDGE +SWAB +SWEATSHIRT +SWIMMING TRUNKS +SWING +SWITCH +SYRINGE +TABLE LAMP +TANK +TAPE PLAYER +TEAPOT +TEDDY +TELEVISION +TENNIS BALL +THATCH +THEATER CURTAIN +THIMBLE +THRESHER +THRONE +TILE ROOF +TOASTER +TOBACCO SHOP +TOILET SEAT +TORCH +TOTEM POLE +TOW TRUCK +TOYSHOP +TRACTOR +TRAILER TRUCK +TRAY +TRENCH COAT +TRICYCLE +TRIMARAN +TRIPOD +TRIUMPHAL ARCH +TROLLEYBUS +TROMBONE +TUB +TURNSTILE +TYPEWRITER KEYBOARD +UMBRELLA +UNICYCLE +UPRIGHT 
+VACUUM +VASE +VAULT +VELVET +VENDING MACHINE +VESTMENT +VIADUCT +VIOLIN +VOLLEYBALL +WAFFLE IRON +WALL CLOCK +WALLET +WARDROBE +WARPLANE +WASHBASIN +WASHER +WATER BOTTLE +WATER JUG +WATER TOWER +WHISKEY JUG +WHISTLE +WIG +WINDOW SCREEN +WINDOW SHADE +WINDSOR TIE +WINE BOTTLE +WING +WOK +WOODEN SPOON +WOOL +WORM FENCE +WRECK +YAWL +YURT +WEB SITE +COMIC BOOK +CROSSWORD PUZZLE +STREET SIGN +TRAFFIC LIGHT +BOOK JACKET +MENU +PLATE +GUACAMOLE +CONSOMME +HOT POT +TRIFLE +ICE CREAM +ICE LOLLY +FRENCH LOAF +BAGEL +PRETZEL +CHEESEBURGER +HOTDOG +MASHED POTATO +HEAD CABBAGE +BROCCOLI +CAULIFLOWER +ZUCCHINI +SPAGHETTI SQUASH +ACORN SQUASH +BUTTERNUT SQUASH +CUCUMBER +ARTICHOKE +BELL PEPPER +CARDOON +MUSHROOM +GRANNY SMITH +STRAWBERRY +ORANGE +LEMON +FIG +PINEAPPLE +BANANA +JACKFRUIT +CUSTARD APPLE +POMEGRANATE +HAY +CARBONARA +CHOCOLATE SAUCE +DOUGH +MEAT LOAF +PIZZA +POTPIE +BURRITO +RED WINE +ESPRESSO +CUP +EGGNOG +ALP +BUBBLE +CLIFF +CORAL REEF +GEYSER +LAKESIDE +PROMONTORY +SANDBAR +SEASHORE +VALLEY +VOLCANO +BALLPLAYER +GROOM +SCUBA DIVER +RAPESEED +DAISY +LADY SLIPPER +CORN +ACORN +HIP +BUCKEYE +CORAL FUNGUS +AGARIC +GYROMITRA +STINKHORN +EARTHSTAR +HEN-OF-THE-WOODS +BOLETE +EAR +TOILET TISSUE diff --git a/src/resources/platform_handlers/pytorch/README.md b/src/resources/platform_handlers/pytorch/README.md deleted file mode 100644 index 0e9240b8..00000000 --- a/src/resources/platform_handlers/pytorch/README.md +++ /dev/null @@ -1,132 +0,0 @@ - - -# Serving PyTorch models using Python Backend \[Experimental\] - -**NOTE**: *This feature is subject to change and removal, and should not -be used in production.* - -Starting from 23.08, we are adding an experimental support for loading and -serving PyTorch models directly via Python backend. The model can be provided -within the triton server model repository, and a -[pre-built Python model](model.py) will be used to load and serve the PyTorch -model. - -## Model Layout - -The model repository should look like: - -``` -model_repository/ -`-- model_directory - |-- 1 - | |-- model.py - | `-- model.pt - `-- config.pbtxt -``` - -The `model.py` contains the class definition of the PyTorch model. The class -should extend the -[`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). -The `model.pt` may be optionally provided which contains the saved -[`state_dict`](https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-for-inference) -of the model. For serving TorchScript models, a `model.pt` TorchScript can be -provided in place of the `model.py` file. - -By default, Triton will use the -[PyTorch backend](https://github.com/triton-inference-server/pytorch_backend) to -load and serve PyTorch models. In order to serve from Python backend, -[model configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md) -should explicitly provide the following settings: - -``` -backend: "python" -platform: "pytorch" -``` - -## PyTorch Installation - -This feature will take advantage of the -[`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html#torch-compile) -optimization, make sure the -[PyTorch 2.0+ pip package](https://pypi.org/project/torch/2.0.1/) is available -in the same Python environment. - -``` -pip install torch==2.0.1 -``` -Alternatively, a -[Python Execution Environment](#using-custom-python-execution-environments) -with the PyTorch dependency may be used. 
- -## Customization - -The following PyTorch settings may be customized by setting parameters on the -`config.pbtxt`. - -[`torch.set_num_threads(int)`](https://pytorch.org/docs/stable/generated/torch.set_num_threads.html#torch.set_num_threads) -- Key: NUM_THREADS -- Value: The number of threads used for intraop parallelism on CPU. - -[`torch.set_num_interop_threads(int)`](https://pytorch.org/docs/stable/generated/torch.set_num_interop_threads.html#torch.set_num_interop_threads) -- Key: NUM_INTEROP_THREADS -- Value: The number of threads used for interop parallelism (e.g. in JIT -interpreter) on CPU. - -[`torch.compile()` parameters](https://pytorch.org/docs/stable/generated/torch.compile.html#torch-compile) -- Key: TORCH_COMPILE_OPTIONAL_PARAMETERS -- Value: Any of following parameter(s) encoded as a JSON object. - - fullgraph (*bool*): Whether it is ok to break model into several subgraphs. - - dynamic (*bool*): Use dynamic shape tracing. - - backend (*str*): The backend to be used. - - mode (*str*): Can be either "default", "reduce-overhead" or "max-autotune". - - options (*dict*): A dictionary of options to pass to the backend. - - disable (*bool*): Turn `torch.compile()` into a no-op for testing. - -For example: -``` -parameters: { - key: "NUM_THREADS" - value: { string_value: "4" } -} -parameters: { - key: "TORCH_COMPILE_OPTIONAL_PARAMETERS" - value: { string_value: "{\"disable\": true}" } -} -`````` - -## Limitations - -Following are few known limitations of this feature: -- Python functions optimizable by `torch.compile` may not be served directly in -the `model.py` file, they need to be enclosed by a class extending the -[`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). -- Model weights cannot be shared across multiple instances on the same GPU -device. -- When using `KIND_MODEL` as model instance kind, the default device of the -first parameter on the model is used. From cb53f0ad1395f7a5b826d9fae6f9b613cc3e53f6 Mon Sep 17 00:00:00 2001 From: Tanmay Verma Date: Tue, 15 Aug 2023 12:47:40 -0700 Subject: [PATCH 140/216] Add PyTorch platform handler example (#287) (#289) * Add PyTorch platform handler example * Refactor docs structure * Add more comments and minor refactoring * Further break down client.py * Remove exit 0 if terminated normally * Simplify comments * Improve comment * List mug.jpg paths * Docs update * Describe the source of mug.jpg Co-authored-by: Jacky <18255193+kthui@users.noreply.github.com> From 0f2ce85f789dddffec51aba6c483f759675f66f1 Mon Sep 17 00:00:00 2001 From: Matthieu Toulemont <75613333+MatthieuToulemont@users.noreply.github.com> Date: Fri, 18 Aug 2023 00:16:30 +0200 Subject: [PATCH 141/216] Skip dimension of size 1 in contiguous checks. (#281) * Skip dimension of size 1 in contiguous checks. Since PyTorch 1.13 dimension of size 1 have normalised strides in PyTorch which fail here when using DLPack. This was done to conform the torch stride representation and the one from numpy. Unfortunately this means we are stuck with PyTorch 1.12.0 in python models. 
* Conform to pre-commit guidelines --- src/pb_tensor.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc index 4011faad..84cd8f3f 100644 --- a/src/pb_tensor.cc +++ b/src/pb_tensor.cc @@ -433,12 +433,14 @@ PbTensor::FromDLPackCapsule( int64_t calculated_stride{1}; bool is_contiguous_c_order = true; for (size_t i = 1; i < dims.size(); i++) { - if (strides[ndim - i] != calculated_stride) { - is_contiguous_c_order = false; - break; - } + if (dims[ndim - i] != 1) { + if (strides[ndim - i] != calculated_stride) { + is_contiguous_c_order = false; + break; + } - calculated_stride *= dims[ndim - i]; + calculated_stride *= dims[ndim - i]; + } } if (!is_contiguous_c_order) { From 6f369ef10312ad3db0aef73df5b1138b8467cf14 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Tue, 22 Aug 2023 18:54:18 -0400 Subject: [PATCH 142/216] Reduce the default required shm size to 1MB (#291) * Reduce the default required shm size to 1MB * Review edit --- README.md | 4 ++-- src/python_be.cc | 8 ++++---- src/shm_manager.cc | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 49d4229b..58427bbe 100644 --- a/README.md +++ b/README.md @@ -852,8 +852,8 @@ Starting from 21.04 release, Python backend uses shared memory to connect user's code to Triton. Note that this change is completely transparent and does not require any change to the existing user's model code. -Python backend, by default, allocates 64 MBs for each model instance. Then, -it will grow the shared memory region by 64 MBs whenever an increase is +Python backend, by default, allocates 1 MB for each model instance. Then, +it will grow the shared memory region by 1 MB chunks whenever an increase is required. You can configure the default shared memory used by each model instance using the `shm-default-byte-size` flag. The amount of shared memory growth can be configured using the `shm-growth-byte-size`. diff --git a/src/python_be.cc b/src/python_be.cc index df2a3235..14e0c74b 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -1901,8 +1901,8 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) std::unique_ptr backend_state(new BackendState()); triton::common::TritonJson::Value cmdline; - backend_state->shm_default_byte_size = 64 * 1024 * 1024; // 64 MBs - backend_state->shm_growth_byte_size = 64 * 1024 * 1024; // 64 MBs + backend_state->shm_default_byte_size = 1 * 1024 * 1024; // 1 MB + backend_state->shm_growth_byte_size = 1 * 1024 * 1024; // 1 MB backend_state->stub_timeout_seconds = 30; backend_state->shm_message_queue_size = 1000; backend_state->number_of_instance_inits = 0; @@ -1936,8 +1936,8 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) RETURN_IF_ERROR(shm_default_size.AsString(&shm_default_byte_size)); try { backend_state->shm_default_byte_size = std::stol(shm_default_byte_size); - // Shared memory default byte size can't be less than 4 MBs. - if (backend_state->shm_default_byte_size < 4 * 1024 * 1024) { + // Shared memory default byte size can't be less than 1 MB. + if (backend_state->shm_default_byte_size < 1 * 1024 * 1024) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, (std::string("shm-default-byte-size") + diff --git a/src/shm_manager.cc b/src/shm_manager.cc index b5499f88..b52d5a4f 100644 --- a/src/shm_manager.cc +++ b/src/shm_manager.cc @@ -76,7 +76,7 @@ SharedMemoryManager::SharedMemoryManager( "' to requested size (" + std::to_string(shm_size) + " bytes). 
If you are running Triton inside docker, use '--shm-size' " "flag to control the shared memory region size. Each Python backend " - "model instance requires at least 64MBs of shared memory. Error: " + + "model instance requires at least 1 MB of shared memory. Error: " + ex.what()); // Remove the shared memory region if there was an error. bi::shared_memory_object::remove(shm_region_name.c_str()); From 66f5e1ec70fdd5368c2d8750664ecb284a95554e Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Fri, 1 Sep 2023 17:46:27 -0700 Subject: [PATCH 143/216] Enable Python execute() to return Triton error code (#292) * Add error code to pb error * Return error code on pb error * Add to error code param to Python * Move ErrorCode into TritonError * Expose ErrorCode internal in TritonError * Unify PbError constructors --- src/infer_response.cc | 4 ++-- src/pb_error.cc | 37 +++++++++++++++++++++++++------- src/pb_error.h | 33 ++++++++++++++++++++++++----- src/pb_stub.cc | 49 ++++++++++++++++++++++++++++++++++++++++--- src/python_be.cc | 2 +- 5 files changed, 106 insertions(+), 19 deletions(-) diff --git a/src/infer_response.cc b/src/infer_response.cc index afadc324..ebadc02d 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -243,8 +243,8 @@ InferResponse::Send( }); if (HasError()) { - *response_error = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, Error()->Message().c_str()); + *response_error = + TRITONSERVER_ErrorNew(Error()->Code(), Error()->Message().c_str()); return; } diff --git a/src/pb_error.cc b/src/pb_error.cc index e190af42..0e5d0bd4 100644 --- a/src/pb_error.cc +++ b/src/pb_error.cc @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -27,6 +27,13 @@ #include "pb_error.h" namespace triton { namespace backend { namespace python { + +TRITONSERVER_Error_Code +PbError::Code() +{ + return code_; +} + const std::string& PbError::Message() { @@ -43,7 +50,10 @@ void PbError::SaveToSharedMemory(std::unique_ptr& shm_pool) { message_shm_ = PbString::Create(shm_pool, message_); - shm_handle_ = message_shm_->ShmHandle(); + error_shm_ = shm_pool->Construct(); + error_shm_.data_->code = code_; + error_shm_.data_->message_shm_handle = message_shm_->ShmHandle(); + shm_handle_ = error_shm_.handle_; } std::shared_ptr @@ -51,14 +61,25 @@ PbError::LoadFromSharedMemory( std::unique_ptr& shm_pool, bi::managed_external_buffer::handle_t shm_handle) { - std::unique_ptr message_shm = - PbString::LoadFromSharedMemory(shm_pool, shm_handle); - return std::shared_ptr(new PbError(message_shm)); + AllocatedSharedMemory error_shm = + shm_pool->Load(shm_handle); + std::unique_ptr message_shm = PbString::LoadFromSharedMemory( + shm_pool, error_shm.data_->message_shm_handle); + + TRITONSERVER_Error_Code code = error_shm.data_->code; + std::string message = message_shm->String(); + + return std::shared_ptr(new PbError( + std::move(message_shm), std::move(error_shm), code, std::move(message))); } -PbError::PbError(std::unique_ptr& message_shm) +PbError::PbError( + std::shared_ptr&& message_shm, + AllocatedSharedMemory&& error_shm, TRITONSERVER_Error_Code code, + std::string&& message) + : message_shm_(std::move(message_shm)), error_shm_(std::move(error_shm)), + code_(code), message_(std::move(message)) { - message_shm_ = std::move(message_shm); - message_ = message_shm_->String(); } + }}} // namespace triton::backend::python diff --git a/src/pb_error.h b/src/pb_error.h index b80546b2..6001459a 100644 --- a/src/pb_error.h +++ b/src/pb_error.h @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -32,21 +32,44 @@ #include "pb_utils.h" namespace triton { namespace backend { namespace python { + +struct PbErrorShm { + TRITONSERVER_Error_Code code; + bi::managed_external_buffer::handle_t message_shm_handle; +}; + class PbError { public: - PbError(const std::string& message) : message_(message) {} + PbError( + const std::string& message, + TRITONSERVER_Error_Code code = TRITONSERVER_ERROR_INTERNAL) + : code_(code), message_(message) + { + } + DISALLOW_COPY_AND_ASSIGN(PbError); + + TRITONSERVER_Error_Code Code(); const std::string& Message(); + void SaveToSharedMemory(std::unique_ptr& shm_pool); bi::managed_external_buffer::handle_t ShmHandle(); + static std::shared_ptr LoadFromSharedMemory( std::unique_ptr& shm_pool, bi::managed_external_buffer::handle_t handle); - DISALLOW_COPY_AND_ASSIGN(PbError); private: - PbError(std::unique_ptr& pb_error); - std::string message_; + PbError( + std::shared_ptr&& message_shm, + AllocatedSharedMemory&& error_shm, + TRITONSERVER_Error_Code code, std::string&& message); + std::shared_ptr message_shm_; + AllocatedSharedMemory error_shm_; bi::managed_external_buffer::handle_t shm_handle_; + + TRITONSERVER_Error_Code code_; + std::string message_; }; + }}}; // namespace triton::backend::python diff --git a/src/pb_stub.cc b/src/pb_stub.cc index c5c6b42e..d096f420 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -1346,9 +1346,52 @@ Logger::BackendLoggingActive() PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) { - py::class_>(module, "TritonError") - .def(py::init()) - .def("message", &PbError::Message); + py::class_> triton_error( + module, "TritonError"); + py::enum_(triton_error, "__ErrorCode") + .value("UNKNOWN", TRITONSERVER_Error_Code::TRITONSERVER_ERROR_UNKNOWN) + .value("INTERNAL", TRITONSERVER_Error_Code::TRITONSERVER_ERROR_INTERNAL) + .value("NOT_FOUND", TRITONSERVER_Error_Code::TRITONSERVER_ERROR_NOT_FOUND) + .value( + "INVALID_ARG", + TRITONSERVER_Error_Code::TRITONSERVER_ERROR_INVALID_ARG) + .value( + "UNAVAILABLE", + TRITONSERVER_Error_Code::TRITONSERVER_ERROR_UNAVAILABLE) + .value( + "UNSUPPORTED", + TRITONSERVER_Error_Code::TRITONSERVER_ERROR_UNSUPPORTED) + .value( + "ALREADY_EXISTS", + TRITONSERVER_Error_Code::TRITONSERVER_ERROR_ALREADY_EXISTS) + .export_values(); + triton_error.def_property_readonly_static( + "UNKNOWN", + [](py::object /* self */) { return TRITONSERVER_ERROR_UNKNOWN; }); + triton_error.def_property_readonly_static( + "INTERNAL", + [](py::object /* self */) { return TRITONSERVER_ERROR_INTERNAL; }); + triton_error.def_property_readonly_static( + "NOT_FOUND", + [](py::object /* self */) { return TRITONSERVER_ERROR_NOT_FOUND; }); + triton_error.def_property_readonly_static( + "INVALID_ARG", + [](py::object /* self */) { return TRITONSERVER_ERROR_INVALID_ARG; }); + triton_error.def_property_readonly_static( + "UNAVAILABLE", + [](py::object /* self */) { return TRITONSERVER_ERROR_UNAVAILABLE; }); + triton_error.def_property_readonly_static( + "UNSUPPORTED", + [](py::object /* self */) { return TRITONSERVER_ERROR_UNSUPPORTED; }); + triton_error.def_property_readonly_static( + "ALREADY_EXISTS", + [](py::object /* self */) { return TRITONSERVER_ERROR_ALREADY_EXISTS; }); + triton_error.def( + py::init(), + py::arg("message").none(false), + py::arg("code").none(false) = TRITONSERVER_ERROR_INTERNAL); + triton_error.def("code", &PbError::Code); + triton_error.def("message", 
&PbError::Message); py::class_>( module, "PreferredMemory") diff --git a/src/python_be.cc b/src/python_be.cc index 14e0c74b..70c89554 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -1456,7 +1456,7 @@ ModelInstanceState::ProcessRequests( false /* open_cuda_handle */); if (infer_response->HasError()) { TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, + infer_response->Error()->Code(), infer_response->Error()->Message().c_str()); LOG_IF_ERROR( From f4e24d7735563f942f66a3baa91e5b2d2433df8e Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Tue, 12 Sep 2023 18:52:16 -0700 Subject: [PATCH 144/216] Fix returning error when tracing is off (#295) --- src/python_be.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/python_be.cc b/src/python_be.cc index 70c89554..b196cfab 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -365,8 +365,11 @@ ModelInstanceState::SaveRequestsToSharedMemory( RETURN_IF_ERROR(TRITONBACKEND_RequestFlags(request, &flags)); TRITONSERVER_InferenceTrace* triton_trace; - RETURN_IF_ERROR(TRITONBACKEND_RequestTrace(request, &triton_trace)); - + auto err = TRITONBACKEND_RequestTrace(request, &triton_trace); + if (err != nullptr) { + triton_trace = nullptr; + TRITONSERVER_ErrorDelete(err); + } InferenceTrace trace = InferenceTrace(triton_trace); std::unique_ptr infer_request; From 193de67ee692fdb2fd307941de4815b150bd6791 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Tue, 19 Sep 2023 08:59:33 -0700 Subject: [PATCH 145/216] Load model of current torchvision (#298) --- examples/pytorch_platform_handler/model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/pytorch_platform_handler/model.py b/examples/pytorch_platform_handler/model.py index 7fe59597..391063b8 100755 --- a/examples/pytorch_platform_handler/model.py +++ b/examples/pytorch_platform_handler/model.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import torch +import torchvision class ResNet50(torch.nn.Module): @@ -35,7 +36,7 @@ class ResNet50(torch.nn.Module): def __init__(self): super().__init__() self._model = torch.hub.load( - "pytorch/vision", + "pytorch/vision:v" + torchvision.__version__.split("+")[0], "resnet50", weights="ResNet50_Weights.IMAGENET1K_V2", skip_validation=True, From 238e0d0012c8b506d29a4543c24649ee29c44d86 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Tue, 19 Sep 2023 12:18:56 -0700 Subject: [PATCH 146/216] Add docs for error code (#300) * Add docs for error code * Describe default behavior Co-authored-by: Ryan McCormick * Python format code block Co-authored-by: Ryan McCormick --------- Co-authored-by: Ryan McCormick --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index 58427bbe..517a9b64 100644 --- a/README.md +++ b/README.md @@ -485,6 +485,23 @@ class TritonPythonModel: return responses ``` +Starting from 23.09, `pb_utils.TritonError` may be constructed with an optional +Triton error code on the second parameter. For example: + +```python +pb_utils.TritonError("The file is not found", pb_utils.TritonError.NOT_FOUND) +``` + +If no code is specified, `pb_utils.TritonError.INTERNAL` will be used by default. 
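+
+As one minimal sketch of mapping a Python exception to a coded error response
+(the `_infer` helper below is hypothetical), the error can be attached to an
+`InferenceResponse`:
+
+```python
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    def execute(self, requests):
+        responses = []
+        for request in requests:
+            try:
+                # `_infer` is a hypothetical helper returning a pb_utils.InferenceResponse.
+                responses.append(self._infer(request))
+            except FileNotFoundError as err:
+                responses.append(
+                    pb_utils.InferenceResponse(
+                        error=pb_utils.TritonError(
+                            str(err), pb_utils.TritonError.NOT_FOUND
+                        )
+                    )
+                )
+        return responses
+```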
+ +Supported error codes: +* `pb_utils.TritonError.UNKNOWN` +* `pb_utils.TritonError.INTERNAL` +* `pb_utils.TritonError.NOT_FOUND` +* `pb_utils.TritonError.INVALID_ARG` +* `pb_utils.TritonError.UNAVAILABLE` +* `pb_utils.TritonError.UNSUPPORTED` +* `pb_utils.TritonError.ALREADY_EXISTS` #### Decoupled mode From b136bf3fad77d5d467a6db5bd739be16eb9d9400 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Tue, 26 Sep 2023 12:46:41 -0400 Subject: [PATCH 147/216] Fix response iterator memory leak (#302) --- src/pb_response_iterator.cc | 4 +--- src/pb_response_iterator.h | 2 +- src/pb_stub.cc | 7 ++++++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/pb_response_iterator.cc b/src/pb_response_iterator.cc index 9561df68..1e0d631a 100644 --- a/src/pb_response_iterator.cc +++ b/src/pb_response_iterator.cc @@ -100,7 +100,7 @@ ResponseIterator::Next() } } -py::iterator +void ResponseIterator::Iter() { if (is_finished_) { @@ -111,8 +111,6 @@ ResponseIterator::Iter() idx_ = 0; } } - - return py::cast(*this); } void diff --git a/src/pb_response_iterator.h b/src/pb_response_iterator.h index 1122a216..cad5ff1f 100644 --- a/src/pb_response_iterator.h +++ b/src/pb_response_iterator.h @@ -38,7 +38,7 @@ class ResponseIterator { ~ResponseIterator(); std::shared_ptr Next(); - py::iterator Iter(); + void Iter(); void EnqueueResponse(std::shared_ptr infer_response); void* Id(); void Clear(); diff --git a/src/pb_stub.cc b/src/pb_stub.cc index d096f420..37c9a5b5 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -1544,7 +1544,12 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::class_>( module, "ResponseIterator") .def(py::init&>()) - .def("__iter__", &ResponseIterator::Iter, py::keep_alive<0, 1>()) + .def( + "__iter__", + [](ResponseIterator& it) -> ResponseIterator& { + it.Iter(); + return it; + }) .def("__next__", &ResponseIterator::Next); py::class_ logger(module, "Logger"); From 67ca860e72ba2547cdfad324b46c579eee5d3200 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Fri, 6 Oct 2023 09:51:04 -0700 Subject: [PATCH 148/216] Add Python backend request cancellation (#304) * Add cancelled response status * Add request cancellation * Check cancellation on response factory if available * Remove unnecessary wrapping * Throw error instead of log error * Add is cancelled check at response sender * Enable more reuse on request cancellation and improve model interface * Documentation wording updates * Copyright year update * Rollback response sender auto close on cancel * Rollback non-decoupled any response on cancel * Decoupled final flag docs update --- CMakeLists.txt | 2 + README.md | 36 +++++++++++++++++ src/infer_request.cc | 14 ++++++- src/infer_request.h | 3 ++ src/ipc_message.h | 3 +- src/pb_cancel.cc | 90 ++++++++++++++++++++++++++++++++++++++++++ src/pb_cancel.h | 64 ++++++++++++++++++++++++++++++ src/pb_stub.cc | 51 +++++++++++++++++++++++- src/pb_stub.h | 7 ++++ src/pb_utils.h | 6 +++ src/python_be.cc | 38 ++++++++++++++++++ src/python_be.h | 3 ++ src/response_sender.cc | 14 +++++-- src/response_sender.h | 8 +++- 14 files changed, 329 insertions(+), 10 deletions(-) create mode 100644 src/pb_cancel.cc create mode 100644 src/pb_cancel.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 93a7ae60..3f20bbc3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -208,6 +208,8 @@ set( src/pb_stub.cc src/pb_response_iterator.h src/pb_response_iterator.cc + src/pb_cancel.cc + src/pb_cancel.h ) list(APPEND diff --git a/README.md b/README.md index 
517a9b64..4cb9a960 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,7 @@ any C++ code. - [`execute`](#execute) - [Default Mode](#default-mode) - [Error Handling](#error-handling) + - [Request Cancellation Handling](#request-cancellation-handling) - [Decoupled mode](#decoupled-mode) - [Use Cases](#use-cases) - [Known Issues](#known-issues) @@ -502,6 +503,36 @@ Supported error codes: * `pb_utils.TritonError.UNAVAILABLE` * `pb_utils.TritonError.UNSUPPORTED` * `pb_utils.TritonError.ALREADY_EXISTS` +* `pb_utils.TritonError.CANCELLED` (since 23.10) + +#### Request Cancellation Handling + +One or more requests may be cancelled by the client during execution. Starting +from 23.10, `request.is_cancelled()` returns whether the request is cancelled or +not. For example: + +```python +import triton_python_backend_utils as pb_utils + +class TritonPythonModel: + ... + + def execute(self, requests): + responses = [] + + for request in requests: + if request.is_cancelled(): + responses.append(pb_utils.InferenceResponse( + error=pb_utils.TritonError("Message", pb_utils.TritonError.CANCELLED))) + else: + ... + + return responses +``` + +Although checking for request cancellation is optional, it is recommended to +check for cancellation at strategic request execution stages that can early +terminate the execution in the event of its response is no longer needed. #### Decoupled mode @@ -543,6 +574,11 @@ request. After setting errors for an pb_utils.InferenceResponse object, use InferenceResponseSender.send() to send response with the error back to the user. +Starting from 23.10, request cancellation can be checked directly on the +`InferenceResponseSender` object using `response_sender.is_cancelled()`. Sending +the TRITONSERVER_RESPONSE_COMPLETE_FINAL flag at the end of response is still +needed even the request is cancelled. 
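+
+As a minimal decoupled sketch (the `_generate` helper, assumed to yield
+`pb_utils.Tensor` objects, is hypothetical), the sender can be polled between
+responses while still closing the stream with the final flag:
+
+```python
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    def execute(self, requests):
+        for request in requests:
+            response_sender = request.get_response_sender()
+            for output_tensor in self._generate(request):
+                if response_sender.is_cancelled():
+                    # Stop producing responses; the stream is still closed below.
+                    break
+                response_sender.send(
+                    pb_utils.InferenceResponse(output_tensors=[output_tensor])
+                )
+            # The final flag is required even if the request was cancelled.
+            response_sender.send(
+                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
+            )
+        return None
+```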
+ ##### Use Cases The decoupled mode is powerful and supports various other use cases: diff --git a/src/infer_request.cc b/src/infer_request.cc index 5fdae669..e9d243f1 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -71,9 +71,11 @@ InferRequest::InferRequest( inputs_ = inputs; requested_output_names_ = requested_output_names; #ifdef TRITON_PB_STUB + pb_cancel_ = + std::make_shared(response_factory_address_, request_address_); response_sender_ = std::make_shared( request_address_, response_factory_address_, - Stub::GetOrCreateInstance()->SharedMemory()); + Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_); #endif } @@ -379,9 +381,11 @@ InferRequest::InferRequest( trace_ = infer_request_shm_ptr_->trace; #ifdef TRITON_PB_STUB + pb_cancel_ = + std::make_shared(response_factory_address_, request_address_); response_sender_ = std::make_shared( request_address_, response_factory_address_, - Stub::GetOrCreateInstance()->SharedMemory()); + Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_); #endif } @@ -400,6 +404,12 @@ InferRequest::DeleteResponseFactory() #endif #ifdef TRITON_PB_STUB +bool +InferRequest::IsCancelled() +{ + return pb_cancel_->IsCancelled(); +} + std::shared_ptr InferRequest::GetResponseSender() { diff --git a/src/infer_request.h b/src/infer_request.h index 6652b2fb..bc6a2acf 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -34,6 +34,7 @@ #include "pb_tensor.h" #ifdef TRITON_PB_STUB +#include "pb_cancel.h" #include "response_sender.h" #endif @@ -107,6 +108,7 @@ class InferRequest { #ifdef TRITON_PB_STUB std::shared_ptr Exec(const bool is_decoupled); std::shared_ptr GetResponseSender(); + bool IsCancelled(); #endif /// Save an Inference Request to shared memory. @@ -173,6 +175,7 @@ class InferRequest { std::unique_ptr parameters_shm_; #ifdef TRITON_PB_STUB + std::shared_ptr pb_cancel_; std::shared_ptr response_sender_; #endif }; diff --git a/src/ipc_message.h b/src/ipc_message.h index 7040f2b4..14d3dc5f 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -62,7 +62,8 @@ typedef enum PYTHONSTUB_commandtype_enum { PYTHONSTUB_MetricRequestSet, PYTHONSTUB_LoadModelRequest, PYTHONSTUB_UnloadModelRequest, - PYTHONSTUB_ModelReadinessRequest + PYTHONSTUB_ModelReadinessRequest, + PYTHONSTUB_IsRequestCancelled } PYTHONSTUB_CommandType; /// diff --git a/src/pb_cancel.cc b/src/pb_cancel.cc new file mode 100644 index 00000000..4c9b926b --- /dev/null +++ b/src/pb_cancel.cc @@ -0,0 +1,90 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "pb_cancel.h" + +#include "pb_stub.h" + +namespace triton { namespace backend { namespace python { + +void +PbCancel::SaveToSharedMemory(std::unique_ptr& shm_pool) +{ + cancel_shm_ = shm_pool->Construct(); + new (&(cancel_shm_.data_->mu)) bi::interprocess_mutex; + new (&(cancel_shm_.data_->cv)) bi::interprocess_condition; + cancel_shm_.data_->waiting_on_stub = false; + cancel_shm_.data_->response_factory_address = response_factory_address_; + cancel_shm_.data_->request_address = request_address_; + cancel_shm_.data_->is_cancelled = is_cancelled_; +} + +bi::managed_external_buffer::handle_t +PbCancel::ShmHandle() +{ + return cancel_shm_.handle_; +} + +IsCancelledMessage* +PbCancel::ShmPayload() +{ + return cancel_shm_.data_.get(); +} + +bool +PbCancel::IsCancelled() +{ + std::unique_lock lk(mu_); + // The cancelled flag can only move from false to true, not the other way, so + // it is checked on each query until cancelled and then implicitly cached. + if (is_cancelled_) { + return is_cancelled_; + } + if (!updating_) { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + if (!stub->StubToParentServiceActive()) { + LOG_ERROR << "Cannot communicate with parent service"; + return false; + } + stub->EnqueueIsCancelled(this); + updating_ = true; + } + cv_.wait(lk, [this] { return !updating_; }); + return is_cancelled_; +} + +void +PbCancel::ReportIsCancelled(bool is_cancelled) +{ + { + std::lock_guard lk(mu_); + is_cancelled_ = is_cancelled; + updating_ = false; + } + cv_.notify_all(); +} + +}}} // namespace triton::backend::python diff --git a/src/pb_cancel.h b/src/pb_cancel.h new file mode 100644 index 00000000..3ebf07b5 --- /dev/null +++ b/src/pb_cancel.h @@ -0,0 +1,64 @@ +// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include + +#include "pb_utils.h" + +namespace triton { namespace backend { namespace python { + +class PbCancel { + public: + PbCancel(intptr_t response_factory_address, intptr_t request_address) + : updating_(false), response_factory_address_(response_factory_address), + request_address_(request_address), is_cancelled_(false) + { + } + DISALLOW_COPY_AND_ASSIGN(PbCancel); + + void SaveToSharedMemory(std::unique_ptr& shm_pool); + bi::managed_external_buffer::handle_t ShmHandle(); + IsCancelledMessage* ShmPayload(); + + bool IsCancelled(); + void ReportIsCancelled(bool is_cancelled); + + private: + AllocatedSharedMemory cancel_shm_; + + std::mutex mu_; + std::condition_variable cv_; + bool updating_; + + intptr_t response_factory_address_; + intptr_t request_address_; + bool is_cancelled_; +}; + +}}}; // namespace triton::backend::python diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 37c9a5b5..87abe583 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -945,6 +945,9 @@ Stub::ServiceStubToParentRequests() SendLogMessage(utils_msg_payload); } else if (utils_msg_payload->command_type == PYTHONSTUB_CleanupRequest) { SendCleanupId(utils_msg_payload); + } else if ( + utils_msg_payload->command_type == PYTHONSTUB_IsRequestCancelled) { + SendIsCancelled(utils_msg_payload); } else { std::cerr << "Error when sending message via stub_to_parent message " "buffer - unknown command\n"; @@ -1028,6 +1031,44 @@ Stub::EnqueueCleanupId(void* id) } } +void +Stub::EnqueueIsCancelled(PbCancel* pb_cancel) +{ + std::unique_ptr utils_msg_payload = + std::make_unique( + PYTHONSTUB_IsRequestCancelled, reinterpret_cast(pb_cancel)); + EnqueueUtilsMessage(std::move(utils_msg_payload)); +} + +void +Stub::SendIsCancelled(std::unique_ptr& utils_msg_payload) +{ + PbCancel* pb_cancel = + reinterpret_cast(utils_msg_payload->utils_message_ptr); + pb_cancel->SaveToSharedMemory(shm_pool_); + + IsCancelledMessage* message_payload = pb_cancel->ShmPayload(); + std::unique_ptr ipc_message = + IPCMessage::Create(shm_pool_, false /* inline_response */); + ipc_message->Command() = utils_msg_payload->command_type; + ipc_message->Args() = pb_cancel->ShmHandle(); + + bool is_cancelled = false; + { + bi::scoped_lock lk(message_payload->mu); + + SendIPCUtilsMessage(ipc_message); + while (!message_payload->waiting_on_stub) { + message_payload->cv.wait(lk); + } + + is_cancelled = message_payload->is_cancelled; + message_payload->waiting_on_stub = false; + message_payload->cv.notify_all(); + } + pb_cancel->ReportIsCancelled(is_cancelled); +} + bool Stub::StubToParentServiceActive() { @@ -1364,6 +1405,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) .value( "ALREADY_EXISTS", TRITONSERVER_Error_Code::TRITONSERVER_ERROR_ALREADY_EXISTS) + .value("CANCELLED", TRITONSERVER_Error_Code::TRITONSERVER_ERROR_CANCELLED) .export_values(); triton_error.def_property_readonly_static( "UNKNOWN", @@ -1386,6 +1428,9 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) 
triton_error.def_property_readonly_static( "ALREADY_EXISTS", [](py::object /* self */) { return TRITONSERVER_ERROR_ALREADY_EXISTS; }); + triton_error.def_property_readonly_static( + "CANCELLED", + [](py::object /* self */) { return TRITONSERVER_ERROR_CANCELLED; }); triton_error.def( py::init(), py::arg("message").none(false), @@ -1501,7 +1546,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) .def( "requested_output_names", &InferRequest::RequestedOutputNames, py::return_value_policy::reference_internal) - .def("get_response_sender", &InferRequest::GetResponseSender); + .def("get_response_sender", &InferRequest::GetResponseSender) + .def("is_cancelled", &InferRequest::IsCancelled); py::class_>(module, "Tensor") .def(py::init(&PbTensor::FromNumpy)) @@ -1539,7 +1585,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) module, "InferenceResponseSender") .def( "send", &ResponseSender::Send, py::arg("response") = nullptr, - py::arg("flags") = 0); + py::arg("flags") = 0) + .def("is_cancelled", &ResponseSender::IsCancelled); py::class_>( module, "ResponseIterator") diff --git a/src/pb_stub.h b/src/pb_stub.h index 6d047d29..d52196e1 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -49,6 +49,7 @@ #include "message_queue.h" #include "metric.h" #include "metric_family.h" +#include "pb_cancel.h" #include "pb_log.h" #include "pb_response_iterator.h" #include "pb_utils.h" @@ -308,6 +309,12 @@ class Stub { /// Add cleanup id to queue void EnqueueCleanupId(void* id); + /// Add request cancellation query to queue + void EnqueueIsCancelled(PbCancel* pb_cancel); + + /// Send request cancellation query to python backend + void SendIsCancelled(std::unique_ptr& utils_msg_payload); + /// Is the stub initialized bool IsInitialized(); diff --git a/src/pb_utils.h b/src/pb_utils.h index 1d651f3f..612c46a4 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -182,6 +182,12 @@ struct CleanupMessage : SendMessageBase { void* id; }; +struct IsCancelledMessage : SendMessageBase { + intptr_t response_factory_address; + intptr_t request_address; + bool is_cancelled; +}; + struct CustomMetricsMessage : SendMessageBase { bi::managed_external_buffer::handle_t message; bool has_error; diff --git a/src/python_be.cc b/src/python_be.cc index b196cfab..7f46d473 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -817,6 +817,10 @@ ModelInstanceState::StubToParentMQMonitor() ProcessBLSCleanupRequest(message); break; } + case PYTHONSTUB_IsRequestCancelled: { + ProcessIsRequestCancelled(message); + break; + } case PYTHONSTUB_MetricFamilyRequestNew: case PYTHONSTUB_MetricFamilyRequestDelete: { ProcessMetricFamilyRequest(message); @@ -918,6 +922,40 @@ ModelInstanceState::ProcessBLSCleanupRequest( } } +void +ModelInstanceState::ProcessIsRequestCancelled( + const std::unique_ptr& message) +{ + AllocatedSharedMemory message_shm = + Stub()->ShmPool()->Load(message->Args()); + IsCancelledMessage* message_payload = + reinterpret_cast(message_shm.data_.get()); + + { + bi::scoped_lock lk{message_payload->mu}; + + if (message_payload->response_factory_address != 0) { + TRITONBACKEND_ResponseFactory* response_factory = + reinterpret_cast( + message_payload->response_factory_address); + TRITONBACKEND_ResponseFactoryIsCancelled( + response_factory, &message_payload->is_cancelled); + } else if (message_payload->request_address != 0) { + TRITONBACKEND_Request* request = reinterpret_cast( + message_payload->request_address); + TRITONBACKEND_RequestIsCancelled(request, &message_payload->is_cancelled); + } else { + throw 
PythonBackendException("Cannot determine request cancellation"); + } + + message_payload->waiting_on_stub = true; + message_payload->cv.notify_all(); + while (message_payload->waiting_on_stub) { + message_payload->cv.wait(lk); + } + } +} + template void ModelInstanceState::ProcessMessage( diff --git a/src/python_be.h b/src/python_be.h index 825c45de..4c8d702f 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -394,6 +394,9 @@ class ModelInstanceState : public BackendModelInstance { // Process the bls decoupled cleanup request void ProcessBLSCleanupRequest(const std::unique_ptr& message); + // Process request cancellation query + void ProcessIsRequestCancelled(const std::unique_ptr& message); + // Process a message. The function 'request_handler' is invoked // to handle the request. T should be either 'MetricFamily', 'Metric' or // 'ModelLoader', and MessageType should be either 'MetricFamilyMessage', diff --git a/src/response_sender.cc b/src/response_sender.cc index a74459f6..1e2e9b50 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -37,10 +37,11 @@ namespace triton { namespace backend { namespace python { ResponseSender::ResponseSender( intptr_t request_address, intptr_t response_factory_address, - std::unique_ptr& shm_pool) + std::unique_ptr& shm_pool, + const std::shared_ptr& pb_cancel) : request_address_(request_address), response_factory_address_(response_factory_address), shm_pool_(shm_pool), - closed_(false) + closed_(false), pb_cancel_(pb_cancel) { } @@ -184,4 +185,11 @@ ResponseSender::Send( } } } + +bool +ResponseSender::IsCancelled() +{ + return pb_cancel_->IsCancelled(); +} + }}} // namespace triton::backend::python diff --git a/src/response_sender.h b/src/response_sender.h index 114f22c0..fda0d5d3 100644 --- a/src/response_sender.h +++ b/src/response_sender.h @@ -1,4 +1,4 @@ -// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -27,6 +27,7 @@ #pragma once #include "infer_response.h" +#include "pb_cancel.h" #include "shm_manager.h" namespace triton { namespace backend { namespace python { @@ -35,13 +36,16 @@ class ResponseSender { public: ResponseSender( intptr_t request_address, intptr_t response_factory_address, - std::unique_ptr& shm_pool); + std::unique_ptr& shm_pool, + const std::shared_ptr& pb_cancel); void Send(std::shared_ptr response, const uint32_t flags); + bool IsCancelled(); private: intptr_t request_address_; intptr_t response_factory_address_; std::unique_ptr& shm_pool_; bool closed_; + std::shared_ptr pb_cancel_; }; }}} // namespace triton::backend::python From cd68026e202be9363553eeff9442be93f5fcb92b Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Fri, 6 Oct 2023 17:40:05 -0700 Subject: [PATCH 149/216] Add logic to handle Python-based backends instead of platform handlers (#303) --- CMakeLists.txt | 7 - src/pb_stub.cc | 66 +-- src/pb_stub.h | 15 +- src/python_be.cc | 67 ++- src/python_be.h | 7 +- .../platform_handlers/pytorch/model.py | 323 ----------- .../tensorflow_savedmodel/README.md | 87 --- .../tensorflow_savedmodel/model.py | 536 ------------------ src/stub_launcher.cc | 12 +- src/stub_launcher.h | 2 +- 10 files changed, 99 insertions(+), 1023 deletions(-) delete mode 100755 src/resources/platform_handlers/pytorch/model.py delete mode 100644 src/resources/platform_handlers/tensorflow_savedmodel/README.md delete mode 100644 src/resources/platform_handlers/tensorflow_savedmodel/model.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 3f20bbc3..54341e01 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -309,13 +309,6 @@ install( ${INSTALL_CONFIGDIR} ) -install( - DIRECTORY - src/resources/platform_handlers - DESTINATION - ${CMAKE_INSTALL_PREFIX}/backends/python -) - install( FILES src/resources/triton_python_backend_utils.py diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 87abe583..b38f8d38 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -82,9 +82,10 @@ Stub::Instantiate( const std::string& shm_region_name, const std::string& model_path, const std::string& model_version, const std::string& triton_install_path, bi::managed_external_buffer::handle_t ipc_control_handle, - const std::string& name, const std::string& platform) + const std::string& name, const std::string& python_runtime_model) { - model_context_.Init(model_path, platform, triton_install_path, model_version); + model_context_.Init( + model_path, python_runtime_model, triton_install_path, model_version); name_ = name; health_mutex_ = nullptr; initialized_ = false; @@ -1659,57 +1660,29 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) void ModelContext::Init( - const std::string& model_path, const std::string& platform, + const std::string& model_path, const std::string& runtime_modeldir, const std::string& triton_install_path, const std::string& model_version) { - bool python_model_found = false; - std::string platform_model_path; - - if (platform != "NONE") { - platform_model_path = - triton_install_path + "/platform_handlers/" + platform + "/model.py"; - // Check if model file exists in the path. - struct stat buffer; - if (stat(platform_model_path.c_str(), &buffer) == 0) { - // Use the Platform model for serving the model. 
- python_model_found = true; - type_ = ModelType::PLATFORM; - python_model_path_ = platform_model_path; - // Trimming the model name from the model path, the platform model - // will populate the expected default model file name into model_path_. - model_dir_ = model_path.substr(0, model_path.find_last_of("\\/")); - } else { - LOG_WARN << "Unable to find model(handler) \'" << platform_model_path - << "\' for platform field \'" << platform << "\'"; - } - } - - if (!python_model_found) { + type_ = ModelType::DEFAULT; + if (runtime_modeldir != "DEFAULT") { + // For python based backends, existence of `model.py` in the corresponding + // backend folder happens on the core side, so we can omit this check here. + python_model_path_ = runtime_modeldir + "/model.py"; + type_ = ModelType::BACKEND; + } else { python_model_path_ = model_path; // Check if model file exists in this path. struct stat buffer; - if (stat(python_model_path_.c_str(), &buffer) == 0) { - python_model_found = true; - type_ = ModelType::DEFAULT; - } - // Initializing here for consistency with platform model case. - model_dir_ = model_path.substr(0, model_path.find_last_of("\\/")); - } - - if (!python_model_found) { - if (platform != "NONE") { - throw PythonBackendException( - ("Python model file not found in neither \'" + platform_model_path + - "\' nor \'" + model_path + "\'")); - } else { + if (stat(python_model_path_.c_str(), &buffer) != 0) { throw PythonBackendException( ("Python model file not found in \'" + model_path + "\'")); } } + model_dir_ = model_path.substr(0, model_path.find_last_of("\\/")); python_backend_folder_ = triton_install_path; model_version_ = model_version; - platform_ = platform; + runtime_modeldir_ = runtime_modeldir; } void @@ -1740,9 +1713,10 @@ ModelContext::StubSetup(py::module& sys) sys = py::module_::import( (std::string(model_version_) + "." + model_name_trimmed).c_str()); } else { - std::string platform_model_dir( - python_backend_folder_ + "/platform_handlers/" + platform_ + "/"); - sys.attr("path").attr("append")(platform_model_dir); + std::string model_path_parent = + python_model_path_.substr(0, python_model_path_.find_last_of("/")); + std::string backend_model_dir(model_path_parent); + sys.attr("path").attr("append")(backend_model_dir); sys.attr("path").attr("append")(python_backend_folder_); sys = py::module_::import(model_name_trimmed.c_str()); } @@ -1791,14 +1765,14 @@ main(int argc, char** argv) int64_t shm_growth_size = std::stol(argv[4]); std::string triton_install_path = argv[6]; std::string name = argv[8]; - std::string platform = argv[9]; + std::string runtime_modeldir = argv[9]; std::unique_ptr& stub = Stub::GetOrCreateInstance(); try { stub->Instantiate( shm_growth_size, shm_default_size, shm_region_name, model_path, model_version, argv[6] /* triton install path */, - std::stoi(argv[7]) /* IPCControl handle */, name, platform); + std::stoi(argv[7]) /* IPCControl handle */, name, runtime_modeldir); } catch (const PythonBackendException& pb_exception) { LOG_INFO << "Failed to preinitialize Python stub: " << pb_exception.what(); diff --git a/src/pb_stub.h b/src/pb_stub.h index d52196e1..94b4d8a1 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -180,9 +180,15 @@ class ModelContext { std::string model_dir_; std::string model_version_; std::string python_backend_folder_; - std::string platform_; - - enum ModelType { DEFAULT, PLATFORM }; + std::string runtime_modeldir_; + + // Triton supports python-based backends, + // i.e. 
backends that provide common `model.py`, that can be re-used + // between different models. `ModelType` helps to differentiate + // between models running with c++ python backend (ModelType::DEFAULT) + // and models running with python-based backend (ModelType::BACKEND) + // at the time of ModelContext::StubSetup to properly set up paths. + enum ModelType { DEFAULT, BACKEND }; ModelType type_; }; @@ -210,7 +216,8 @@ class Stub { const std::string& shm_region_name, const std::string& model_path, const std::string& model_version, const std::string& triton_install_path, bi::managed_external_buffer::handle_t ipc_control_handle, - const std::string& model_instance_name, const std::string& platform); + const std::string& model_instance_name, + const std::string& runtime_modeldir); /// Get the health of the stub process. bool& Health(); diff --git a/src/python_be.cc b/src/python_be.cc index 7f46d473..db979562 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -1771,11 +1771,12 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) python_execution_env_ = ""; force_cpu_only_input_tensors_ = true; decoupled_ = false; - platform_ = ""; void* bstate; THROW_IF_BACKEND_MODEL_ERROR(TRITONBACKEND_BackendState(backend, &bstate)); backend_state_ = reinterpret_cast(bstate); + + runtime_modeldir_ = backend_state_->runtime_modeldir; triton::common::TritonJson::Value params; common::TritonJson::Value model_config; if (model_config_.Find("parameters", ¶ms)) { @@ -1812,14 +1813,6 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) } } - triton::common::TritonJson::Value platform; - if (model_config_.Find("platform", &platform)) { - auto error = platform.AsString(&platform_); - if (error != nullptr) { - throw BackendModelException(error); - } - } - // Skip the FORCE_CPU_ONLY_INPUT_TENSORS variable if it doesn't exits. std::string force_cpu_only_input_tensor; error = nullptr; @@ -1948,8 +1941,11 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) backend_state->shm_message_queue_size = 1000; backend_state->number_of_instance_inits = 0; backend_state->thread_pool_size = 32; + // Initialize shared memory region prefix to include backend's name + // to avoid collision between python backend and python-based backends. backend_state->shared_memory_region_prefix = - "triton_python_backend_shm_region_"; + "triton_" + name + "_backend_shm_region_"; + std::string default_backend_dir_string; if (backend_config.Find("cmdline", &cmdline)) { triton::common::TritonJson::Value shm_growth_size; @@ -2059,6 +2055,12 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INVALID_ARG, ia.what()); } } + + triton::common::TritonJson::Value default_backend_dir; + if (cmdline.Find("backend-directory", &default_backend_dir)) { + RETURN_IF_ERROR( + default_backend_dir.AsString(&default_backend_dir_string)); + } } LOG_MESSAGE( @@ -2076,7 +2078,50 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) TRITONBACKEND_ArtifactType artifact_type; RETURN_IF_ERROR( TRITONBACKEND_BackendArtifacts(backend, &artifact_type, &location)); - backend_state->python_lib = location; + + // Check if `triton_python_backend_stub` and `triton_python_backend_utils.py` + // are located under `location`. + // DLIS-5596: Add forward slash to be platform agnostic + // (i.e. For Windows, we need to use backward slash). 
+ std::string default_python_backend_dir = + default_backend_dir_string + "/python"; + std::string backend_stub_path = + std::string(location) + "/triton_python_backend_stub"; + std::string backend_utils = + std::string(location) + "/triton_python_backend_utils.py"; + // Both, stub and utils should be in the same location + if (FileExists(backend_stub_path) && FileExists(backend_utils)) { + backend_state->python_lib = location; + // If `location` is default location of a python backend, + // then we are using default python backend. + if (default_python_backend_dir == std::string(location)) { + backend_state->runtime_modeldir = ""; + } else { + // If `location` is not default location of a python backend, + // then we are using a python backend based backend and model.py stored + // in the received location. + backend_state->runtime_modeldir = location; + } + } else { + // If stub and utils are not found in received `location`, + // then we are using a python backend based backend and stub and utils are + // stored in the default python backend location. + if (!default_backend_dir_string.empty()) { + std::string backend_stub_path = + default_backend_dir_string + "/python/triton_python_backend_stub"; + if (!FileExists(backend_stub_path)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_NOT_FOUND, + (std::string("triton_python_backend_stub") + + " is not found. Searched paths: " + default_backend_dir_string + + "/python and" + std::string(location)) + .c_str()); + } + } + backend_state->runtime_modeldir = location; + backend_state->python_lib = default_backend_dir_string + "/python"; + } + backend_state->env_manager = std::make_unique(); RETURN_IF_ERROR(TRITONBACKEND_BackendSetState( diff --git a/src/python_be.h b/src/python_be.h index 4c8d702f..51793125 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -218,6 +218,7 @@ struct BackendState { std::string shared_memory_region_prefix; int64_t thread_pool_size; std::unique_ptr env_manager; + std::string runtime_modeldir; }; class ModelState : public BackendModel { @@ -237,8 +238,8 @@ class ModelState : public BackendModel { // Is decoupled API being used. bool IsDecoupled() { return decoupled_; } - // Returns the value in the platform field - std::string Platform() { return platform_; } + // Returns the value in the `runtime_modeldir_` field + std::string RuntimeModelDir() { return runtime_modeldir_; } // Launch auto-complete stub process. TRITONSERVER_Error* LaunchAutoCompleteStubProcess(); @@ -255,7 +256,7 @@ class ModelState : public BackendModel { std::string python_execution_env_; bool force_cpu_only_input_tensors_; bool decoupled_; - std::string platform_; + std::string runtime_modeldir_; std::unique_ptr auto_complete_stub_; }; diff --git a/src/resources/platform_handlers/pytorch/model.py b/src/resources/platform_handlers/pytorch/model.py deleted file mode 100755 index 365599e0..00000000 --- a/src/resources/platform_handlers/pytorch/model.py +++ /dev/null @@ -1,323 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. 
-# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import importlib -import json -import os - -try: - import torch -except ModuleNotFoundError as error: - raise RuntimeError( - "Missing/Incomplete PyTorch package installation... (Did you install PyTorch?)" - ) from error - -# triton_python_backend_utils is available in every Triton Python model. You -# need to use this module to create inference requests and responses. It also -# contains some utility functions for extracting information from model_config -# and converting Triton input/output types to numpy types. -import triton_python_backend_utils as pb_utils - - -def _get_model_path(config): - filenames = ["model.py", "model.pt"] - if config["default_model_filename"]: - filenames.insert(0, config["default_model_filename"]) - for filename in filenames: - model_path = os.path.join(pb_utils.get_model_dir(), filename) - if os.path.exists(model_path): - return model_path - raise pb_utils.TritonModelException( - "No model found in " + pb_utils.get_model_dir() + "/" + str(filenames) - ) - - -def _get_model_data_path(model_path): - data_path_extensions = [".pt"] - model_path_no_extension = model_path[: -(len(model_path.split(".")[-1]) + 1)] - for extension in data_path_extensions: - data_path = model_path_no_extension + extension - if os.path.exists(data_path): - return data_path - # data file not provided - return "" - - -def _is_py_class_model(model_path): - return model_path[-3:] == ".py" - - -def _import_module_from_path(module_name, file_path): - spec = importlib.util.spec_from_file_location(module_name, file_path) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module - - -def _get_model_class_from_module(module): - names = dir(module) - for name in names: - attr = getattr(module, name) - try: - if issubclass(attr, torch.nn.Module): - return attr - except TypeError: - # attr may not be a class - pass - raise pb_utils.TritonModelException("Cannot find a subclass of torch.nn.Module") - - -def _parse_io_config(io_config): - io = [] - for conf in io_config: - io.append({"name": conf["name"]}) - return io - - -def _get_device_name(kind, device_id): - if kind == "GPU": - return "cuda:" + device_id - if kind == "CPU": - return "cpu" - # unspecified device - return "" - - -def _get_device(kind, device_id, model): - device_name = _get_device_name(kind, device_id) - if device_name == "": 
- for param in model.parameters(): - return param.device - raise pb_utils.TritonModelException("Cannot determine model device") - return torch.device(device_name) - - -def _set_torch_parallelism(config): - log_msg = "" - parallelism_settings = ["NUM_THREADS", "NUM_INTEROP_THREADS"] - for setting in parallelism_settings: - val = "1" - if setting in config["parameters"]: - val = config["parameters"][setting]["string_value"] - getattr(torch, "set_" + setting.lower())(int(val)) - log_msg += setting + " = " + val + "; " - return log_msg - - -def _get_torch_compile_params(config): - params = {} - if "TORCH_COMPILE_OPTIONAL_PARAMETERS" in config["parameters"]: - val = config["parameters"]["TORCH_COMPILE_OPTIONAL_PARAMETERS"]["string_value"] - params = json.loads(val) - if "model" in params: - raise pb_utils.TritonModelException( - "'model' is not an optional parameter for 'torch.compile'" - ) - return params - - -def _gather_torch_tensors(scatter_tensors): - gather_tensors = [] - sections = [] - for i in range(len(scatter_tensors)): - tensors = scatter_tensors[i] - for j in range(len(tensors)): - tensor = tensors[j] - if j < len(gather_tensors): - # add to existing tensor - gather_tensors[j] = torch.cat((gather_tensors[j], tensor), 0) - else: - # start a new tensor - gather_tensors.append(tensor) - # record section - section_length = tensors[0].size()[0] - sections.append(section_length) - return gather_tensors, sections - - -def _scatter_torch_tensors(gather_tensors, sections): - scatter_tensors = [] - for j in range(len(gather_tensors)): - scatter_tensor = torch.split(gather_tensors[j], sections) - for i in range(len(scatter_tensor)): - tensor = scatter_tensor[i] - if i < len(scatter_tensors): - # add to existing response - scatter_tensors[i].append(tensor) - else: - # start a new response - scatter_tensors.append([tensor]) - return scatter_tensors - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - - Parameters - ---------- - args : dict - Both keys and values are strings. 
The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - self._model_name = args["model_name"] - for_model = "for '" + self._model_name + "'" - self._logger = pb_utils.Logger - self._logger.log_info("Initializing model instance " + for_model) - - self._model_config = json.loads(args["model_config"]) - self._kind = args["model_instance_kind"] - self._device_id = args["model_instance_device_id"] - self._support_batching = self._model_config["max_batch_size"] > 0 - self._inputs = _parse_io_config(self._model_config["input"]) - self._outputs = _parse_io_config(self._model_config["output"]) - - setting_msg = _set_torch_parallelism(self._model_config) - self._logger.log_verbose( - "Torch parallelism settings " + for_model + ": " + setting_msg - ) - - self._infer_mode = torch.inference_mode(mode=True) - self._infer_mode.__enter__() - - params = _get_torch_compile_params(self._model_config) - self._logger.log_verbose( - "'torch.compile' optional parameter(s) " + for_model + ": " + str(params) - ) - if self._support_batching: - self._gather = torch.compile(_gather_torch_tensors, **params) - self._scatter = torch.compile(_scatter_torch_tensors, **params) - - model_path = _get_model_path(self._model_config) - if not _is_py_class_model(model_path): - self._logger.log_info("Loading '" + self._model_name + "' as TorchScript") - self._model = torch.jit.load(model_path) - self._device = _get_device(self._kind, self._device_id, self._model) - self._model.to(self._device) - self._model.eval() - return - - self._model_module = _import_module_from_path(self._model_name, model_path) - self._model_class = _get_model_class_from_module(self._model_module) - self._raw_model = self._model_class() - self._device = _get_device(self._kind, self._device_id, self._raw_model) - data_path = _get_model_data_path(model_path) - if data_path != "": - self._raw_model.load_state_dict( - torch.load(data_path, map_location=self._device) - ) - else: - self._logger.log_info("Model parameter file not found " + for_model) - self._raw_model.to(self._device) - self._raw_model.eval() - self._model = torch.compile(self._raw_model, **params) - - def execute(self, requests): - """`execute` MUST be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference request is made - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse - - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - - Returns - ------- - list - A list of pb_utils.InferenceResponse. 
The length of this list must - be the same as `requests` - """ - - responses = [] - - requests_tensors = [] - for request in requests: - tensors = [] - for io in self._inputs: - tensor = pb_utils.get_input_tensor_by_name( - request, io["name"] - ).to_dlpack() - tensor = torch.from_dlpack(tensor).to(self._device) - tensors.append(tensor) - requests_tensors.append(tensors) - - sections = None - if self._support_batching: - requests_tensors, sections = self._gather(requests_tensors) - requests_tensors = [requests_tensors] - - responses_tensors = [] - for input_tensors in requests_tensors: - output_tensors = self._model(*input_tensors) - if not isinstance(output_tensors, tuple) and not isinstance( - output_tensors, list - ): - output_tensors = [output_tensors] - responses_tensors.append(output_tensors) - - if self._support_batching: - responses_tensors = self._scatter(responses_tensors[0], sections) - - for response_tensors in responses_tensors: - output_tensors = [] - for i in range(len(self._outputs)): - io = self._outputs[i] - tensor = response_tensors[i].detach() - tensor = pb_utils.Tensor.from_dlpack(io["name"], tensor) - output_tensors.append(tensor) - inference_response = pb_utils.InferenceResponse( - output_tensors=output_tensors - ) - responses.append(inference_response) - - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is OPTIONAL. This function allows - the model to perform any necessary clean ups before exit. - """ - self._logger.log_info("Removing model instance for '" + self._model_name + "'") - self._infer_mode.__exit__(exc_type=None, exc_value=None, traceback=None) diff --git a/src/resources/platform_handlers/tensorflow_savedmodel/README.md b/src/resources/platform_handlers/tensorflow_savedmodel/README.md deleted file mode 100644 index 23199e7b..00000000 --- a/src/resources/platform_handlers/tensorflow_savedmodel/README.md +++ /dev/null @@ -1,87 +0,0 @@ - - -# Serving Tensorflow SavedModels using Python Backend \[Experimental\] - -*NOTE*: This feature is subject to change and removal, and should not -be used in production. - -Starting from 23.07, we are adding experimental support for loading -and serving of models in [TensorFlow SavedModel](https://www.tensorflow.org/guide/saved_model) -format via Python backend. The `model.savedmodel` can be provided within -the triton server model repository without `model.py` and backend will -automatically use a pre-built python model (`model.py`)[model.py] to load -and serve provided TF SavedModel. The handler can [auto-complete](../../../../README.md#auto_complete_config) -the missing model configuration. - -The model repository structure can look like: - -``` -model_repository/ -`-- resnet_v1_50_savedmodel - |-- 1 - | `-- model.savedmodel - | |-- saved_model.pb - | `-- variables - |-- config.pbtxt - `-- resnet50_labels.txt -``` - -In order to use this feature, make sure that [TensorFlow pip package](https://pypi.org/project/tensorflow/2.13.0/) -is available in the same Python environment. - -``` -pip install tensorfow==2.13.0 -``` - -Alternatively, you can create a -[Python Execution Environment](#using-custom-python-execution-environments) -with the TensorFlow dependency. - -By default, Triton will use the [TensorFlow backend](https://github.com/triton-inference-server/tensorflow_backend) -to load and serve the saved model. 
In order to use the Python backend with -TensorFlow SavedModel, [model configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md) -should explicitly provide the following settings: - -``` -backend: "python" -platform: "tensorflow_savedmodel" -``` - -It has been observed that certain DLFW like TensorFlow do not release the entire -memory allocated for loading a model back to the system when the model gets -unloaded. This can be problematic when working with a large number of models and -dynamically loading/unloading them. Using Python backend for TF SavedModel serving -will allow the models to be loaded in a separate process, which ensures that entire -memory allocated within the process would be released to the system upon a model -unload. - -Following are few known limitations of this feature: -- GPU execution is not supported. -- List of requests received in model [`execute`](../../../../README.md#execute) function are -not run in a single batch but one after the other. diff --git a/src/resources/platform_handlers/tensorflow_savedmodel/model.py b/src/resources/platform_handlers/tensorflow_savedmodel/model.py deleted file mode 100644 index 24b95472..00000000 --- a/src/resources/platform_handlers/tensorflow_savedmodel/model.py +++ /dev/null @@ -1,536 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json -import os - -try: - import tensorflow as tf - from tensorflow.core.framework import types_pb2 - from tensorflow.python.client import session - from tensorflow.python.saved_model import loader, signature_constants - from tensorflow.python.tools import saved_model_utils -except ModuleNotFoundError as error: - raise RuntimeError( - "Missing/Incomplete tensorflow package installation..." - ) from error - -# triton_python_backend_utils is available in every Triton Python model. You -# need to use this module to create inference requests and responses. 
It also -# contains some utility functions for extracting information from model_config -# and converting Triton input/output types to numpy types. -import triton_python_backend_utils as pb_utils - -TF_STRING_TO_TRITON = { - "DT_BOOL": "TYPE_BOOL", - "DT_UINT8": "TYPE_UINT8", - "DT_UINT16": "TYPE_UINT16", - "DT_UINT32": "TYPE_UINT32", - "DT_UINT64": "TYPE_UINT64", - "DT_INT8": "TYPE_INT8", - "DT_INT16": "TYPE_INT16", - "DT_INT32": "TYPE_INT32", - "DT_INT64": "TYPE_INT64", - "DT_HALF": "TYPE_FP16", - "DT_FLOAT": "TYPE_FP32", - "DT_DOUBLE": "TYPE_FP64", - "DT_STRING": "TYPE_STRING", -} - -_DEFAULT_ARTIFACT_NAME = "model.savedmodel" - - -def _get_savedmodel_path(config): - artifact_name = config["default_model_filename"] - if not artifact_name: - artifact_name = _DEFAULT_ARTIFACT_NAME - - savedmodel_path = os.path.join(pb_utils.get_model_dir(), artifact_name) - if not os.path.exists(savedmodel_path): - raise pb_utils.TritonModelException( - f"No savedmodel dir found in " + savedmodel_path - ) - - return savedmodel_path - - -def _parse_signature_def(config): - if config["parameters"]: - if "TF_SIGNATURE_DEF" in config["parameters"].keys(): - return config["parameters"]["TF_SIGNATURE_DEF"]["string_value"] - return None - - -def _parse_graph_tag(config): - if config["parameters"]: - if "TF_GRAPH_TAG" in config["parameters"].keys(): - return config["parameters"]["TF_GRAPH_TAG"]["string_value"] - return None - - -def _parse_num_intra_threads(config): - if config["parameters"]: - if "TF_NUM_INTRA_THREADS" in config["parameters"].keys(): - return int(config["parameters"]["TF_NUM_INTRA_THREADS"]["string_value"]) - return None - - -def _parse_num_inter_threads(config): - if config["parameters"]: - if "TF_NUM_INTER_THREADS" in config["parameters"].keys(): - return int(config["parameters"]["TF_NUM_INTER_THREADS"]["string_value"]) - return None - - -def _get_truth_value(string_value): - val = string_value.casefold() - if val == "yes" or val == "1" or val == "on" or val == "true": - return True - else: - return False - - -def _parse_use_per_session_thread(config): - if config["parameters"]: - if "USE_PER_SESSION_THREAD" in config["parameters"].keys(): - val = config["parameters"]["USE_PER_SESSION_THREAD"]["string_value"] - return _get_truth_value(val) - return False - - -def _get_signature_def(savedmodel_path, config): - tag_sets = saved_model_utils.get_saved_model_tag_sets(savedmodel_path) - graph_tag = _parse_graph_tag(config) - if graph_tag is None: - if "serve" in tag_sets[0]: - graph_tag = "serve" - else: - graph_tag = tag_sets[0][0] - - meta_graph_def = saved_model_utils.get_meta_graph_def(savedmodel_path, graph_tag) - signature_def_map = meta_graph_def.signature_def - signature_def_k = _parse_signature_def(config) - if signature_def_k is None: - serving_default = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY - if serving_default in signature_def_map.keys(): - signature_def_k = serving_default - else: - signature_def_k = signature_def_map.keys()[0] - - if signature_def_k not in signature_def_map.keys(): - raise pb_utils.TritonModelException( - f" The model does not include the signature_def '" + signature_def_k + "'" - ) - - return graph_tag, signature_def_map[signature_def_k] - - -def _has_batch_dim(tensor_info): - if tensor_info.tensor_shape.unknown_rank: - return True - elif tensor_info.tensor_shape.dim[0].size == -1: - return True - else: - return False - - -def _get_batching_hint_from_signature(signature_def): - for input_info in signature_def.inputs.values(): - if not 
_has_batch_dim(input_info): - return False - - for output_info in signature_def.outputs.values(): - if not _has_batch_dim(output_info): - return False - - return True - - -def _convert_proto_to_dict_tensor(name, tensor_proto, batching_enabled): - tensor_dict = {} - tensor_dict["name"] = name - dtype_dict = {value: key for (key, value) in types_pb2.DataType.items()} - tensor_dict["data_type"] = TF_STRING_TO_TRITON[dtype_dict[tensor_proto.dtype]] - if tensor_proto.tensor_shape.unknown_rank: - # FIXME: Fix the handling of unknown rank - dims = [-1] - else: - dims = [dim.size for dim in tensor_proto.tensor_shape.dim] - if batching_enabled: - tensor_dict["dims"] = dims[1:] - else: - tensor_dict["dims"] = dims - - return tensor_dict - - -def _validate_datatype(tf_dtype, triton_datatype, tensor_name): - dtype_dict = {value: key for (key, value) in types_pb2.DataType.items()} - if triton_datatype != TF_STRING_TO_TRITON[dtype_dict[tf_dtype]]: - raise pb_utils.TritonModelException( - f" Mismatch between datatype for tensor '" - + tensor_name - + "', expected '" - + TF_STRING_TO_TRITON[dtype_dict[tf_dtype]] - + "', got '" - + triton_datatype - ) - - -def _validate_dims(tf_shape, triton_dims, batching_enabled, tensor_name): - if tf_shape.unknown_rank: - return - - index = 0 - offset = 1 if batching_enabled else 0 - if len(tf_shape.dim) != (offset + len(triton_dims)): - raise pb_utils.TritonModelException( - f" Mismatch in the number of dimension with the model for tensor '" - + tensor_name - + "', expected " - + str(len(tf_shape.dim) - offset) - + ", got " - + str(len(triton_dims)) - ) - - for dim in tf_shape.dim: - if index == 0 and batching_enabled: - if dim.size != -1: - raise pb_utils.TritonModelException( - f" The first dimension of a batching model should be dynamic, " - "however, got shape of first dimension in model for tensor '" - + tensor_name - + "' as " - + str(dim.size) - ) - else: - if dim.size != triton_dims[index - offset]: - raise pb_utils.TritonModelException( - f" Mismatch in " - + str(index - offset) - + "th dimension for tensor '" - + tensor_name - + "', expected " - + str(dim.size) - + ", got " - + str(triton_dims[index - offset]) - ) - index = index + 1 - - -def _validate_model_config(model_config, signature_def): - signature_supports_batching = _get_batching_hint_from_signature(signature_def) - if (not signature_supports_batching) and (model_config["max_batch_size"] != 0): - raise pb_utils.TritonModelException( - f" The model signature does not support batching, yet model config" - " has max_batch_size set to '" + str(model_config["max_batch_size"]) + "'" - ) - - batching_enabled = model_config["max_batch_size"] != 0 - - if model_config["platform"] != "tensorflow_savedmodel": - raise pb_utils.TritonModelException( - f"[INTERNAL]: The platform field for using this model should be set to" - " 'tensorflow_savedmodel' in model config, got '" - + model_config["platform"] - + "'" - ) - if model_config["batch_input"]: - raise pb_utils.TritonModelException( - f"The platform model '" - + model_config["platform"] - + "' does not support model with batch_input" - ) - if model_config["batch_output"]: - raise pb_utils.TritonModelException( - f"The platform model '" - + model_config["platform"] - + "' does not support model with batch_output" - ) - - # Validate input tensors - input_tensor_info = signature_def.inputs - config_input_names = [input["name"] for input in model_config["input"]] - for input_name in input_tensor_info.keys(): - if input_name not in config_input_names: - raise 
pb_utils.TritonModelException( - f" Missing input tensor configuration for tensor '" + input_name + "'" - ) - for input in model_config["input"]: - config_input_name = input["name"] - if config_input_name not in input_tensor_info.keys(): - supported_names = "" - for valid_name in input_tensor_info.keys(): - supported_names = supported_names + ";" + valid_name - raise pb_utils.TritonModelException( - f" No input tensor with name '" - + config_input_name - + "', only supported input names are " - + supported_names - ) - _validate_datatype( - input_tensor_info[config_input_name].dtype, - input["data_type"], - config_input_name, - ) - _validate_dims( - input_tensor_info[config_input_name].tensor_shape, - input["dims"], - batching_enabled, - config_input_name, - ) - - # Validate output tensors - output_tensor_info = signature_def.outputs - for output in model_config["output"]: - config_output_name = output["name"] - if config_output_name not in output_tensor_info.keys(): - supported_names = "" - for valid_name in output_tensor_info.keys(): - supported_names = supported_names + ";" + valid_name - raise pb_utils.TritonModelException( - f" No output tensor with name '" - + config_output_name - + "', only supported output names are " - + supported_names - ) - - _validate_datatype( - output_tensor_info[config_output_name].dtype, - output["data_type"], - config_output_name, - ) - _validate_dims( - output_tensor_info[config_output_name].tensor_shape, - output["dims"], - batching_enabled, - config_output_name, - ) - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - @staticmethod - def auto_complete_config(auto_complete_model_config): - config = auto_complete_model_config.as_dict() - - if config["platform"] != "tensorflow_savedmodel": - raise pb_utils.TritonModelException( - f"[INTERNAL]: The platform field for using this model should be set to" - " 'tensorflow_savedmodel' in model config, got '" - + config["platform"] - + "'" - ) - if config["batch_input"]: - raise pb_utils.TritonModelException( - f"The platform model '" - + config["platform"] - + "' does not support model with batch_input" - ) - if config["batch_output"]: - raise pb_utils.TritonModelException( - f"The platform model '" - + config["platform"] - + "' does not support model with batch_output" - ) - - savedmodel_path = _get_savedmodel_path(config) - - if savedmodel_path is None: - raise pb_utils.TritonModelException( - f"[INTERNAL]: The path to the framework model should be" " provided" - ) - - batching_enabled = False - if config["max_batch_size"] != 0: - batching_enabled = True - - _, signature_def = _get_signature_def(savedmodel_path, config) - - input_tensor_info = signature_def.inputs - output_tensor_info = signature_def.outputs - - batching_hint = False - if not batching_enabled: - batching_hint = _get_batching_hint_from_signature(signature_def) - - # FIXME: Currently the presence of dynamic batch dimension is - # being treated as sufficient proof for enabling batching. - # Need to visit the tensors that are already provided in config - # to confirm the hint - batching_enabled = batching_hint - - config_input_names = [input["name"] for input in config["input"]] - config_output_names = [output["name"] for output in config["output"]] - - # TODO: Add auto-completion of partial tensor specification. 
- for input_name in input_tensor_info.keys(): - if input_name not in config_input_names: - auto_complete_model_config.add_input( - _convert_proto_to_dict_tensor( - input_name, input_tensor_info[input_name], batching_enabled - ) - ) - - for output_name in output_tensor_info.keys(): - if output_name not in config_output_names: - auto_complete_model_config.add_output( - _convert_proto_to_dict_tensor( - output_name, output_tensor_info[output_name], batching_enabled - ) - ) - - if batching_enabled: - if config["max_batch_size"] == 0: - auto_complete_model_config.set_max_batch_size(4) - auto_complete_model_config.set_dynamic_batching() - - return auto_complete_model_config - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # You must parse model_config. JSON string is not parsed here - self.model_config = model_config = json.loads(args["model_config"]) - - savedmodel_path = _get_savedmodel_path(model_config) - - self.model_name = args["model_name"] - self.logger = pb_utils.Logger - self.logger.log_info("Initializing model for " + self.model_name) - - if args["model_instance_kind"] != "CPU": - self.logger.log_warn( - "GPU instances are not supported by this backend. Falling back to KIND_CPU for " - + self.model_name - ) - - tag_set, signature_def = _get_signature_def(savedmodel_path, model_config) - _validate_model_config(model_config, signature_def) - - self.signature_def = signature_def - self.input_tensor_info = self.signature_def.inputs - output_tensor_info = self.signature_def.outputs - - # Get the input output names from model config - self.input_names = [input["name"] for input in model_config["input"]] - self.output_names = [output["name"] for output in model_config["output"]] - - # Get the output tensor names - self.output_tensor_names = [ - output_tensor_info[output_name].name for output_name in self.output_names - ] - - # load the session model - # FIXME Add more configuration options for the model. - sess_config = tf.compat.v1.ConfigProto( - inter_op_parallelism_threads=_parse_num_inter_threads(model_config), - intra_op_parallelism_threads=_parse_num_intra_threads(model_config), - use_per_session_threads=_parse_use_per_session_thread(model_config), - ) - self.tf_session = session.Session(graph=tf.Graph(), config=sess_config) - loader.load(self.tf_session, [tag_set], savedmodel_path) - - # Hoding the input dict for caching input tensor data for - # better inference performance - self.input_feed_dict = {} - - def execute(self, requests): - """`execute` MUST be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference request is made - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. 
Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse - - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # FIXME: Instead of iterating through each request, run - # the inference as a single batch. - for request in requests: - # Prepare the input feed for the model. - for input_name in self.input_names: - self.input_feed_dict[ - self.input_tensor_info[input_name].name - ] = pb_utils.get_input_tensor_by_name(request, input_name).as_numpy() - - # FIXME: Add GPU Tensor handling. DLpack should be utilized - # for better performance - outputs = self.tf_session.run( - self.output_tensor_names, feed_dict=self.input_feed_dict - ) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. - output_tensors = [] - for i, output in enumerate(outputs): - output_tensors.append(pb_utils.Tensor(self.output_names[i], output)) - - inference_response = pb_utils.InferenceResponse( - output_tensors=output_tensors - ) - responses.append(inference_response) - - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is OPTIONAL. This function allows - the model to perform any necessary clean ups before exit. - """ - if self.tf_session is not None: - self.tf_session.close - self.logger.log_info("Removed model instance for " + self.model_name) diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index de4dd46c..a38409ec 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -62,9 +62,9 @@ StubLauncher::Initialize(ModelState* model_state) model_state->ModelConfig().Write(&model_config_buffer_); is_decoupled_ = model_state->IsDecoupled(); model_repository_path_ = model_state->RepositoryPath(); - platform_ = model_state->Platform(); - if (platform_.empty()) { - platform_ = "NONE"; + runtime_modeldir_ = model_state->RuntimeModelDir(); + if (runtime_modeldir_.empty()) { + runtime_modeldir_ = "DEFAULT"; } // Atomically increase and read the stub process count to avoid shared memory @@ -238,7 +238,8 @@ StubLauncher::Launch() << ":$LD_LIBRARY_PATH " << python_backend_stub << " " << model_path_ << " " << shm_region_name_ << " " << shm_default_byte_size_ << " " << shm_growth_byte_size_ << " " << parent_pid_ << " " << python_lib_ - << " " << ipc_control_handle_ << " " << stub_name << " " << platform_; + << " " << ipc_control_handle_ << " " << stub_name << " " + << runtime_modeldir_; ipc_control_->uses_env = true; bash_argument = ss.str(); } else { @@ -246,7 +247,8 @@ StubLauncher::Launch() ss << " exec " << python_backend_stub << " " << model_path_ << " " << shm_region_name_ << " " << shm_default_byte_size_ << " " << shm_growth_byte_size_ << " " << parent_pid_ << " " << python_lib_ - << " " << ipc_control_handle_ << " " << stub_name << " " << platform_; + << " " << ipc_control_handle_ << " " << stub_name << " " + << runtime_modeldir_; bash_argument = ss.str(); } LOG_MESSAGE( diff --git a/src/stub_launcher.h b/src/stub_launcher.h index 89f35422..3bbd2463 100644 --- a/src/stub_launcher.h +++ b/src/stub_launcher.h @@ -161,7 +161,7 @@ class StubLauncher { std::string shm_region_name_; std::string model_repository_path_; std::string 
model_path_; - std::string platform_; + std::string runtime_modeldir_; const std::string stub_process_kind_; std::string model_name_; const std::string model_instance_name_; From 4c4a552b047ff00ca8c6b87ba1fe4ac8f83eaf24 Mon Sep 17 00:00:00 2001 From: dyastremsky <58150256+dyastremsky@users.noreply.github.com> Date: Sat, 7 Oct 2023 20:19:58 -0700 Subject: [PATCH 150/216] Remove PyTorch platform handler. (#307) --- examples/pytorch_platform_handler/README.md | 109 -- examples/pytorch_platform_handler/client.py | 92 -- .../pytorch_platform_handler/config.pbtxt | 45 - examples/pytorch_platform_handler/model.py | 47 - .../resnet50_labels.txt | 1000 ----------------- 5 files changed, 1293 deletions(-) delete mode 100644 examples/pytorch_platform_handler/README.md delete mode 100755 examples/pytorch_platform_handler/client.py delete mode 100644 examples/pytorch_platform_handler/config.pbtxt delete mode 100755 examples/pytorch_platform_handler/model.py delete mode 100644 examples/pytorch_platform_handler/resnet50_labels.txt diff --git a/examples/pytorch_platform_handler/README.md b/examples/pytorch_platform_handler/README.md deleted file mode 100644 index 13e32249..00000000 --- a/examples/pytorch_platform_handler/README.md +++ /dev/null @@ -1,109 +0,0 @@ - - -# PyTorch Example - -In this section, we demonstrate an end-to-end example for using the -[PyTorch Platform \[Experimental\]](../../README.md#pytorch-platform-experimental) -to serve a PyTorch model directly, **without** needing to implement the -`TritonPythonModel` class. - -## Create a ResNet50 model repository - -We will use the files that come with this example to create the model -repository. - -First, download [client.py](client.py), [config.pbtxt](config.pbtxt), -[model.py](model.py), -[mug.jpg](https://raw.githubusercontent.com/triton-inference-server/server/main/qa/images/mug.jpg) -and [resnet50_labels.txt](resnet50_labels.txt) to your local machine. - -Next, at the directory where the downloaded files are saved at, create a model -repository with the following commands: -``` -$ mkdir -p models/resnet50_pytorch/1 -$ mv model.py models/resnet50_pytorch/1 -$ mv config.pbtxt models/resnet50_pytorch -``` - -## Pull the Triton Docker images - -We need to install Docker and NVIDIA Container Toolkit before proceeding, refer -to the -[installation steps](https://github.com/triton-inference-server/server/tree/main/docs#installation). - -To pull the latest containers, run the following commands: -``` -$ docker pull nvcr.io/nvidia/tritonserver:-py3 -$ docker pull nvcr.io/nvidia/tritonserver:-py3-sdk -``` -See the installation steps above for the `` version. - -For example, if the version is `23.08`, then: -``` -$ docker pull nvcr.io/nvidia/tritonserver:23.08-py3 -$ docker pull nvcr.io/nvidia/tritonserver:23.08-py3-sdk -``` - -Be sure to replace the `` with the version pulled for all the remaining -parts of this example. - -## Start the Triton Server - -At the directory where we created the PyTorch model (at where the "models" -folder is located), run the following command: -``` -$ docker run -it --rm --gpus all --shm-size 1g -p 8000:8000 -v `pwd`:/pytorch_example nvcr.io/nvidia/tritonserver:-py3 /bin/bash -``` - -Inside the container, we need to install PyTorch, Pillow and Requests to run this example. 
-We recommend using `pip` method for the installations, for example: -``` -$ pip3 install torch Pillow requests -``` - -Finally, we need to start the Triton Server, run the following command: -``` -$ tritonserver --model-repository=/pytorch_example/models -``` - -To leave the container for the next step, press: `CTRL + P + Q`. - -## Test inference - -At the directory where the client.py is located, run the following command: -``` -$ docker run --rm --net=host -v `pwd`:/pytorch_example nvcr.io/nvidia/tritonserver:-py3-sdk python3 /pytorch_example/client.py -``` - -A successful inference will print the following at the end: -``` -Result: COFFEE MUG -Expected result: COFFEE MUG -PASS: PyTorch platform handler -``` diff --git a/examples/pytorch_platform_handler/client.py b/examples/pytorch_platform_handler/client.py deleted file mode 100755 index ccd4624d..00000000 --- a/examples/pytorch_platform_handler/client.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import os -import sys - -import numpy as np -from PIL import Image -from tritonclient import http as httpclient -from tritonclient.utils import * - -script_directory = os.path.dirname(os.path.realpath(__file__)) - -server_url = "localhost:8000" -model_name = "resnet50_pytorch" -input_name = "INPUT" -output_name = "OUTPUT" -label_path = os.path.join(script_directory, "resnet50_labels.txt") -# The 'mug.jpg' image will be present at the script_directory if the steps on -# the provided README.md are followed. The image may also be found at -# '/workspace/images/mug.jpg' on the SDK container or -# '/opt/tritonserver/qa/images/mug.jpg' on the QA container. 
-image_path = os.path.join(script_directory, "mug.jpg") -expected_output_class = "COFFEE MUG" - - -def _load_input_image(): - raw_image = Image.open(image_path) - raw_image = raw_image.convert("RGB").resize((224, 224), Image.BILINEAR) - input_image = np.array(raw_image).astype(np.float32) - input_image = (input_image / 127.5) - 1 - input_image = np.transpose(input_image, (2, 0, 1)) - input_image = np.reshape(input_image, (1, 3, 224, 224)) - return input_image - - -def _infer(input_image): - with httpclient.InferenceServerClient(server_url) as client: - input_tensors = httpclient.InferInput(input_name, input_image.shape, "FP32") - input_tensors.set_data_from_numpy(input_image) - results = client.infer(model_name=model_name, inputs=[input_tensors]) - output_tensors = results.as_numpy(output_name) - return output_tensors - - -def _check_output(output_tensors): - with open(label_path) as f: - labels_dict = {idx: line.strip() for idx, line in enumerate(f)} - max_id = np.argmax(output_tensors, axis=1)[0] - output_class = labels_dict[max_id] - print("Result: " + output_class) - print("Expected result: " + expected_output_class) - if output_class != expected_output_class: - return False - return True - - -if __name__ == "__main__": - input_image = _load_input_image() - output_tensors = _infer(input_image) - result_valid = _check_output(output_tensors) - - if not result_valid: - print("PyTorch platform handler example error: Unexpected result") - sys.exit(1) - - print("PASS: PyTorch platform handler") diff --git a/examples/pytorch_platform_handler/config.pbtxt b/examples/pytorch_platform_handler/config.pbtxt deleted file mode 100644 index 70d99dad..00000000 --- a/examples/pytorch_platform_handler/config.pbtxt +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -name: "resnet50_pytorch" -backend: "python" -platform: "pytorch" - -max_batch_size: 128 - -input { - name: "INPUT" - data_type: TYPE_FP32 - format: FORMAT_NCHW - dims: [ 3, 224, 224 ] -} -output { - name: "OUTPUT" - data_type: TYPE_FP32 - dims: [ 1000 ] -} - -instance_group [{ kind: KIND_CPU }] diff --git a/examples/pytorch_platform_handler/model.py b/examples/pytorch_platform_handler/model.py deleted file mode 100755 index 391063b8..00000000 --- a/examples/pytorch_platform_handler/model.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import torch -import torchvision - - -class ResNet50(torch.nn.Module): - # This is a native PyTorch model class. `TritonPythonModel` is not needed. 
- - def __init__(self): - super().__init__() - self._model = torch.hub.load( - "pytorch/vision:v" + torchvision.__version__.split("+")[0], - "resnet50", - weights="ResNet50_Weights.IMAGENET1K_V2", - skip_validation=True, - ) - - def forward(self, input_tensor): - output_tensor = self._model(input_tensor) - return output_tensor diff --git a/examples/pytorch_platform_handler/resnet50_labels.txt b/examples/pytorch_platform_handler/resnet50_labels.txt deleted file mode 100644 index 2376a285..00000000 --- a/examples/pytorch_platform_handler/resnet50_labels.txt +++ /dev/null @@ -1,1000 +0,0 @@ -TENCH -GOLDFISH -WHITE SHARK -TIGER SHARK -HAMMERHEAD SHARK -ELECTRIC RAY -STINGRAY -ROOSTER -HEN -OSTRICH -BRAMBLING -GOLDFINCH -HOUSE FINCH -SNOWBIRD -INDIGO FINCH -ROBIN -BULBUL -JAY -MAGPIE -CHICKADEE -WATER OUZEL -KITE -BALD EAGLE -VULTURE -GREAT GREY OWL -FIRE SALAMANDER -NEWT -EFT -SPOTTED SALAMANDER -AXOLOTL -BULL FROG -TREE FROG -TAILED FROG -LOGGERHEAD -LEATHERBACK TURTLE -MUD TURTLE -TERRAPIN -BOX TURTLE -BANDED GECKO -COMMON IGUANA -AMERICAN CHAMELEON -WHIPTAIL -AGAMA -FRILLED LIZARD -ALLIGATOR LIZARD -GILA MONSTER -GREEN LIZARD -AFRICAN CHAMELEON -KOMODO DRAGON -AFRICAN CROCODILE -AMERICAN ALLIGATOR -TRICERATOPS -THUNDER SNAKE -RINGNECK SNAKE -HOGNOSE SNAKE -GREEN SNAKE -KING SNAKE -GARTER SNAKE -WATER SNAKE -VINE SNAKE -NIGHT SNAKE -BOA -ROCK PYTHON -COBRA -GREEN MAMBA -SEA SNAKE -HORNED VIPER -DIAMONDBACK -SIDEWINDER -TRILOBITE -HARVESTMAN -SCORPION -GARDEN SPIDER -BARN SPIDER -GARDEN SPIDER -BLACK WIDOW -TARANTULA -WOLF SPIDER -TICK -CENTIPEDE -GROUSE -PTARMIGAN -RUFFED GROUSE -PRAIRIE CHICKEN -PEACOCK -QUAIL -PARTRIDGE -AFRICAN GREY -MACAW -COCKATOO -LORIKEET -COUCAL -BEE EATER -HORNBILL -HUMMINGBIRD -JACAMAR -TOUCAN -DRAKE -MERGANSER -GOOSE -BLACK SWAN -TUSKER -ECHIDNA -PLATYPUS -WALLABY -KOALA -WOMBAT -JELLYFISH -SEA ANEMONE -BRAIN CORAL -FLATWORM -NEMATODE -CONCH -SNAIL -SLUG -SEA SLUG -CHITON -CHAMBERED NAUTILUS -DUNGENESS CRAB -ROCK CRAB -FIDDLER CRAB -KING CRAB -AMERICAN LOBSTER -SPINY LOBSTER -CRAYFISH -HERMIT CRAB -ISOPOD -WHITE STORK -BLACK STORK -SPOONBILL -FLAMINGO -LITTLE BLUE HERON -AMERICAN EGRET -BITTERN -CRANE -LIMPKIN -EUROPEAN GALLINULE -AMERICAN COOT -BUSTARD -RUDDY TURNSTONE -RED-BACKED SANDPIPER -REDSHANK -DOWITCHER -OYSTERCATCHER -PELICAN -KING PENGUIN -ALBATROSS -GREY WHALE -KILLER WHALE -DUGONG -SEA LION -CHIHUAHUA -JAPANESE SPANIEL -MALTESE DOG -PEKINESE -SHIH-TZU -BLENHEIM SPANIEL -PAPILLON -TOY TERRIER -RHODESIAN RIDGEBACK -AFGHAN HOUND -BASSET -BEAGLE -BLOODHOUND -BLUETICK -COONHOUND -WALKER HOUND -ENGLISH FOXHOUND -REDBONE -BORZOI -IRISH WOLFHOUND -ITALIAN GREYHOUND -WHIPPET -IBIZAN HOUND -NORWEGIAN ELKHOUND -OTTERHOUND -SALUKI -SCOTTISH DEERHOUND -WEIMARANER -STAFFORDSHIRE BULLTERRIER -STAFFORDSHIRE TERRIER -BEDLINGTON TERRIER -BORDER TERRIER -KERRY BLUE TERRIER -IRISH TERRIER -NORFOLK TERRIER -NORWICH TERRIER -YORKSHIRE TERRIER -WIRE-HAIRED FOX TERRIER -LAKELAND TERRIER -SEALYHAM TERRIER -AIREDALE -CAIRN -AUSTRALIAN TERRIER -DANDIE DINMONT -BOSTON BULL -MINIATURE SCHNAUZER -GIANT SCHNAUZER -STANDARD SCHNAUZER -SCOTCH TERRIER -TIBETAN TERRIER -SILKY TERRIER -WHEATEN TERRIER -WHITE TERRIER -LHASA -RETRIEVER -CURLY-COATED RETRIEVER -GOLDEN RETRIEVER -LABRADOR RETRIEVER -CHESAPEAKE BAY RETRIEVER -SHORT-HAIRED POINTER -VISLA -ENGLISH SETTER -IRISH SETTER -GORDON SETTER -BRITTANY SPANIEL -CLUMBER -ENGLISH SPRINGER -WELSH SPRINGER SPANIEL -COCKER SPANIEL -SUSSEX SPANIEL -IRISH WATERSPANIEL -KUVASZ -SCHIPPERKE -GROENENDAEL -MALINOIS -BRIARD -KELPIE -KOMONDOR -OLD 
ENGLISH SHEEPDOG -SHETLAND SHEEPDOG -COLLIE -BORDER COLLIE -BOUVIER DES FLANDRES -ROTTWEILER -GERMAN SHEPHERD -DOBERMAN -MINIATURE PINSCHER -GREATER SWISS MOUNTAIN DOG -BERNESE MOUNTAIN DOG -APPENZELLER -ENTLEBUCHER -BOXER -BULL MASTIFF -TIBETAN MASTIFF -FRENCH BULLDOG -GREAT DANE -SAINT BERNARD -ESKIMO DOG -MALAMUTE -SIBERIAN HUSKY -DALMATIAN -AFFENPINSCHER -BASENJI -PUG -LEONBERG -NEWFOUNDLAND -GREAT PYRENEES -SAMOYED -POMERANIAN -CHOW -KEESHOND -BRABANCON GRIFFON -PEMBROKE -CARDIGAN -TOY POODLE -MINIATURE POODLE -STANDARD POODLE -MEXICAN HAIRLESS -TIMBER WOLF -WHITE WOLF -RED WOLF -COYOTE -DINGO -DHOLE -AFRICAN HUNTING DOG -HYENA -RED FOX -KIT FOX -ARCTIC FOX -GREY FOX -TABBY -TIGER CAT -PERSIAN CAT -SIAMESE CAT -EGYPTIAN CAT -COUGAR -LYNX -LEOPARD -SNOW LEOPARD -JAGUAR -LION -TIGER -CHEETAH -BROWN BEAR -AMERICAN BLACK BEAR -ICE BEAR -SLOTH BEAR -MONGOOSE -MEERKAT -TIGER BEETLE -LADYBUG -GROUND BEETLE -LONG-HORNED BEETLE -LEAF BEETLE -DUNG BEETLE -RHINOCEROS BEETLE -WEEVIL -FLY -BEE -ANT -GRASSHOPPER -CRICKET -WALKING STICK -COCKROACH -MANTIS -CICADA -LEAFHOPPER -LACEWING -DRAGONFLY -DAMSELFLY -ADMIRAL -RINGLET -MONARCH -CABBAGE BUTTERFLY -SULPHUR BUTTERFLY -LYCAENID -STARFISH -SEA URCHIN -SEA CUCUMBER -WOOD RABBIT -HARE -ANGORA -HAMSTER -PORCUPINE -FOX SQUIRREL -MARMOT -BEAVER -GUINEA PIG -SORREL -ZEBRA -HOG -WILD BOAR -WARTHOG -HIPPOPOTAMUS -OX -WATER BUFFALO -BISON -RAM -BIGHORN -IBEX -HARTEBEEST -IMPALA -GAZELLE -ARABIAN CAMEL -LLAMA -WEASEL -MINK -POLECAT -BLACK-FOOTED FERRET -OTTER -SKUNK -BADGER -ARMADILLO -THREE-TOED SLOTH -ORANGUTAN -GORILLA -CHIMPANZEE -GIBBON -SIAMANG -GUENON -PATAS -BABOON -MACAQUE -LANGUR -COLOBUS -PROBOSCIS MONKEY -MARMOSET -CAPUCHIN -HOWLER MONKEY -TITI -SPIDER MONKEY -SQUIRREL MONKEY -MADAGASCAR CAT -INDRI -INDIAN ELEPHANT -AFRICAN ELEPHANT -LESSER PANDA -GIANT PANDA -BARRACOUTA -EEL -COHO -ROCK BEAUTY -ANEMONE FISH -STURGEON -GAR -LIONFISH -PUFFER -ABACUS -ABAYA -ACADEMIC GOWN -ACCORDION -ACOUSTIC GUITAR -AIRCRAFT CARRIER -AIRLINER -AIRSHIP -ALTAR -AMBULANCE -AMPHIBIAN -ANALOG CLOCK -APIARY -APRON -ASHCAN -ASSAULT RIFLE -BACKPACK -BAKERY -BALANCE BEAM -BALLOON -BALLPOINT -BAND AID -BANJO -BANNISTER -BARBELL -BARBER CHAIR -BARBERSHOP -BARN -BAROMETER -BARREL -BARROW -BASEBALL -BASKETBALL -BASSINET -BASSOON -BATHING CAP -BATH TOWEL -BATHTUB -BEACH WAGON -BEACON -BEAKER -BEARSKIN -BEER BOTTLE -BEER GLASS -BELL COTE -BIB -BICYCLE-BUILT-FOR-TWO -BIKINI -BINDER -BINOCULARS -BIRDHOUSE -BOATHOUSE -BOBSLED -BOLO TIE -BONNET -BOOKCASE -BOOKSHOP -BOTTLECAP -BOW -BOW TIE -BRASS -BRASSIERE -BREAKWATER -BREASTPLATE -BROOM -BUCKET -BUCKLE -BULLETPROOF VEST -BULLET TRAIN -BUTCHER SHOP -CAB -CALDRON -CANDLE -CANNON -CANOE -CAN OPENER -CARDIGAN -CAR MIRROR -CAROUSEL -CARPENTERS KIT -CARTON -CAR WHEEL -CASH MACHINE -CASSETTE -CASSETTE PLAYER -CASTLE -CATAMARAN -CD PLAYER -CELLO -CELLULAR TELEPHONE -CHAIN -CHAINLINK FENCE -CHAIN MAIL -CHAIN SAW -CHEST -CHIFFONIER -CHIME -CHINA CABINET -CHRISTMAS STOCKING -CHURCH -CINEMA -CLEAVER -CLIFF DWELLING -CLOAK -CLOG -COCKTAIL SHAKER -COFFEE MUG -COFFEEPOT -COIL -COMBINATION LOCK -COMPUTER KEYBOARD -CONFECTIONERY -CONTAINER SHIP -CONVERTIBLE -CORKSCREW -CORNET -COWBOY BOOT -COWBOY HAT -CRADLE -CRANE -CRASH HELMET -CREATE -CRIB -CROCK POT -CROQUET BALL -CRUTCH -CUIRASS -DAM -DESK -DESKTOP COMPUTER -DIAL TELEPHONE -DIAPER -DIGITAL CLOCK -DIGITAL WATCH -DINING TABLE -DISHRAG -DISHWASHER -DISK BRAKE -DOCK -DOGSLED -DOME -DOORMAT -DRILLING PLATFORM -DRUM -DRUMSTICK -DUMBBELL -DUTCH OVEN -ELECTRIC FAN -ELECTRIC GUITAR -ELECTRIC LOCOMOTIVE 
-ENTERTAINMENT CENTER -ENVELOPE -ESPRESSO MAKER -FACE POWDER -FEATHER BOA -FILE -FIREBOAT -FIRE ENGINE -FIRE SCREEN -FLAGPOLE -FLUTE -FOLDING CHAIR -FOOTBALL HELMET -FORKLIFT -FOUNTAIN -FOUNTAIN PEN -FOUR-POSTER -FREIGHT CAR -FRENCH HORN -FRYING PAN -FUR COAT -GARBAGE TRUCK -GASMASK -GAS PUMP -GOBLET -GO-KART -GOLF BALL -GOLFCART -GONDOLA -GONG -GOWN -GRAND PIANO -GREENHOUSE -GRILLE -GROCERY STORE -GUILLOTINE -HAIR SLIDE -HAIR SPRAY -HALF TRACK -HAMMER -HAMPER -HAND BLOWER -HAND-HELD COMPUTER -HANDKERCHIEF -HARD DISC -HARMONICA -HARP -HARVESTER -HATCHET -HOLSTER -HOME THEATER -HONEYCOMB -HOOK -HOOPSKIRT -HORIZONTAL BAR -HORSE CART -HOURGLASS -IPOD -IRON -JACK-O-LANTERN -JEAN -JEEP -JERSEY -JIGSAW PUZZLE -JINRIKISHA -JOYSTICK -KIMONO -KNEE PAD -KNOT -LAB COAT -LADLE -LAMPSHADE -LAPTOP -LAWN MOWER -LENS CAP -LETTER OPENER -LIBRARY -LIFEBOAT -LIGHTER -LIMOUSINE -LINER -LIPSTICK -LOAFER -LOTION -LOUDSPEAKER -LOUPE -LUMBERMILL -MAGNETIC COMPASS -MAILBAG -MAILBOX -MAILLOT -MAILLOT -MANHOLE COVER -MARACA -MARIMBA -MASK -MATCHSTICK -MAYPOLE -MAZE -MEASURING CUP -MEDICINE CHEST -MEGALITH -MICROPHONE -MICROWAVE -MILITARY UNIFORM -MILK CAN -MINIBUS -MINISKIRT -MINIVAN -MISSILE -MITTEN -MIXING BOWL -MOBILE HOME -MODEL T -MODEM -MONASTERY -MONITOR -MOPED -MORTAR -MORTARBOARD -MOSQUE -MOSQUITO NET -MOTOR SCOOTER -MOUNTAIN BIKE -MOUNTAIN TENT -MOUSE -MOUSETRAP -MOVING VAN -MUZZLE -NAIL -NECK BRACE -NECKLACE -NIPPLE -NOTEBOOK -OBELISK -OBOE -OCARINA -ODOMETER -OIL FILTER -ORGAN -OSCILLOSCOPE -OVERSKIRT -OXCART -OXYGEN MASK -PACKET -PADDLE -PADDLEWHEEL -PADLOCK -PAINTBRUSH -PAJAMA -PALACE -PANPIPE -PAPER TOWEL -PARACHUTE -PARALLEL BARS -PARK BENCH -PARKING METER -PASSENGER CAR -PATIO -PAY-PHONE -PEDESTAL -PENCIL BOX -PENCIL SHARPENER -PERFUME -PETRI DISH -PHOTOCOPIER -PICK -PICKELHAUBE -PICKET FENCE -PICKUP -PIER -PIGGY BANK -PILL BOTTLE -PILLOW -PING-PONG BALL -PINWHEEL -PIRATE -PITCHER -PLANE -PLANETARIUM -PLASTIC BAG -PLATE RACK -PLOW -PLUNGER -POLAROID CAMERA -POLE -POLICE VAN -PONCHO -POOL TABLE -POP BOTTLE -POT -POTTERS WHEEL -POWER DRILL -PRAYER RUG -PRINTER -PRISON -PROJECTILE -PROJECTOR -PUCK -PUNCHING BAG -PURSE -QUILL -QUILT -RACER -RACKET -RADIATOR -RADIO -RADIO TELESCOPE -RAIN BARREL -RECREATIONAL VEHICLE -REEL -REFLEX CAMERA -REFRIGERATOR -REMOTE CONTROL -RESTAURANT -REVOLVER -RIFLE -ROCKING CHAIR -ROTISSERIE -RUBBER ERASER -RUGBY BALL -RULE -RUNNING SHOE -SAFE -SAFETY PIN -SALTSHAKER -SANDAL -SARONG -SAX -SCABBARD -SCALE -SCHOOL BUS -SCHOONER -SCOREBOARD -SCREEN -SCREW -SCREWDRIVER -SEAT BELT -SEWING MACHINE -SHIELD -SHOE SHOP -SHOJI -SHOPPING BASKET -SHOPPING CART -SHOVEL -SHOWER CAP -SHOWER CURTAIN -SKI -SKI MASK -SLEEPING BAG -SLIDE RULE -SLIDING DOOR -SLOT -SNORKEL -SNOWMOBILE -SNOWPLOW -SOAP DISPENSER -SOCCER BALL -SOCK -SOLAR DISH -SOMBRERO -SOUP BOWL -SPACE BAR -SPACE HEATER -SPACE SHUTTLE -SPATULA -SPEEDBOAT -SPIDER WEB -SPINDLE -SPORTS CAR -SPOTLIGHT -STAGE -STEAM LOCOMOTIVE -STEEL ARCH BRIDGE -STEEL DRUM -STETHOSCOPE -STOLE -STONE WALL -STOPWATCH -STOVE -STRAINER -STREETCAR -STRETCHER -STUDIO COUCH -STUPA -SUBMARINE -SUIT -SUNDIAL -SUNGLASS -SUNGLASSES -SUNSCREEN -SUSPENSION BRIDGE -SWAB -SWEATSHIRT -SWIMMING TRUNKS -SWING -SWITCH -SYRINGE -TABLE LAMP -TANK -TAPE PLAYER -TEAPOT -TEDDY -TELEVISION -TENNIS BALL -THATCH -THEATER CURTAIN -THIMBLE -THRESHER -THRONE -TILE ROOF -TOASTER -TOBACCO SHOP -TOILET SEAT -TORCH -TOTEM POLE -TOW TRUCK -TOYSHOP -TRACTOR -TRAILER TRUCK -TRAY -TRENCH COAT -TRICYCLE -TRIMARAN -TRIPOD -TRIUMPHAL ARCH -TROLLEYBUS -TROMBONE -TUB -TURNSTILE -TYPEWRITER 
KEYBOARD -UMBRELLA -UNICYCLE -UPRIGHT -VACUUM -VASE -VAULT -VELVET -VENDING MACHINE -VESTMENT -VIADUCT -VIOLIN -VOLLEYBALL -WAFFLE IRON -WALL CLOCK -WALLET -WARDROBE -WARPLANE -WASHBASIN -WASHER -WATER BOTTLE -WATER JUG -WATER TOWER -WHISKEY JUG -WHISTLE -WIG -WINDOW SCREEN -WINDOW SHADE -WINDSOR TIE -WINE BOTTLE -WING -WOK -WOODEN SPOON -WOOL -WORM FENCE -WRECK -YAWL -YURT -WEB SITE -COMIC BOOK -CROSSWORD PUZZLE -STREET SIGN -TRAFFIC LIGHT -BOOK JACKET -MENU -PLATE -GUACAMOLE -CONSOMME -HOT POT -TRIFLE -ICE CREAM -ICE LOLLY -FRENCH LOAF -BAGEL -PRETZEL -CHEESEBURGER -HOTDOG -MASHED POTATO -HEAD CABBAGE -BROCCOLI -CAULIFLOWER -ZUCCHINI -SPAGHETTI SQUASH -ACORN SQUASH -BUTTERNUT SQUASH -CUCUMBER -ARTICHOKE -BELL PEPPER -CARDOON -MUSHROOM -GRANNY SMITH -STRAWBERRY -ORANGE -LEMON -FIG -PINEAPPLE -BANANA -JACKFRUIT -CUSTARD APPLE -POMEGRANATE -HAY -CARBONARA -CHOCOLATE SAUCE -DOUGH -MEAT LOAF -PIZZA -POTPIE -BURRITO -RED WINE -ESPRESSO -CUP -EGGNOG -ALP -BUBBLE -CLIFF -CORAL REEF -GEYSER -LAKESIDE -PROMONTORY -SANDBAR -SEASHORE -VALLEY -VOLCANO -BALLPLAYER -GROOM -SCUBA DIVER -RAPESEED -DAISY -LADY SLIPPER -CORN -ACORN -HIP -BUCKEYE -CORAL FUNGUS -AGARIC -GYROMITRA -STINKHORN -EARTHSTAR -HEN-OF-THE-WOODS -BOLETE -EAR -TOILET TISSUE From a2e8f9be200fdfa256d0385d54613729cc7870a0 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Thu, 12 Oct 2023 09:42:10 -0400 Subject: [PATCH 151/216] Restructure cleaning up of the futures in decoupled mode (#309) * Restructure cleaning up of the futures in decoupled * Minor improvement --- src/python_be.cc | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/python_be.cc b/src/python_be.cc index db979562..de639df3 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -777,9 +777,7 @@ ModelInstanceState::DecoupledMessageQueueMonitor() std::packaged_task task([this, response_send_message] { ResponseSendDecoupled(response_send_message); }); - std::future future = - boost::asio::post(*thread_pool_, std::move(task)); - futures_.emplace_back(std::move(future)); + boost::asio::post(*thread_pool_, std::move(task)); } else if ( message->Command() == PYTHONSTUB_InferExecRequest || message->Command() == PYTHONSTUB_InferStreamExecRequest) { @@ -789,9 +787,7 @@ ModelInstanceState::DecoupledMessageQueueMonitor() bls_execute, (bls_execute->Command() == PYTHONSTUB_InferStreamExecRequest)); }); - std::future future = - boost::asio::post(*thread_pool_, std::move(task)); - futures_.emplace_back(std::move(future)); + boost::asio::post(*thread_pool_, std::move(task)); } } } @@ -1708,12 +1704,14 @@ ModelInstanceState::~ModelInstanceState() Stub()->UpdateHealth(); if (Stub()->IsHealthy()) { if (model_state->IsDecoupled()) { - futures_.clear(); + // Wait for all the pending tasks to finish. + thread_pool_->wait(); // Push a dummy message to signal the thread to terminate. 
Stub()->ParentMessageQueue()->Push(DUMMY_MESSAGE); decoupled_monitor_.join(); + } else { + thread_pool_->wait(); } - thread_pool_->wait(); } // Terminate stub first to allow any last messages to be received by the back // end before deallocating the queue memory From 5ceac729f91d1c9db630158dd7a1633e06f897ee Mon Sep 17 00:00:00 2001 From: dyastremsky <58150256+dyastremsky@users.noreply.github.com> Date: Thu, 12 Oct 2023 06:51:43 -0700 Subject: [PATCH 152/216] Remove Pytorch platform handler documentation (#310) --- README.md | 110 ------------------------------------------------------ 1 file changed, 110 deletions(-) diff --git a/README.md b/README.md index 4cb9a960..514d4214 100644 --- a/README.md +++ b/README.md @@ -1451,116 +1451,6 @@ this workflow. For a simple example of using PyTorch in a Python Backend model, see the [AddSubNet PyTorch example](#addsubnet-in-pytorch). -### PyTorch Platform \[Experimental\] - -**NOTE**: *This feature is subject to change and removal, and should not -be used in production.* - -Starting from 23.08, we are adding an experimental support for loading and -serving PyTorch models directly via Python backend. The model can be provided -within the triton server model repository, and a -[pre-built Python model](src/resources/platform_handlers/pytorch/model.py) will -be used to load and serve the PyTorch model. - -#### Model Layout - -The model repository should look like: - -``` -model_repository/ -`-- model_directory - |-- 1 - | |-- model.py - | `-- model.pt - `-- config.pbtxt -``` - -The `model.py` contains the class definition of the PyTorch model. The class -should extend the -[`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). -The `model.pt` may be optionally provided which contains the saved -[`state_dict`](https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-loading-model-for-inference) -of the model. For serving TorchScript models, a `model.pt` TorchScript can be -provided in place of the `model.py` file. - -By default, Triton will use the -[PyTorch backend](https://github.com/triton-inference-server/pytorch_backend) to -load and serve TorchScript models. In order to serve from Python backend, -[model configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md) -should explicitly provide the following settings: - -``` -backend: "python" -platform: "pytorch" -``` - -#### PyTorch Installation - -This feature will take advantage of the -[`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html#torch-compile) -optimization, make sure the -[PyTorch 2.0+ pip package](https://pypi.org/project/torch/2.0.1/) is available -in the same Python environment. - -``` -pip install torch==2.0.1 -``` -Alternatively, a -[Python Execution Environment](#using-custom-python-execution-environments) -with the PyTorch dependency may be used. - -#### Customization - -The following PyTorch settings may be customized by setting parameters on the -`config.pbtxt`. - -[`torch.set_num_threads(int)`](https://pytorch.org/docs/stable/generated/torch.set_num_threads.html#torch.set_num_threads) -- Key: NUM_THREADS -- Value: The number of threads used for intraop parallelism on CPU. - -[`torch.set_num_interop_threads(int)`](https://pytorch.org/docs/stable/generated/torch.set_num_interop_threads.html#torch.set_num_interop_threads) -- Key: NUM_INTEROP_THREADS -- Value: The number of threads used for interop parallelism (e.g. 
in JIT -interpreter) on CPU. - -[`torch.compile()` parameters](https://pytorch.org/docs/stable/generated/torch.compile.html#torch-compile) -- Key: TORCH_COMPILE_OPTIONAL_PARAMETERS -- Value: Any of following parameter(s) encoded as a JSON object. - - fullgraph (*bool*): Whether it is ok to break model into several subgraphs. - - dynamic (*bool*): Use dynamic shape tracing. - - backend (*str*): The backend to be used. - - mode (*str*): Can be either "default", "reduce-overhead" or "max-autotune". - - options (*dict*): A dictionary of options to pass to the backend. - - disable (*bool*): Turn `torch.compile()` into a no-op for testing. - -For example: -``` -parameters: { - key: "NUM_THREADS" - value: { string_value: "4" } -} -parameters: { - key: "TORCH_COMPILE_OPTIONAL_PARAMETERS" - value: { string_value: "{\"disable\": true}" } -} -``` - -#### Example - -You can find the complete example instructions in -[examples/pytorch_platform_handler](examples/pytorch_platform_handler/README.md). - -#### Limitations - -Following are few known limitations of this feature: -- Python functions optimizable by `torch.compile` may not be served directly in -the `model.py` file, they need to be enclosed by a class extending the -[`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module). -- Model weights cannot be shared across multiple instances on the same GPU -device. -- When using `KIND_MODEL` as model instance kind, the default device of the -first parameter on the model is used. - ### PyTorch Determinism When running PyTorch code, you may notice slight differences in output values From a442f3fcc05b07e7937425e037c312b3ca777c25 Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Thu, 12 Oct 2023 16:43:05 -0700 Subject: [PATCH 153/216] Remove PyTorch Platform option from menu (#312) --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 514d4214..2585d915 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,6 @@ any C++ code. - [Input Tensor Device Placement](#input-tensor-device-placement) - [Frameworks](#frameworks) - [PyTorch](#pytorch) - - [PyTorch Platform \[Experimental\]](#pytorch-platform-experimental) - [PyTorch Determinism](#pytorch-determinism) - [TensorFlow](#tensorflow) - [TensorFlow Determinism](#tensorflow-determinism) From a8e61939d774993c64df388a9e0044a176873865 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Tue, 24 Oct 2023 15:00:27 -0700 Subject: [PATCH 154/216] Add parameters support to InferenceRequest (#313) * Add parameters support to InferenceRequest * Safeguard default argument against mutation --- src/pb_stub.cc | 50 +++++++++++++++++++++++++++++++++++++---- src/request_executor.cc | 30 +++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 4 deletions(-) diff --git a/src/pb_stub.cc b/src/pb_stub.cc index b38f8d38..bc929525 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -76,6 +76,27 @@ SignalHandler(int signum) // Skip the SIGINT and SIGTERM } +template +PYTYPE +PyDefaultArgumentToMutableType(const py::object& argument) +{ + // The default argument on Python functions always reference the same copy, + // meaning if the default argument is changed by the function, then it is + // changed for all subsequent calls to the function. Thus, default arguments + // should be limited to basic types (i.e. None). This helper function returns + // an empty expected type, if the argument is None (i.e. default initialized). 
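+  // As a quick illustration (plain Python semantics, not anything specific to
+  // this stub): a signature like `def f(params={})` creates one shared dict at
+  // definition time, so a mutation such as `params["k"] = 1` inside `f` is
+  // visible to every later call. Defaulting to None and converting it here
+  // avoids that pitfall.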
+ // If the argument is neither None nor expected type, an exception is thrown. + if (py::isinstance(argument)) { + return PYTYPE(); + } + if (py::isinstance(argument)) { + return argument; + } + throw PythonBackendException( + std::string("Expect ") + typeid(PYTYPE).name() + ", got " + + std::string(py::str(argument.get_type()))); +} + void Stub::Instantiate( int64_t shm_growth_size, int64_t shm_default_size, @@ -1464,15 +1485,35 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) const int64_t model_version, const uint32_t flags, const int32_t timeout, const PreferredMemory& preferred_memory, - const InferenceTrace& trace) { + const InferenceTrace& trace, + const py::object& parameters_) { + py::dict parameters = + PyDefaultArgumentToMutableType(parameters_); std::set requested_outputs; for (auto& requested_output_name : requested_output_names) { requested_outputs.emplace(requested_output_name); } - // FIXME: InferenceRequest parameters are not supported in BLS now. + for (const auto& pair : parameters) { + if (!py::isinstance(pair.first)) { + throw PythonBackendException( + "Expect parameters keys to have type str, found type " + + std::string(py::str(pair.first.get_type()))); + } + if (!py::isinstance(pair.second) && + !py::isinstance(pair.second) && + !py::isinstance(pair.second)) { + throw PythonBackendException( + "Expect parameters values to have type bool/int/str, found " + "type " + + std::string(py::str(pair.second.get_type()))); + } + } + py::module_ py_json = py::module_::import("json"); + std::string parameters_str = + py::str(py_json.attr("dumps")(parameters)); return std::make_shared( request_id, correlation_id, inputs, requested_outputs, - model_name, model_version, "" /*parameters*/, flags, timeout, + model_name, model_version, parameters_str, flags, timeout, 0 /*response_factory_address*/, 0 /*request_address*/, preferred_memory, trace); }), @@ -1485,7 +1526,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::arg("flags").none(false) = 0, py::arg("timeout").none(false) = 0, py::arg("preferred_memory").none(false) = PreferredMemory(PreferredMemory::DEFAULT, 0), - py::arg("trace").none(false) = InferenceTrace()) + py::arg("trace").none(false) = InferenceTrace(), + py::arg("parameters").none(true) = py::none()) .def( "inputs", &InferRequest::Inputs, py::return_value_policy::reference_internal) diff --git a/src/request_executor.cc b/src/request_executor.cc index b54e3988..2a6d9575 100644 --- a/src/request_executor.cc +++ b/src/request_executor.cc @@ -365,6 +365,36 @@ RequestExecutor::Infer( infer_request->Trace().triton_trace_, &trace)); } + const std::string& param_str = infer_request->Parameters(); + triton::common::TritonJson::Value param; + THROW_IF_TRITON_ERROR(param.Parse(param_str.c_str(), param_str.length())); + std::vector param_keys; + THROW_IF_TRITON_ERROR(param.Members(¶m_keys)); + for (const auto& key : param_keys) { + triton::common::TritonJson::Value value; + if (!param.Find(key.c_str(), &value)) { + throw PythonBackendException("Unexpected missing key on parameters"); + } + if (value.IsString()) { + std::string string_value; + THROW_IF_TRITON_ERROR(value.AsString(&string_value)); + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetStringParameter( + irequest, key.c_str(), string_value.c_str())); + } else if (value.IsInt()) { + int64_t int_value = 0; + THROW_IF_TRITON_ERROR(value.AsInt(&int_value)); + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetIntParameter( + irequest, key.c_str(), int_value)); + } else if (value.IsBool()) 
{ + bool bool_value = false; + THROW_IF_TRITON_ERROR(value.AsBool(&bool_value)); + THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestSetBoolParameter( + irequest, key.c_str(), bool_value)); + } else { + throw PythonBackendException("Unsupported value type on parameters"); + } + } + for (auto& infer_input : infer_request->Inputs()) { THROW_IF_TRITON_ERROR(TRITONSERVER_InferenceRequestAddInput( irequest, infer_input->Name().c_str(), From c50417c4e8e09a0929669588520fa4fdc53df96c Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Tue, 24 Oct 2023 15:22:52 -0700 Subject: [PATCH 155/216] Add parameters documentation (#314) --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 2585d915..9c59c144 100644 --- a/README.md +++ b/README.md @@ -638,6 +638,13 @@ returns a JSON string where the keys are the keys of the parameters object and the values are the values for the parameters field. Note that you need to parse this string using `json.loads` to convert it to a dictionary. +Starting from 23.11 release, parameters may be provided to the `InferenceRequest` +object during construction. The parameters should be a dictionary of key value +pairs, where keys are `str` and values are `bool`, `int` or `str`. +```python +request = pb_utils.InferenceRequest(parameters={"key": "value"}, ...) +``` + You can read more about the inference request parameters in the [parameters extension](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_parameters.md) documentation. From 4c0a977d37d561ccf9773aaf7f355d7445a5c00f Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Wed, 25 Oct 2023 15:15:33 -0700 Subject: [PATCH 156/216] Optimize GPU tensor support for Python backend (#293) * Use CUDA shared memory pool to optimize tensor transfer between processes * Fix up: use the data ptr to get the tensor * Remove extra data copy. Use cudaMemcpyAsync for GPU output to improve performance. * Fix error handling. Fix bls tensor lifetime * Move CUDAMemPoolMessage inside TRITON_ENABLE_GPU directive * Fix CPU build * Address comments * Fix GPU tensor lifecycle * Make it be able to share cuda pool on different devices to the stub when needed * Remove data copy from PbMemory class * Fix up syntax, remove unused comments * Simplify PbMemory functionality. 
Let different io cases handle the memory allocation * Remove duplicated logic * Address comments * Address comment * Fix CPU only build --- src/infer_request.cc | 2 +- src/infer_response.cc | 63 +++++++++-- src/ipc_message.h | 2 + src/memory_manager.cc | 23 ++-- src/memory_manager.h | 11 +- src/pb_memory.cc | 113 ++++++++++++------ src/pb_memory.h | 33 +++++- src/pb_stub.cc | 245 +++++++++++++++++++++++++++------------- src/pb_stub.h | 7 ++ src/pb_tensor.cc | 4 +- src/pb_tensor.h | 3 +- src/pb_utils.cc | 23 +++- src/pb_utils.h | 20 ++++ src/python_be.cc | 186 ++++++++++++++++++++++++------ src/python_be.h | 3 + src/request_executor.cc | 47 ++++---- src/shm_manager.cc | 49 ++++++++ src/shm_manager.h | 33 ++++++ src/stub_launcher.cc | 108 +++++++++++++++++- src/stub_launcher.h | 10 ++ 20 files changed, 778 insertions(+), 207 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index e9d243f1..4c2d2575 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -598,7 +598,7 @@ InferRequest::Exec(const bool is_decoupled) if (!output_tensor->IsCPU()) { uint64_t memory_release_id = output_tensor->Memory()->MemoryReleaseId(); output_tensor->Memory()->SetMemoryReleaseCallback( - [&memory_manager_message_queue, memory_release_id]() { + [&memory_manager_message_queue, memory_release_id, &shm_pool]() { memory_manager_message_queue->Push(memory_release_id); }); } diff --git a/src/infer_response.cc b/src/infer_response.cc index ebadc02d..09737b26 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -249,6 +249,11 @@ InferResponse::Send( } bool cuda_copy = false; +#ifdef TRITON_ENABLE_GPU + // This variable is used to avoid printing the same message multiple times + // when the output tensor is failed to be allocated from the CUDA memory pool. + bool log_warning = true; +#endif // TRITON_ENABLE_GPU for (auto& output_tensor : OutputTensors()) { // FIXME: for decoupled models we will skip the requested output names. @@ -270,11 +275,12 @@ InferResponse::Send( static_cast(output_tensor->TritonDtype()), output_tensor->Dims().data(), output_tensor->Dims().size())); - void* buffer; + void* triton_output_buffer; SET_ERROR_AND_RETURN( - response_error, TRITONBACKEND_OutputBuffer( - response_output, &buffer, output_tensor->ByteSize(), - &actual_memory_type, &actual_memory_type_id)); + response_error, + TRITONBACKEND_OutputBuffer( + response_output, &triton_output_buffer, output_tensor->ByteSize(), + &actual_memory_type, &actual_memory_type_id)); bool cuda_used = false; TRITONSERVER_BufferAttributes* output_buffer_attributes; @@ -286,6 +292,40 @@ InferResponse::Send( if (src_memory_type == TRITONSERVER_MEMORY_GPU && actual_memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU + // Check if the triton-provided output buffer is using CUDA shared memory + // pool. If not, try to allocate a new buffer from the pool. 
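+      // A pool-backed buffer can be mapped by the stub through a simple
+      // base-address offset into the already-shared CUDA pool instead of a
+      // per-tensor cudaIpcMemHandle (see PbMemory::FillShmData and
+      // LoadFromSharedMemory below).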
+ void* buffer = triton_output_buffer; + BackendMemory* backend_memory; + std::unique_ptr lbackend_memory; + std::unique_ptr& cuda_pool = + shm_pool->GetCUDAMemoryPoolManager(); + if (cuda_pool->UseCudaSharedPool(src_memory_type_id)) { + try { + if (!IsUsingCUDAPool( + cuda_pool, actual_memory_type_id, triton_output_buffer)) { + THROW_IF_TRITON_ERROR(BackendMemory::Create( + reinterpret_cast( + shm_pool->GetCUDAMemoryPoolManager() + ->TritonMemoryManager()), + BackendMemory::AllocationType::GPU_POOL, actual_memory_type_id, + output_tensor->ByteSize(), &backend_memory)); + lbackend_memory.reset(backend_memory); + buffer = lbackend_memory->MemoryPtr(); + } + } + catch (const PythonBackendException& pb_exception) { + if (log_warning) { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + (std::string("Failed to allocate memory from CUDA memory pool " + "for output tensor: ") + + pb_exception.what() + + std::string(", will use CUDA IPC for GPU output transfer.")) + .c_str()); + } + log_warning = false; + } + } cudaIpcMemHandle_t* cuda_ipc_mem_handle_p; SET_ERROR_AND_RETURN( response_error, @@ -309,8 +349,13 @@ InferResponse::Send( output_tensor->ByteSize(), reinterpret_cast(buffer), true /* copy_gpu */)); } + + if (lbackend_memory != nullptr) { + output_buffer->SetBackendMemory(std::move(lbackend_memory)); + } gpu_buffer_helper.AddBuffer(output_buffer->ShmHandle()); - output_buffers.push_back({std::move(output_buffer), buffer}); + output_buffers.push_back( + {std::move(output_buffer), triton_output_buffer}); #endif } @@ -325,7 +370,8 @@ InferResponse::Send( output_tensor->ByteSize(), nullptr /* data ptr */)); gpu_buffer_helper.AddBuffer(output_buffer->ShmHandle()); - output_buffers.push_back({std::move(output_buffer), buffer}); + output_buffers.push_back( + {std::move(output_buffer), triton_output_buffer}); } if (src_memory_type != TRITONSERVER_MEMORY_GPU) { @@ -334,8 +380,9 @@ InferResponse::Send( CopyBuffer( "Failed to copy the output tensor to buffer.", src_memory_type, src_memory_type_id, actual_memory_type, actual_memory_type_id, - output_tensor->ByteSize(), output_tensor->DataPtr(), buffer, - reinterpret_cast(cuda_stream), &cuda_used)); + output_tensor->ByteSize(), output_tensor->DataPtr(), + triton_output_buffer, reinterpret_cast(cuda_stream), + &cuda_used)); } cuda_copy |= cuda_used; diff --git a/src/ipc_message.h b/src/ipc_message.h index 14d3dc5f..d720a84d 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -41,12 +41,14 @@ typedef enum PYTHONSTUB_commandtype_enum { PYTHONSTUB_ExecuteResponse, PYTHONSTUB_InitializeRequest, PYTHONSTUB_InitializeResponse, + PYTHONSTUB_CUDAPoolInitializeRequest, PYTHONSTUB_FinalizeRequest, PYTHONSTUB_FinalizeResponse, PYTHONSTUB_LoadGPUBuffers, PYTHONSTUB_InferExecRequest, PYTHONSTUB_InferStreamExecRequest, PYTHONSTUB_InferExecResponse, + PYTHONSTUB_InferStreamExecResponse, PYTHONSTUB_ResponseSend, PYTHONSTUB_ResponseClose, PYTHONSTUB_AutoCompleteRequest, diff --git a/src/memory_manager.cc b/src/memory_manager.cc index 23ac99be..716dee9e 100644 --- a/src/memory_manager.cc +++ b/src/memory_manager.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -33,29 +33,23 @@ namespace triton { namespace backend { namespace python { #ifdef TRITON_ENABLE_GPU -GPUMemoryRecord::GPUMemoryRecord(void* ptr) +BackendMemoryRecord::BackendMemoryRecord( + std::unique_ptr backend_memory) + : backend_memory_(std::move(backend_memory)) { - ptr_ = ptr; release_callback_ = [](void* ptr) { - cudaError_t err = cudaFree(ptr); - if (err != cudaSuccess) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - (std::string("Failed to free the allocated cuda memory. error: ") + - cudaGetErrorString(err)) - .c_str()); - } + // Do nothing. The backend_memory_ will be destroyed in the destructor. }; } void* -GPUMemoryRecord::MemoryId() +BackendMemoryRecord::MemoryId() { - return ptr_; + return reinterpret_cast(backend_memory_->MemoryPtr()); } const std::function& -GPUMemoryRecord::ReleaseCallback() +BackendMemoryRecord::ReleaseCallback() { return release_callback_; } @@ -101,6 +95,7 @@ MemoryManager::QueueMonitorThread() // Call the release callback. it->second->ReleaseCallback()(it->second->MemoryId()); + // it->second.reset(); records_.erase(it); } } diff --git a/src/memory_manager.h b/src/memory_manager.h index 3ea6cc12..5b7e35f5 100644 --- a/src/memory_manager.h +++ b/src/memory_manager.h @@ -1,4 +1,4 @@ -// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -33,6 +33,7 @@ #include "message_queue.h" #include "triton/backend/backend_common.h" +#include "triton/backend/backend_memory.h" #include "triton/core/tritonserver.h" #ifdef TRITON_ENABLE_GPU @@ -46,17 +47,19 @@ class MemoryRecord { public: virtual const std::function& ReleaseCallback() = 0; virtual void* MemoryId() = 0; + virtual ~MemoryRecord() = default; }; #ifdef TRITON_ENABLE_GPU -class GPUMemoryRecord : public MemoryRecord { +class BackendMemoryRecord : public MemoryRecord { public: - GPUMemoryRecord(void* ptr); + BackendMemoryRecord(std::unique_ptr backend_memory); const std::function& ReleaseCallback() override; void* MemoryId() override; + ~BackendMemoryRecord() { backend_memory_.reset(); } private: - void* ptr_; + std::unique_ptr backend_memory_; std::function release_callback_; }; #endif diff --git a/src/pb_memory.cc b/src/pb_memory.cc index c18bf912..fa32bb1c 100644 --- a/src/pb_memory.cc +++ b/src/pb_memory.cc @@ -35,7 +35,6 @@ PbMemory::Create( uint64_t byte_size, char* data, bool copy_gpu) { size_t requested_byte_size = sizeof(MemoryShm); - if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU requested_byte_size += sizeof(cudaIpcMemHandle_t); @@ -46,9 +45,10 @@ PbMemory::Create( AllocatedSharedMemory memory_shm = shm_pool->Construct(requested_byte_size); + PbMemory::FillShmData( - memory_type, memory_type_id, byte_size, data, memory_shm.data_.get(), - memory_shm.handle_, copy_gpu); + shm_pool->GetCUDAMemoryPoolManager(), memory_type, memory_type_id, + byte_size, data, memory_shm.data_.get(), memory_shm.handle_, copy_gpu); if (memory_type == TRITONSERVER_MEMORY_CPU) { data = memory_shm.data_.get() + sizeof(MemoryShm); @@ -83,12 +83,14 @@ PbMemory::Create( std::unique_ptr PbMemory::Create( + std::unique_ptr& shm_pool, TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, uint64_t byte_size, char* 
data, char* data_shm, bi::managed_external_buffer::handle_t handle, bool copy_gpu) { PbMemory::FillShmData( - memory_type, memory_type_id, byte_size, data, data_shm, handle, copy_gpu); + shm_pool->GetCUDAMemoryPoolManager(), memory_type, memory_type_id, + byte_size, data, data_shm, handle, copy_gpu); if (memory_type == TRITONSERVER_MEMORY_CPU) { data = data_shm + sizeof(MemoryShm); @@ -176,14 +178,15 @@ PbMemory::CopyBuffer( void PbMemory::FillShmData( + std::unique_ptr& cuda_pool, TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, uint64_t byte_size, char* data, char* data_shm, bi::managed_external_buffer::handle_t handle, bool copy_gpu) { char* memory_data_shm = data_shm + sizeof(MemoryShm); MemoryShm* memory_shm_ptr = reinterpret_cast(data_shm); - memory_shm_ptr->is_cuda_handle_set = copy_gpu; memory_shm_ptr->memory_release_id = 0; + bool use_cuda_shared_pool = false; if (memory_type == TRITONSERVER_MEMORY_GPU) { #ifdef TRITON_ENABLE_GPU @@ -193,8 +196,15 @@ PbMemory::FillShmData( THROW_IF_CUDA_ERROR(cudaIpcGetMemHandle( reinterpret_cast(memory_data_shm), data)); } + if (cuda_pool->UseCudaSharedPool(memory_type_id) && + IsUsingCUDAPool(cuda_pool, memory_type_id, data)) { + use_cuda_shared_pool = true; + memory_shm_ptr->cuda_pool_offset = + data - + reinterpret_cast(cuda_pool->CUDAPoolAddress(memory_type_id)); + } } -#endif +#endif // TRITON_ENABLE_GPU } else { if (data != nullptr) { std::copy(data, data + byte_size, memory_data_shm); @@ -204,10 +214,12 @@ PbMemory::FillShmData( memory_shm_ptr->byte_size = byte_size; memory_shm_ptr->memory_type_id = memory_type_id; memory_shm_ptr->memory_type = memory_type; + memory_shm_ptr->use_cuda_shared_pool = use_cuda_shared_pool; } std::unique_ptr PbMemory::LoadFromSharedMemory( + std::unique_ptr& shm_pool, bi::managed_external_buffer::handle_t handle, char* data_shm, bool open_cuda_handle) { @@ -219,21 +231,32 @@ PbMemory::LoadFromSharedMemory( if (memory_shm_ptr->memory_type == TRITONSERVER_MEMORY_GPU && open_cuda_handle) { #ifdef TRITON_ENABLE_GPU - cudaIpcMemHandle_t* cuda_handle = - reinterpret_cast(memory_data_shm); + if (memory_shm_ptr->use_cuda_shared_pool) { + // When CUDA shared memory pool is used, the stub will retrieve the + // data pointer using the offset. + data_ptr = + (reinterpret_cast( + shm_pool->GetCUDAMemoryPoolManager()->CUDAPoolAddress( + memory_shm_ptr->memory_type_id)) + + memory_shm_ptr->cuda_pool_offset); + } else { + cudaIpcMemHandle_t* cuda_handle = + reinterpret_cast(memory_data_shm); - // The pointer opened by the cudaIpcOpenMemHandle will refer to the base - // address. We need to manually correct the offset. - void* data_ptr_base; - CUDAHandler& cuda_handler = CUDAHandler::getInstance(); - cuda_handler.OpenCudaHandle( - memory_shm_ptr->memory_type_id, cuda_handle, &data_ptr_base); + // The pointer opened by the cudaIpcOpenMemHandle will refer to the base + // address. We need to manually correct the offset. 
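+        // (That is, the usable device pointer becomes data_ptr_base +
+        // gpu_pointer_offset, as computed just below.)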
+ void* data_ptr_base; + CUDAHandler& cuda_handler = CUDAHandler::getInstance(); + cuda_handler.OpenCudaHandle( + memory_shm_ptr->memory_type_id, cuda_handle, &data_ptr_base); - data_ptr = - (reinterpret_cast(data_ptr_base) + - memory_shm_ptr->gpu_pointer_offset); - opened_cuda_ipc_handle = true; -#endif + data_ptr = + (reinterpret_cast(data_ptr_base) + + memory_shm_ptr->gpu_pointer_offset); + opened_cuda_ipc_handle = true; + } + +#endif // TRITON_ENABLE_GPU } else { data_ptr = memory_data_shm; } @@ -242,7 +265,6 @@ PbMemory::LoadFromSharedMemory( opened_cuda_ipc_handle /* opened_cuda_ipc_handle */)); } - std::unique_ptr PbMemory::LoadFromSharedMemory( std::unique_ptr& shm_pool, @@ -258,21 +280,30 @@ PbMemory::LoadFromSharedMemory( if (memory_shm_ptr->memory_type == TRITONSERVER_MEMORY_GPU) { if (memory_shm_ptr->byte_size > 0 && open_cuda_handle) { #ifdef TRITON_ENABLE_GPU - cudaIpcMemHandle_t* cuda_handle = - reinterpret_cast(memory_data_shm); - - // The pointer opened by the cudaIpcOpenMemHandle will refer to the base - // address. We need to manually correct the offset. - - void* data_ptr_base; - CUDAHandler& cuda_handler = CUDAHandler::getInstance(); - cuda_handler.OpenCudaHandle( - memory_shm_ptr->memory_type_id, cuda_handle, &data_ptr_base); - - data_ptr = - (reinterpret_cast(data_ptr_base) + - memory_shm_ptr->gpu_pointer_offset); - opened_cuda_ipc_handle = true; + if (memory_shm_ptr->use_cuda_shared_pool) { + // When CUDA shared memory pool is used, the stub will retrieve the + // data pointer using the offset. + data_ptr = + (reinterpret_cast( + shm_pool->GetCUDAMemoryPoolManager()->CUDAPoolAddress( + memory_shm_ptr->memory_type_id)) + + memory_shm_ptr->cuda_pool_offset); + } else { + cudaIpcMemHandle_t* cuda_handle = + reinterpret_cast(memory_data_shm); + + // The pointer opened by the cudaIpcOpenMemHandle will refer to the base + // address. We need to manually correct the offset. + void* data_ptr_base; + CUDAHandler& cuda_handler = CUDAHandler::getInstance(); + cuda_handler.OpenCudaHandle( + memory_shm_ptr->memory_type_id, cuda_handle, &data_ptr_base); + + data_ptr = + (reinterpret_cast(data_ptr_base) + + memory_shm_ptr->gpu_pointer_offset); + opened_cuda_ipc_handle = true; + } #endif } } else { @@ -403,6 +434,18 @@ PbMemory::SetCudaIpcHandle(cudaIpcMemHandle_t* cuda_ipc_handle) { *(reinterpret_cast(ShmData())) = *(cuda_ipc_handle); } + +void +PbMemory::UpdateCUDAOffset(std::unique_ptr& cuda_pool) +{ + if (cuda_pool->UseCudaSharedPool(MemoryTypeId()) && + IsUsingCUDAPool(cuda_pool, MemoryTypeId(), DataPtr())) { + memory_shm_ptr_->cuda_pool_offset = + DataPtr() - + reinterpret_cast(cuda_pool->CUDAPoolAddress(MemoryTypeId())); + memory_shm_ptr_->use_cuda_shared_pool = true; + } +} #endif PbMemory::~PbMemory() diff --git a/src/pb_memory.h b/src/pb_memory.h index e7986014..ad79daed 100644 --- a/src/pb_memory.h +++ b/src/pb_memory.h @@ -1,4 +1,4 @@ -// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -42,13 +42,18 @@ namespace triton { namespace backend { namespace python { // struct MemoryShm { // If the memory type is a GPU pointer, the offset of the GPU pointer from the - // base address. For CPU memory type this field contains garbage data. + // base address. For CPU memory type this field contains garbage data. 
This + // field will only be used when the memory is not allocated from the CUDA + // shared memory pool. uint64_t gpu_pointer_offset; + bool use_cuda_shared_pool; + // The offset of the memory from the base address of the CUDA shared memory + // pool. + uint64_t cuda_pool_offset; TRITONSERVER_MemoryType memory_type; int64_t memory_type_id; uint64_t byte_size; - bool is_cuda_handle_set; uint64_t memory_release_id; }; @@ -60,6 +65,7 @@ class PbMemory { uint64_t byte_size, char* data, bool copy_gpu = true); static std::unique_ptr Create( + std::unique_ptr& shm_pool, TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, uint64_t byte_size, char* data, char* data_shm, bi::managed_external_buffer::handle_t handle, bool copy_gpu = true); @@ -72,6 +78,8 @@ class PbMemory { #ifdef TRITON_ENABLE_GPU void SetCudaIpcHandle(cudaIpcMemHandle_t* cuda_ipc_handle); + + void UpdateCUDAOffset(std::unique_ptr& cuda_pool); #endif // Copy the destination buffer to the source buffer. @@ -83,6 +91,7 @@ class PbMemory { bi::managed_external_buffer::handle_t memory_handle, bool open_cuda_handle); static std::unique_ptr LoadFromSharedMemory( + std::unique_ptr& shm_pool, bi::managed_external_buffer::handle_t handle, char* data_shm, bool open_cuda_handle); static uint64_t ShmStructSize( @@ -117,8 +126,25 @@ class PbMemory { void SetMemoryReleaseCallback(std::function release_callback); + bool UseCUDASharedPool() const + { + return memory_shm_ptr_->use_cuda_shared_pool; + } + ~PbMemory(); +#ifndef TRITON_PB_STUB + void SetBackendMemory(std::unique_ptr&& backend_memory) + { + backend_memory_ = std::move(backend_memory); + }; + + std::unique_ptr GetBackendMemory() + { + return std::move(backend_memory_); + }; +#endif + private: AllocatedSharedMemory memory_shm_; MemoryShm* memory_shm_ptr_; @@ -150,6 +176,7 @@ class PbMemory { #endif static void FillShmData( + std::unique_ptr& cuda_pool, TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, uint64_t byte_size, char* data, char* data_shm, bi::managed_external_buffer::handle_t handle, bool copy_gpu = true); diff --git a/src/pb_stub.cc b/src/pb_stub.cc index bc929525..123b2832 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -892,6 +892,21 @@ Stub::SendIPCUtilsMessage(std::unique_ptr& ipc_message) Stub::~Stub() { +#ifdef TRITON_ENABLE_GPU + try { + CUDAHandler& cuda_api = CUDAHandler::getInstance(); + for (auto& m : + shm_pool_->GetCUDAMemoryPoolManager()->CUDAPoolAddressMap()) { + if (m.second != nullptr) { + cuda_api.CloseCudaHandle(m.first, m.second); + } + } + } + catch (const PythonBackendException& pb_exception) { + std::cerr << "Error when closing CUDA handle: " << pb_exception.what(); + } +#endif + { py::gil_scoped_acquire acquire; model_instance_ = py::none(); @@ -1125,86 +1140,18 @@ Stub::ParentToStubMQMonitor() break; } - std::unique_ptr ipc_message; - ResponseBatch* response_batch = nullptr; - bi::managed_external_buffer::handle_t* response_handle = nullptr; - std::unique_ptr infer_response; - bool responses_is_set = false; - PythonBackendException pb_exception(std::string{}); - - try { - ipc_message = IPCMessage::LoadFromSharedMemory(shm_pool_, handle); - AllocatedSharedMemory response_batch_shm = - shm_pool_->Load(ipc_message->Args()); - response_batch = - reinterpret_cast(response_batch_shm.data_.get()); - response_handle = - reinterpret_cast( - response_batch_shm.data_.get() + sizeof(ResponseBatch)); - responses_is_set = true; - - if (response_batch->has_error) { - if (response_batch->is_error_set) { - std::unique_ptr pb_string = - 
PbString::LoadFromSharedMemory(shm_pool_, response_batch->error); - infer_response = std::make_unique( - std::vector>{}, - std::make_shared(pb_string->String())); - } else { - infer_response = std::make_unique( - std::vector>{}, - std::make_shared( - "An error occurred while performing BLS request.")); - } - } - - if (responses_is_set) { - infer_response = InferResponse::LoadFromSharedMemory( - shm_pool_, *response_handle, true /* open cuda handle */); - - for (auto& output_tensor : infer_response->OutputTensors()) { - if (!output_tensor->IsCPU()) { - uint64_t memory_release_id = - output_tensor->Memory()->MemoryReleaseId(); - output_tensor->Memory()->SetMemoryReleaseCallback( - [this, memory_release_id]() { - this->MemoryManagerQueue()->Push(memory_release_id); - }); - } - } - } else { - infer_response = std::make_unique( - std::vector>{}, - std::make_shared( - "An error occurred while performing BLS request.")); - } - } - catch (const PythonBackendException& pb_exception) { - infer_response = std::make_unique( - std::vector>{}, - std::make_shared(pb_exception.what())); - } - - { - std::lock_guard lock(response_iterator_map_mu_); - if (response_iterator_map_.find(infer_response->Id()) != - response_iterator_map_.end()) { - response_iterator_map_[infer_response->Id()]->EnqueueResponse( - std::move(infer_response)); - } else { - auto response_iterator = - std::make_shared(std::move(infer_response)); - response_iterator_map_.insert( - std::pair>( - response_iterator->Id(), response_iterator)); - } - } - - { - bi::scoped_lock lock{ - *(ipc_message->ResponseMutex())}; - response_batch->waiting_on_stub = true; - ipc_message->ResponseCondition()->notify_all(); + std::unique_ptr ipc_message = + IPCMessage::LoadFromSharedMemory(shm_pool_, handle); + + switch (ipc_message->Command()) { + case PYTHONSTUB_CommandType::PYTHONSTUB_CUDAPoolInitializeRequest: { + GetCUDAMemoryPoolAddress(ipc_message); + } break; + case PYTHONSTUB_CommandType::PYTHONSTUB_InferStreamExecResponse: { + ProcessBLSResponseDecoupled(ipc_message); + } break; + default: + break; } } } @@ -1288,6 +1235,144 @@ Stub::GetProxyStream(const int& device_id) #endif } +void +Stub::GetCUDAMemoryPoolAddress(std::unique_ptr& ipc_message) +{ +#ifdef TRITON_ENABLE_GPU + bool has_exception = false; + std::string error_string; + std::unique_ptr error_string_shm; + + CUDAMemPoolMessage* cuda_pool_message_ptr = nullptr; + try { + AllocatedSharedMemory cuda_handle_shm = + shm_pool_->Load(ipc_message->Args()); + cuda_pool_message_ptr = cuda_handle_shm.data_.get(); + + CUDAHandler& cuda_api = CUDAHandler::getInstance(); + void* cuda_pool_address; + cuda_api.OpenCudaHandle( + cuda_pool_message_ptr->device_id, &cuda_pool_message_ptr->cuda_handle, + &cuda_pool_address); + shm_pool_->GetCUDAMemoryPoolManager()->SetCUDAPoolAddress( + cuda_pool_message_ptr->device_id, cuda_pool_address); + } + catch (const PythonBackendException& pb_exception) { + has_exception = true; + error_string = pb_exception.what(); + shm_pool_->GetCUDAMemoryPoolManager()->SetCUDAPoolAddress( + cuda_pool_message_ptr->device_id, nullptr); + } + + if (has_exception) { + LOG_INFO << "Failed to initialize CUDA shared memory pool in Python stub: " + << error_string; + cuda_pool_message_ptr->has_error = true; + cuda_pool_message_ptr->is_error_set = false; + + LOG_IF_EXCEPTION( + error_string_shm = PbString::Create(shm_pool_, error_string)); + if (error_string_shm != nullptr) { + cuda_pool_message_ptr->is_error_set = true; + cuda_pool_message_ptr->error = error_string_shm->ShmHandle(); + } 
+ } + + { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + cuda_pool_message_ptr->waiting_on_stub = true; + ipc_message->ResponseCondition()->notify_all(); + while (cuda_pool_message_ptr->waiting_on_stub) { + ipc_message->ResponseCondition()->wait(lock); + } + } +#endif +} + +void +Stub::ProcessBLSResponseDecoupled(std::unique_ptr& ipc_message) +{ + ResponseBatch* response_batch = nullptr; + bi::managed_external_buffer::handle_t* response_handle = nullptr; + std::unique_ptr infer_response; + bool responses_is_set = false; + PythonBackendException pb_exception(std::string{}); + + try { + AllocatedSharedMemory response_batch_shm = + shm_pool_->Load(ipc_message->Args()); + response_batch = + reinterpret_cast(response_batch_shm.data_.get()); + response_handle = reinterpret_cast( + response_batch_shm.data_.get() + sizeof(ResponseBatch)); + responses_is_set = true; + + if (response_batch->has_error) { + if (response_batch->is_error_set) { + std::unique_ptr pb_string = + PbString::LoadFromSharedMemory(shm_pool_, response_batch->error); + infer_response = std::make_unique( + std::vector>{}, + std::make_shared(pb_string->String())); + } else { + infer_response = std::make_unique( + std::vector>{}, + std::make_shared( + "An error occurred while performing BLS request.")); + } + } + + if (responses_is_set) { + infer_response = InferResponse::LoadFromSharedMemory( + shm_pool_, *response_handle, true /* open cuda handle */); + + for (auto& output_tensor : infer_response->OutputTensors()) { + if (!output_tensor->IsCPU()) { + uint64_t memory_release_id = + output_tensor->Memory()->MemoryReleaseId(); + output_tensor->Memory()->SetMemoryReleaseCallback( + [this, memory_release_id]() { + this->MemoryManagerQueue()->Push(memory_release_id); + }); + } + } + } else { + infer_response = std::make_unique( + std::vector>{}, + std::make_shared( + "An error occurred while performing BLS request.")); + } + } + catch (const PythonBackendException& pb_exception) { + infer_response = std::make_unique( + std::vector>{}, + std::make_shared(pb_exception.what())); + } + + { + std::lock_guard lock(response_iterator_map_mu_); + if (response_iterator_map_.find(infer_response->Id()) != + response_iterator_map_.end()) { + response_iterator_map_[infer_response->Id()]->EnqueueResponse( + std::move(infer_response)); + } else { + auto response_iterator = + std::make_shared(std::move(infer_response)); + response_iterator_map_.insert( + std::pair>( + response_iterator->Id(), response_iterator)); + } + } + + { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + response_batch->waiting_on_stub = true; + ipc_message->ResponseCondition()->notify_all(); + } +} + std::unique_ptr Logger::log_instance_; std::unique_ptr& diff --git a/src/pb_stub.h b/src/pb_stub.h index 94b4d8a1..12b47abc 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -272,7 +272,11 @@ class Stub { std::unique_ptr& ShmPool() { return shm_pool_; } void ProcessResponse(InferResponse* response); + + void ProcessBLSResponseDecoupled(std::unique_ptr& ipc_message); + void LoadGPUBuffers(std::unique_ptr& ipc_message); + bool IsDecoupled(); ~Stub(); @@ -350,6 +354,9 @@ class Stub { /// for provided device cudaStream_t GetProxyStream(const int& device_id); + /// Get the CUDA memory pool address from the parent process. 
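+  /// The parent passes a cudaIpcMemHandle_t for its CUDA pool in a
+  /// CUDAMemPoolMessage; the stub opens the handle and caches the resulting
+  /// base address in the CUDAMemoryPoolManager (see pb_stub.cc above).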
+ void GetCUDAMemoryPoolAddress(std::unique_ptr& ipc_message); + private: bi::interprocess_mutex* stub_mutex_; bi::interprocess_condition* stub_cond_; diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc index 84cd8f3f..d9d47784 100644 --- a/src/pb_tensor.cc +++ b/src/pb_tensor.cc @@ -555,7 +555,7 @@ PbTensor::SaveToSharedMemory( if (!pb_memory_) { pb_memory_ = PbMemory::Create( - memory_type_, memory_type_id_, byte_size_, + shm_pool, memory_type_, memory_type_id_, byte_size_, reinterpret_cast(memory_ptr_), reinterpret_cast(tensor_shm_ptr_) + pb_memory_offset, shm_handle_ + pb_memory_offset, copy_gpu); @@ -585,7 +585,7 @@ PbTensor::LoadFromSharedMemory( if (tensor_shm_ptr->memory == 0) { std::size_t pb_memory_offset = name_offset + name_shm->Size(); pb_memory = PbMemory::LoadFromSharedMemory( - pb_memory_offset, tensor_shm.data_.get() + pb_memory_offset, + shm_pool, pb_memory_offset, tensor_shm.data_.get() + pb_memory_offset, open_cuda_handle); } else { pb_memory = PbMemory::LoadFromSharedMemory( diff --git a/src/pb_tensor.h b/src/pb_tensor.h index b9c0d593..4f97b643 100644 --- a/src/pb_tensor.h +++ b/src/pb_tensor.h @@ -99,8 +99,7 @@ class PbTensor { int64_t memory_type_id, void* memory_ptr, uint64_t byte_size, DLManagedTensor* dl_managed_tensor = nullptr); - /// This constructor is used when - /// loading the tensor from shared memory. + /// This constructor is used when loading the tensor from shared memory. /// \param tensor_shm The name of the tensor /// \param dims_shm Tensor dimensions /// \param pb_string Triton dtype diff --git a/src/pb_utils.cc b/src/pb_utils.cc index 089f4cf0..5aa95b8b 100644 --- a/src/pb_utils.cc +++ b/src/pb_utils.cc @@ -239,7 +239,24 @@ ScopedSetDevice::~ScopedSetDevice() cuda_handler.MaybeSetDevice(current_device_); } } -#endif + +bool +IsUsingCUDAPool( + std::unique_ptr& cuda_pool, int64_t memory_type_id, + void* data) +{ + CUDAHandler& cuda_api = CUDAHandler::getInstance(); + CUdeviceptr cuda_pool_address = 0; + cuda_api.PointerGetAttribute( + &cuda_pool_address, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, + reinterpret_cast(data)); + + return ( + cuda_pool->CUDAPoolAddress(memory_type_id) == + reinterpret_cast(cuda_pool_address)); +} + +#endif // TRITON_ENABLE_GPU #ifndef TRITON_PB_STUB std::shared_ptr @@ -258,5 +275,5 @@ WrapTritonErrorInSharedPtr(TRITONSERVER_Error* error) *response_error = error; return response_error; } -#endif -}}} // namespace triton::backend::python +#endif // NOT TRITON_PB_STUB +}}} // namespace triton::backend::python diff --git a/src/pb_utils.h b/src/pb_utils.h index 612c46a4..0873eb03 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -241,7 +241,22 @@ struct RequestBatch { bi::managed_external_buffer::handle_t gpu_buffers_handle; }; +struct MemoryReleaseMessage { + std::mutex mu; + std::condition_variable cv; + uint64_t id; + bool waiting_on_stub; +}; + #ifdef TRITON_ENABLE_GPU +struct CUDAMemPoolMessage : SendMessageBase { + cudaIpcMemHandle_t cuda_handle; + int32_t device_id; + bi::managed_external_buffer::handle_t error; + bool has_error; + bool is_error_set; +}; + class CUDAHandler { public: static CUDAHandler& getInstance() @@ -301,6 +316,11 @@ class ScopedSetDevice { int current_device_; }; +// Check if the data is allocated from the pool by the base address. 
+bool IsUsingCUDAPool( + std::unique_ptr& cuda_pool, int64_t memory_type_id, + void* data); + #endif // TRITON_ENABLE_GPU #ifndef TRITON_PB_STUB diff --git a/src/python_be.cc b/src/python_be.cc index de639df3..33b2ec77 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -521,6 +521,8 @@ ModelInstanceState::GetInputTensor( } } else { #ifdef TRITON_ENABLE_GPU + // Attempt to use the cuda shared memory pool for GPU tensor. + ShareCUDAMemoryPool(src_memory_type_id); // Retrieving GPU input tensors const void* buffer = nullptr; @@ -529,6 +531,8 @@ ModelInstanceState::GetInputTensor( // collector is used in the non-decoupled mode. if (collector) { + // The ProcessTensor function will try to allocate the buffer in the CUDA + // pool first. RETURN_IF_ERROR(collector->ProcessTensor( input_name, nullptr, 0, alloc_perference, reinterpret_cast(&buffer), &input_byte_size, @@ -568,10 +572,22 @@ ModelInstanceState::GetInputTensor( Stub()->ShmPool(), true /* copy_gpu */)); } } else { + // Try to use the cuda shared memory pool first. void* dev_ptr; - RETURN_IF_CUDA_ERROR( - cudaMalloc(&dev_ptr, input_byte_size), TRITONSERVER_ERROR_INTERNAL, - std::string("Failed to allocated CUDA memory")); + BackendMemory* backend_memory; + std::unique_ptr lbackend_memory; + RETURN_IF_ERROR(BackendMemory::Create( + reinterpret_cast( + Stub() + ->ShmPool() + ->GetCUDAMemoryPoolManager() + ->TritonMemoryManager()), + {BackendMemory::AllocationType::GPU_POOL, + BackendMemory::AllocationType::GPU}, + src_memory_type_id, input_byte_size, &backend_memory)); + + dev_ptr = backend_memory->MemoryPtr(); + lbackend_memory.reset(backend_memory); size_t byte_size = input_byte_size; @@ -594,14 +610,11 @@ ModelInstanceState::GetInputTensor( const_cast(dev_ptr), input_byte_size, nullptr /* DLManagedTensor */); + input_tensor->SetMemory(std::move( + PbMemory::Create(Stub()->ShmPool(), std::move(lbackend_memory)))); + RETURN_IF_EXCEPTION(input_tensor->SaveToSharedMemory( Stub()->ShmPool(), true /* copy_gpu */)); - - std::unique_ptr gpu_memory_record = - std::make_unique(input_tensor->Memory()->DataPtr()); - uint64_t memory_release_id = - Stub()->GetMemoryManager()->AddRecord(std::move(gpu_memory_record)); - input_tensor->Memory()->SetMemoryReleaseId(memory_release_id); } #else return TRITONSERVER_ErrorNew( @@ -662,6 +675,8 @@ ModelInstanceState::ExecuteBLSRequest( for (auto& input_tensor : infer_request->Inputs()) { if (!input_tensor->IsCPU()) { #ifdef TRITON_ENABLE_GPU + // Attempt to use the cuda shared memory pool for GPU tensor. + ShareCUDAMemoryPool(input_tensor->MemoryTypeId()); BackendMemory* backend_memory; std::unique_ptr lbackend_memory; has_gpu_tensor = true; @@ -1161,6 +1176,16 @@ ModelInstanceState::ResponseSendDecoupled( response_factory_ptr.reset( reinterpret_cast(response_factory)); } + +#ifdef TRITON_ENABLE_GPU + for (auto& output_tensor : infer_response->OutputTensors()) { + if (!output_tensor->IsCPU()) { + // Attempt to use the cuda shared memory pool for GPU tensor. 
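+      // Sharing is best-effort: if the pool cannot be shared with the stub,
+      // a warning is logged and CUDA IPC is used for this tensor instead
+      // (see ShareCUDAMemoryPool further down).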
+ ShareCUDAMemoryPool(output_tensor->MemoryTypeId()); + } + } +#endif // TRITON_ENABLE_GPU + infer_response->Send( response, CudaStream(), requires_deferred_callback, send_message_payload->flags, Stub()->ShmPool(), gpu_buffer_helper, @@ -1184,23 +1209,52 @@ ModelInstanceState::ResponseSendDecoupled( bool cuda_copy = false; for (auto& output_buffer_pair : gpu_output_buffers) { auto& pb_memory = output_buffer_pair.first; + void* pointer = output_buffer_pair.second; + bool cuda_used; - if (pb_memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { - bool cuda_used; - void* pointer = output_buffer_pair.second; - - CopyBuffer( - "Failed to copy the output tensor to buffer.", - TRITONSERVER_MEMORY_CPU, 0, TRITONSERVER_MEMORY_CPU, 0, - pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, - CudaStream(), &cuda_used); - cuda_copy |= cuda_used; - } + try { + if (pb_memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { + THROW_IF_TRITON_ERROR(CopyBuffer( + "Failed to copy the CPU output tensor to buffer.", + TRITONSERVER_MEMORY_CPU, 0, TRITONSERVER_MEMORY_CPU, 0, + pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, + CudaStream(), &cuda_used)); + cuda_copy |= cuda_used; + } else if ( + (pb_memory->MemoryType() == TRITONSERVER_MEMORY_GPU) && + pb_memory->UseCUDASharedPool() && + (pb_memory->DataPtr() != pointer)) { + // If the data pointer from pb_memory is not the same as the + // pointer, it means that the Triton-provided buffer is not used + // during tensor transfer. Instead, an intermediate buffer that uses + // CUDA shared memory pool is used. In this case, we need to copy + // the data from the intermediate buffer back to the Triton-provided + // buffer. + THROW_IF_TRITON_ERROR(CopyBuffer( + "Failed to copy the GPU output tensor to buffer.", + TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), + TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), + pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, + CudaStream(), &cuda_used)); + cuda_copy |= cuda_used; + } #ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); - } + if (cuda_copy) { + cudaStreamSynchronize(stream_); + } #endif // TRITON_ENABLE_GPU + } + catch (const PythonBackendException& pb_exception) { + TRITONSERVER_Error* error = TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string( + "Failed to copy output tensor to Triton-provided buffer: ") + + pb_exception.what()) + .c_str()); + SetErrorForResponseSendMessage( + send_message_payload, WrapTritonErrorInSharedPtr(error), + error_message); + } } } } else { @@ -1534,6 +1588,15 @@ ModelInstanceState::ProcessRequests( bool require_deferred_callback = false; +#ifdef TRITON_ENABLE_GPU + for (auto& output_tensor : infer_response->OutputTensors()) { + if (output_tensor->MemoryType() == TRITONSERVER_MEMORY_GPU) { + // Attempt to use the cuda shared memory pool for GPU tensor. 
+ ShareCUDAMemoryPool(output_tensor->MemoryTypeId()); + } + } +#endif // TRITON_ENABLE_GPU + gpu_output_buffers[r] = std::vector, void*>>{}; infer_response->Send( @@ -1567,10 +1630,10 @@ ModelInstanceState::ProcessRequests( for (auto& gpu_output_buffer : gpu_output_buffers) { for (auto& buffer_memory_pair : gpu_output_buffer) { auto& pb_memory = buffer_memory_pair.first; - if (pb_memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { - bool cuda_used = false; - void* pointer = buffer_memory_pair.second; + void* pointer = buffer_memory_pair.second; + bool cuda_used = false; + if (pb_memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { GUARDED_RESPOND_IF_ERROR( responses, response_index, CopyBuffer( @@ -1579,6 +1642,24 @@ ModelInstanceState::ProcessRequests( pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, CudaStream(), &cuda_used)); cuda_copy |= cuda_used; + } else if ( + (pb_memory->MemoryType() == TRITONSERVER_MEMORY_GPU) && + pb_memory->UseCUDASharedPool() && + (pb_memory->DataPtr() != pointer)) { + // If the data pointer from pb_memory is not the same as the pointer, + // it means that the Triton-provided buffer is not used during tensor + // transfer. Instead, an intermediate buffer that uses CUDA shared + // memory pool is used. In this case, we need to copy the data + // from the intermediate buffer back to the Triton-provided buffer. + GUARDED_RESPOND_IF_ERROR( + responses, response_index, + CopyBuffer( + "Failed to copy the output tensor to buffer.", + TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), + TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), + pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, + CudaStream(), &cuda_used)); + cuda_copy |= cuda_used; } } response_index++; @@ -1633,16 +1714,36 @@ ModelInstanceState::PrepareResponseHandle( std::unique_ptr* infer_response, bi::managed_external_buffer::handle_t* response_handle) { +#ifdef TRITON_ENABLE_GPU + for (auto& output_tensor : (*infer_response)->OutputTensors()) { + if (!output_tensor->IsCPU()) { + // Attempt to use the cuda shared memory pool for GPU tensor. + ShareCUDAMemoryPool(output_tensor->MemoryTypeId()); + // It's possible that the CUDA memory pool offset isn't set correctly, + // even if the BLS output is using CUDA memory. This can occur when the + // CUDA memory pool hasn't been shared with the stub process at the time + // the BLS output is allocated during the ResponseAlloc callback. In such + // cases, we need to adjust the CUDA pool offset accordingly. + if (!output_tensor->Memory()->UseCUDASharedPool()) { + output_tensor->Memory()->UpdateCUDAOffset( + Stub()->ShmPool()->GetCUDAMemoryPoolManager()); + } + } + } +#endif // TRITON_ENABLE_GPU + (*infer_response)->SaveToSharedMemory(Stub()->ShmPool()); + for (auto& output_tensor : (*infer_response)->OutputTensors()) { - // For GPU tensors we need to store the memory release id in - // memory manager. if (!output_tensor->IsCPU()) { #ifdef TRITON_ENABLE_GPU - std::unique_ptr gpu_memory_record = - std::make_unique(output_tensor->Memory()->DataPtr()); + std::unique_ptr memory_record; + // Need to transfer the ownership of the BackendMemory to the + // MemoryManager so that the lifetime of the BackendMemory is managed. 
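+      // The record is dropped, and the BackendMemory released, once the stub
+      // pushes the matching release id onto the memory manager queue (see
+      // memory_manager.cc above).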
+ memory_record = std::make_unique( + output_tensor->Memory()->GetBackendMemory()); uint64_t memory_release_id = - Stub()->GetMemoryManager()->AddRecord(std::move(gpu_memory_record)); + Stub()->GetMemoryManager()->AddRecord(std::move(memory_record)); output_tensor->Memory()->SetMemoryReleaseId(memory_release_id); #endif } @@ -1666,6 +1767,7 @@ ModelInstanceState::SendBLSDecoupledResponse( ipc_message = IPCMessage::Create(Stub()->ShmPool(), true /* inline_response */); ipc_message->Args() = response_batch_shm.handle_; + ipc_message->Command() = PYTHONSTUB_InferStreamExecResponse; PrepareResponseBatch( &response_batch, response_batch_shm, &ipc_message, &response_handle); is_response_batch_set = true; @@ -1698,6 +1800,23 @@ ModelInstanceState::SendBLSDecoupledResponse( } } +void +ModelInstanceState::ShareCUDAMemoryPool(const int32_t device_id) +{ +#ifdef TRITON_ENABLE_GPU + try { + Stub()->ShareCUDAMemoryPool(Model()->TritonMemoryManager(), device_id); + } + catch (const PythonBackendException& ex) { + LOG_MESSAGE( + TRITONSERVER_LOG_WARN, + (std::string("Failed to share CUDA memory pool with stub process: ") + + ex.what() + ". Will use CUDA IPC.") + .c_str()); + } +#endif // TRITON_ENABLE_GPU +} + ModelInstanceState::~ModelInstanceState() { ModelState* model_state = reinterpret_cast(Model()); @@ -2258,7 +2377,10 @@ TRITONBACKEND_ModelInstanceExecute( } LOG_IF_ERROR(err, "Failed to restart the stub process."); err = instance_state->Stub()->Launch(); - LOG_IF_ERROR(err, "Failed to restart the stub process."); + LOG_IF_ERROR( + err, + "Failed to restart the stub process: failed to launch " + "the stub process."); } } else { std::vector> infer_requests; diff --git a/src/python_be.h b/src/python_be.h index 51793125..fce1f417 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -415,5 +415,8 @@ class ModelInstanceState : public BackendModelInstance { // Process a model control request void ProcessModelControlRequest(const std::unique_ptr& message); + + // Attempt to share CUDA memory pool with the stub process + void ShareCUDAMemoryPool(const int32_t device_id); }; }}} // namespace triton::backend::python diff --git a/src/request_executor.cc b/src/request_executor.cc index 2a6d9575..65f53710 100644 --- a/src/request_executor.cc +++ b/src/request_executor.cc @@ -109,7 +109,6 @@ InferResponseComplete( std::string sname = cname; std::vector dims_vector{shape, shape + dim_count}; - // userp is only set for the CPU tensors if (memory_type != TRITONSERVER_MEMORY_GPU) { if (byte_size != 0) { std::shared_ptr pb_tensor = std::make_shared( @@ -129,10 +128,15 @@ InferResponseComplete( nullptr /* DLManagedTensor */)); } } else { - output_tensors.push_back(std::make_shared( + std::shared_ptr pb_tensor = std::make_shared( sname, dims_vector, datatype, memory_type, memory_type_id, const_cast(base), byte_size, - nullptr /* DLManagedTensor */)); + nullptr /* DLManagedTensor */); + + std::unique_ptr pb_memory( + reinterpret_cast(userp)); + pb_tensor->SetMemory(std::move(pb_memory)); + output_tensors.push_back(pb_tensor); } } } @@ -241,24 +245,27 @@ ResponseAlloc( } break; #ifdef TRITON_ENABLE_GPU case TRITONSERVER_MEMORY_GPU: { - auto err = cudaSetDevice(*actual_memory_type_id); - if ((err != cudaSuccess) && (err != cudaErrorNoDevice) && - (err != cudaErrorInsufficientDriver)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "unable to set current CUDA device: " + - std::string(cudaGetErrorString(err))) - .c_str()); - } + BackendMemory* backend_memory; + std::unique_ptr 
lbackend_memory; + try { + THROW_IF_TRITON_ERROR(BackendMemory::Create( + reinterpret_cast( + shm_pool->GetCUDAMemoryPoolManager()->TritonMemoryManager()), + {BackendMemory::AllocationType::GPU_POOL, + BackendMemory::AllocationType::GPU}, + *actual_memory_type_id, byte_size, &backend_memory)); + lbackend_memory.reset(backend_memory); - err = cudaMalloc(buffer, byte_size); - if (err != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - std::string( - "cudaMalloc failed: " + std::string(cudaGetErrorString(err))) - .c_str()); + std::unique_ptr pb_memory = PbMemory::Create( + shm_pool, std::move(lbackend_memory), true /* copy_gpu */); + *buffer = pb_memory->DataPtr(); + *buffer_userp = reinterpret_cast(pb_memory.get()); + pb_memory.release(); + } + catch (const PythonBackendException& pb_exception) { + TRITONSERVER_Error* err = + CreateTritonErrorFromException(pb_exception); + return err; } break; } diff --git a/src/shm_manager.cc b/src/shm_manager.cc index b52d5a4f..1c7c4d65 100644 --- a/src/shm_manager.cc +++ b/src/shm_manager.cc @@ -33,6 +33,53 @@ namespace triton { namespace backend { namespace python { +void +CUDAMemoryPoolManager::SetCUDAPoolAddress( + const int32_t device_id, void* cuda_pool_address) +{ + std::lock_guard lock(mu_); + cuda_pool_address_map_[device_id] = cuda_pool_address; +} + +void* +CUDAMemoryPoolManager::CUDAPoolAddress(const int32_t device_id) +{ + if (cuda_pool_address_map_.find(device_id) != cuda_pool_address_map_.end()) { + return cuda_pool_address_map_[device_id]; + } else { + throw PythonBackendException( + "CUDA pool address for device " + std::to_string(device_id) + + " is not set."); + } +} + +void +CUDAMemoryPoolManager::SetTritonMemoryManager(void* triton_memory_manager) +{ + triton_memory_manager_ = triton_memory_manager; +} + +void* +CUDAMemoryPoolManager::TritonMemoryManager() +{ + return triton_memory_manager_; +} + +bool +CUDAMemoryPoolManager::UseCudaSharedPool(const int32_t device_id) +{ + return (cuda_pool_address_map_.find(device_id) != + cuda_pool_address_map_.end()) && + (cuda_pool_address_map_[device_id] != nullptr) && + (triton_memory_manager_ != nullptr); +} + +std::unordered_map& +CUDAMemoryPoolManager::CUDAPoolAddressMap() +{ + return cuda_pool_address_map_; +} + SharedMemoryManager::SharedMemoryManager( const std::string& shm_region_name, size_t shm_size, size_t shm_growth_bytes, bool create) @@ -40,6 +87,7 @@ SharedMemoryManager::SharedMemoryManager( shm_region_name_ = shm_region_name; create_ = create; shm_growth_bytes_ = shm_growth_bytes; + cuda_memory_pool_manager_ = std::make_unique(); try { if (create) { @@ -99,6 +147,7 @@ SharedMemoryManager::SharedMemoryManager(const std::string& shm_region_name) shm_region_name_ = shm_region_name; create_ = false; shm_growth_bytes_ = 1024; + cuda_memory_pool_manager_ = std::make_unique(); shm_obj_ = std::make_unique( bi::open_only, shm_region_name.c_str(), bi::read_write); diff --git a/src/shm_manager.h b/src/shm_manager.h index bd462403..adfa03ac 100644 --- a/src/shm_manager.h +++ b/src/shm_manager.h @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,32 @@ namespace triton { namespace backend { namespace python { namespace bi = boost::interprocess; +class CUDAMemoryPoolManager { + public: + CUDAMemoryPoolManager() : triton_memory_manager_(nullptr) {} + + void SetCUDAPoolAddress(const int32_t device_id, void* cuda_pool_address); + + void* CUDAPoolAddress(const int32_t device_id); + + void SetTritonMemoryManager(void* 
triton_memory_manager); + + void* TritonMemoryManager(); + + bool UseCudaSharedPool(const int32_t device_id); + + // Return cuda pool address map + std::unordered_map& CUDAPoolAddressMap(); + + private: + // The base address of the Triton CUDA memory pool + std::unordered_map cuda_pool_address_map_; + // The mutex to protect the cuda_pool_address_map_ + std::mutex mu_; + // TRITONBACKEND_MemoryManager + void* triton_memory_manager_; +}; + template struct AllocatedSharedMemory { AllocatedSharedMemory() = default; @@ -157,6 +184,11 @@ class SharedMemoryManager { void SetDeleteRegion(bool delete_region); + std::unique_ptr& GetCUDAMemoryPoolManager() + { + return cuda_memory_pool_manager_; + } + ~SharedMemoryManager() noexcept(false); private: @@ -171,6 +203,7 @@ class SharedMemoryManager { uint64_t* total_size_; bool create_; bool delete_region_; + std::unique_ptr cuda_memory_pool_manager_; template AllocatedSharedMemory WrapObjectInUniquePtr( diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index a38409ec..b0627486 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -34,7 +34,6 @@ StubLauncher::StubLauncher(const std::string stub_process_kind) : parent_pid_(0), stub_pid_(0), is_initialized_(false), stub_process_kind_(stub_process_kind), model_instance_name_(""), device_id_(0), kind_("") - { } @@ -327,7 +326,7 @@ StubLauncher::Launch() // The reason it is broken into two steps is that creation of the health // monitoring thread may take longer which can make the server process think // that the stub process is unhealthy and return early. Waiting until the - // health thread is spawn would make sure would prevent this issue. + // health thread is spawn would prevent this issue. parent_message_queue_->Pop(); if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { @@ -600,4 +599,107 @@ StubLauncher::ReceiveMessageFromStub( return nullptr; // success } -}}}; // namespace triton::backend::python + +#ifdef TRITON_ENABLE_GPU +void +StubLauncher::ShareCUDAMemoryPool( + TRITONBACKEND_MemoryManager* triton_mem_manager, const int32_t device_id) +{ + std::lock_guard lock(cuda_shm_pool_mutex_); + if ((tried_sharing_cuda_pool_map_.find(device_id) != + tried_sharing_cuda_pool_map_.end()) && + tried_sharing_cuda_pool_map_[device_id]) { + return; + } + + std::unique_ptr ipc_message = + IPCMessage::Create(shm_pool_, true /* inline_response */); + CUDAMemPoolMessage* cuda_pool_message_ptr = nullptr; + PythonBackendException pb_exception(std::string{}); + + try { + // Create a dummy BackendMemory object to get the start address of the CUDA + // memory pool. + BackendMemory* backend_memory; + std::unique_ptr lbackend_memory; + + THROW_IF_TRITON_ERROR(BackendMemory::Create( + triton_mem_manager, BackendMemory::AllocationType::GPU_POOL, device_id, + 1 /* byte size*/, &backend_memory)); + lbackend_memory.reset(backend_memory); + + CUDAHandler& cuda_api = CUDAHandler::getInstance(); + CUdeviceptr cuda_pool_address = 0; + cuda_api.PointerGetAttribute( + &cuda_pool_address, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, + reinterpret_cast(lbackend_memory->MemoryPtr())); + + shm_pool_->GetCUDAMemoryPoolManager()->SetCUDAPoolAddress( + device_id, reinterpret_cast(cuda_pool_address)); + shm_pool_->GetCUDAMemoryPoolManager()->SetTritonMemoryManager( + reinterpret_cast(triton_mem_manager)); + + // Get the memory handle from the CUDA memory pool. 
+ AllocatedSharedMemory cuda_pool_message = + shm_pool_->Construct(); + cuda_pool_message_ptr = cuda_pool_message.data_.get(); + { + ScopedSetDevice scoped_set_device(device_id); + THROW_IF_CUDA_ERROR(cudaIpcGetMemHandle( + reinterpret_cast( + &cuda_pool_message_ptr->cuda_handle), + reinterpret_cast(shm_pool_->GetCUDAMemoryPoolManager() + ->CUDAPoolAddress(device_id)))); + } + + ipc_message->Command() = PYTHONSTUB_CUDAPoolInitializeRequest; + ipc_message->Args() = cuda_pool_message.handle_; + + cuda_pool_message_ptr->device_id = device_id; + cuda_pool_message_ptr->has_error = false; + cuda_pool_message_ptr->is_error_set = false; + cuda_pool_message_ptr->waiting_on_stub = false; + + { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + parent_to_stub_mq_->Push(ipc_message->ShmHandle()); + while (!cuda_pool_message_ptr->waiting_on_stub) { + ipc_message->ResponseCondition()->wait(lock); + } + } + + if (cuda_pool_message_ptr->has_error) { + if (cuda_pool_message_ptr->is_error_set) { + std::unique_ptr error_message = + PbString::LoadFromSharedMemory( + shm_pool_, cuda_pool_message_ptr->error); + throw PythonBackendException(error_message->String()); + } else { + throw PythonBackendException( + "Failed to share CUDA memory pool with stub process: " + + model_name_); + } + } + } + catch (const PythonBackendException& exception) { + shm_pool_->GetCUDAMemoryPoolManager()->SetCUDAPoolAddress( + device_id, nullptr); + pb_exception = exception; + } + + { + bi::scoped_lock lock{ + *(ipc_message->ResponseMutex())}; + cuda_pool_message_ptr->waiting_on_stub = false; + ipc_message->ResponseCondition()->notify_all(); + } + + tried_sharing_cuda_pool_map_[device_id] = true; + + if (pb_exception.what() != std::string{""}) { + throw pb_exception; + } +} +#endif // TRITON_ENABLE_GPU +}}}; // namespace triton::backend::python diff --git a/src/stub_launcher.h b/src/stub_launcher.h index 3bbd2463..fbbbdbad 100644 --- a/src/stub_launcher.h +++ b/src/stub_launcher.h @@ -151,6 +151,12 @@ class StubLauncher { TRITONSERVER_Error* ReceiveMessageFromStub( bi::managed_external_buffer::handle_t& message); +#ifdef TRITON_ENABLE_GPU + // Share CUDA memory pool with stub process + void ShareCUDAMemoryPool( + TRITONBACKEND_MemoryManager* triton_mem_manager, const int32_t device_id); +#endif // TRITON_ENABLE_GPU + private: pid_t parent_pid_; pid_t stub_pid_; @@ -196,5 +202,9 @@ class StubLauncher { ipc_control_; bi::managed_external_buffer::handle_t ipc_control_handle_; std::unique_ptr shm_pool_; +#ifdef TRITON_ENABLE_GPU + std::mutex cuda_shm_pool_mutex_; + std::unordered_map tried_sharing_cuda_pool_map_; +#endif // TRITON_ENABLE_GPU }; }}} // namespace triton::backend::python From f91cbe981e13f0be7a4aaeadbb220586b3b2b65b Mon Sep 17 00:00:00 2001 From: Piotr Marcinkiewicz Date: Thu, 26 Oct 2023 15:20:32 +0200 Subject: [PATCH 157/216] Include missing unordered_map in shm (#316) --- src/shm_manager.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/shm_manager.h b/src/shm_manager.h index adfa03ac..deae06f3 100644 --- a/src/shm_manager.h +++ b/src/shm_manager.h @@ -38,6 +38,7 @@ #include #include #include +#include #include "pb_exception.h" From cba7ed3663bdceb023cde18f8715324b5f98ad43 Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Wed, 1 Nov 2023 16:52:09 -0700 Subject: [PATCH 158/216] Fixing pre-commit issue (#318) --- src/shm_manager.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/shm_manager.h b/src/shm_manager.h index 
deae06f3..5063273b 100644 --- a/src/shm_manager.h +++ b/src/shm_manager.h @@ -37,8 +37,8 @@ #include #include #include -#include #include +#include #include "pb_exception.h" From 0f1221129a01b067b93a4abc4a2c30a9e2856e01 Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Thu, 2 Nov 2023 14:46:00 -0700 Subject: [PATCH 159/216] Enhanced python_backend autocomplete (#317) * Added to python_backend autocomplete: optional input and model_transaction_policy --- README.md | 21 +++++- src/resources/triton_python_backend_utils.py | 74 ++++++++++++++++++-- 2 files changed, 89 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 9c59c144..8a93dd07 100644 --- a/README.md +++ b/README.md @@ -249,7 +249,9 @@ class TritonPythonModel: inputs = [{ 'name': 'INPUT0', 'data_type': 'TYPE_FP32', - 'dims': [4] + 'dims': [4], + # this parameter will set `INPUT0 as an optional input` + 'optional': True }, { 'name': 'INPUT1', 'data_type': 'TYPE_FP32', @@ -394,6 +396,23 @@ function to gain read-only access to the `pb_utils.ModelConfig` object. The `pb_utils.ModelConfig` object being returned from here will be used as the final configuration for the model. +In addition to minimal properties, you can also set [model_transaction_policy]( + https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#model-transaction-policy) +through `auto_complete_config` using `set_model_transaction_policy`. +For example, +```python +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + ... + transaction_policy = {"decoupled": True} + auto_complete_model_config.set_model_transaction_policy(transaction_policy) + ... 
+``` + Note: The Python interpreter used to invoke this function will be destroyed upon returning from this function and as a result none of the objects created here will be available in the `initialize`, `execute`, or `finalize` diff --git a/src/resources/triton_python_backend_utils.py b/src/resources/triton_python_backend_utils.py index 560a3198..9828ab5b 100644 --- a/src/resources/triton_python_backend_utils.py +++ b/src/resources/triton_python_backend_utils.py @@ -381,12 +381,12 @@ def add_input(self, input): Raises ------ ValueError - If input contains property other than 'name', 'data_type' - and 'dims' or any of the properties are not set, or if an - input with the same name already exists in the configuration - but has different data_type or dims property + If input contains property other than 'name', 'data_type', + 'dims', 'optional' or any of the non-optional properties + are not set, or if an input with the same name already exists + in the configuration but has different data_type or dims property """ - valid_properties = ["name", "data_type", "dims"] + valid_properties = ["name", "data_type", "dims", "optional"] for current_property in input: if current_property not in valid_properties: raise ValueError( @@ -447,9 +447,26 @@ def add_input(self, input): + " but the model configuration specifies dims " + str(current_input["dims"]) ) + elif ( + "optional" in current_input + and "optional" in input + and current_input["optional"] != input["optional"] + ): + raise ValueError( + "model '" + + self._model_config["name"] + + "', tensor '" + + input["name"] + + "': the model expects optional " + + str(input["optional"]) + + " but the model configuration specifies optional " + + str(current_input["optional"]) + ) else: current_input["data_type"] = input["data_type"] current_input["dims"] = input["dims"] + if "optional" in input: + current_input["optional"] = input["optional"] return self._model_config["input"].append(input) @@ -538,6 +555,53 @@ def add_output(self, output): self._model_config["output"].append(output) + def set_model_transaction_policy(self, transaction_policy_dict): + """ + Set model transaction policy for the model. + Parameters + ---------- + transaction_policy_dict : dict + The dict, containing all properties to be set as a part + of `model_transaction_policy` field. + Raises + ------ + ValueError + If transaction_policy_dict contains property other + than 'decoupled', or if `model_transaction_policy` already exists + in the configuration, but has different `decoupled` property. + """ + valid_properties = ["decoupled"] + for current_property in transaction_policy_dict.keys(): + if current_property not in valid_properties: + raise ValueError( + "model transaction property in auto-complete-config " + + "function for model '" + + self._model_config["name"] + + "' contains property other than 'decoupled'." + ) + + if "model_transaction_policy" not in self._model_config: + self._model_config["model_transaction_policy"] = {} + + if "decoupled" in transaction_policy_dict.keys(): + if ( + "decoupled" in self._model_config["model_transaction_policy"] + and self._model_config["model_transaction_policy"]["decoupled"] + != transaction_policy_dict["decoupled"] + ): + raise ValueError( + "trying to change decoupled property in auto-complete-config " + + "for model '" + + self._model_config["name"] + + "', which is already set to '" + + str(self._model_config["model_transaction_policy"]["decoupled"]) + + "'." 
+ ) + + self._model_config["model_transaction_policy"][ + "decoupled" + ] = transaction_policy_dict["decoupled"] + TRITONSERVER_REQUEST_FLAG_SEQUENCE_START = 1 TRITONSERVER_REQUEST_FLAG_SEQUENCE_END = 2 From 60a9091cc232f77645a88a19ac63809866319e50 Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Tue, 7 Nov 2023 20:40:34 -0800 Subject: [PATCH 160/216] Follow up to autocomplete pr #317 (#320) * Fllow up with error msg * Setting decoupled after autocomplete in ModelState:Create * Refactor * Refactor according to Tanmay discussion --- src/python_be.cc | 23 ++++++++++++++++++++ src/python_be.h | 7 ++++++ src/resources/triton_python_backend_utils.py | 2 +- 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/python_be.cc b/src/python_be.cc index 33b2ec77..1f5a2e34 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -2005,6 +2005,29 @@ ModelState::ValidateModelConfig() return nullptr; } +TRITONSERVER_Error* +ModelState::SetModelConfig() +{ + BackendModel::SetModelConfig(); + // `Update model_transaction_policy` if setting was set + // with `set_model_transaction_policy` + triton::common::TritonJson::Value model_transaction_policy; + bool is_decoupled = false; + if (ModelConfig().Find( + "model_transaction_policy", &model_transaction_policy)) { + triton::common::TritonJson::Value decoupled; + if (model_transaction_policy.Find("decoupled", &decoupled)) { + auto error = decoupled.AsBool(&is_decoupled); + if (error != nullptr) { + throw BackendModelException(error); + } + SetDecoupled(is_decoupled); + } + } + + return nullptr; +} + extern "C" { diff --git a/src/python_be.h b/src/python_be.h index fce1f417..f8ec8cfa 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -238,6 +238,9 @@ class ModelState : public BackendModel { // Is decoupled API being used. bool IsDecoupled() { return decoupled_; } + // Set decoupled mode + void SetDecoupled(bool decoupled) { decoupled_ = decoupled; } + // Returns the value in the `runtime_modeldir_` field std::string RuntimeModelDir() { return runtime_modeldir_; } @@ -247,6 +250,10 @@ class ModelState : public BackendModel { // Validate Model Configuration TRITONSERVER_Error* ValidateModelConfig(); + // Overrides `BackendModel::SetModelConfig` to also + // set `ModelState::decoupled_` + TRITONSERVER_Error* SetModelConfig(); + // Auto-complete stub std::unique_ptr& Stub() { return auto_complete_stub_; } diff --git a/src/resources/triton_python_backend_utils.py b/src/resources/triton_python_backend_utils.py index 9828ab5b..b4732da6 100644 --- a/src/resources/triton_python_backend_utils.py +++ b/src/resources/triton_python_backend_utils.py @@ -394,7 +394,7 @@ def add_input(self, input): + input["name"] + "' in auto-complete-config function for model '" + self._model_config["name"] - + "' contains property other than 'name', 'data_type' and 'dims'." + + "' contains property other than 'name', 'data_type', 'dims' and 'optional'." 
) if "name" not in input: From 889585ce0fc4d4c88eeb69dddbe0f26d08af4b6f Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Thu, 9 Nov 2023 11:58:31 -0800 Subject: [PATCH 161/216] Add support for request rescheduling (#319) * Add support for request rescheduling * Address comment * Add documentation * Fix up for doc * Revert response sender changes * Address comment --- README.md | 97 +++++++++++++++++ src/infer_request.cc | 18 ++- src/infer_request.h | 4 + src/pb_stub.cc | 49 ++++++--- src/python_be.cc | 109 ++++++++++++++++--- src/python_be.h | 4 + src/resources/triton_python_backend_utils.py | 4 +- 7 files changed, 249 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 8a93dd07..70ebbe18 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ any C++ code. - [Decoupled mode](#decoupled-mode) - [Use Cases](#use-cases) - [Known Issues](#known-issues) + - [Request Rescheduling](#request-rescheduling) - [`finalize`](#finalize) - [Model Config File](#model-config-file) - [Inference Request Parameters](#inference-request-parameters) @@ -623,6 +624,102 @@ for more details on how to host a decoupled model. * Currently, decoupled Python models can not make async infer requests. +#### Request Rescheduling + +Starting from 23.11, Python backend supports request rescheduling. By calling +the `set_release_flags` function on the request object with the flag +`pb_utils.TRITONSERVER_REQUEST_RELEASE_RESCHEDULE`, you can reschedule the +request for further execution in a future batch. This feature is useful for +handling generative sequences. + +The model config must be configured to enable generative sequence batching in +order to use the request rescheduling API: + +``` +sequence_batching { + generative_sequence : true +} +``` + +For non-decoupled models, there can only be one response for each request. Since +the rescheduled request is the same as the original, you must append a `None` +object to the response list for the rescheduled request. For example: + +```python +import triton_python_backend_utils as pb_utils + +class TritonPythonModel: + ... + + def execute(self, requests): + responses = [] + + for request in requests: + # Explicitly reschedule the first request + if self.idx == 0: + request.set_release_flags( + pb_utils.TRITONSERVER_REQUEST_RELEASE_RESCHEDULE + ) + responses.append(None) + self.idx += 1 + else: + responses.append(inference_response) + + return responses +``` + +For decoupled models, it is required to reschedule a request *before* returning +from the `execute` function. +Below is an example of a decoupled model using request rescheduling. This model +takes 1 input tensor, an INT32 [ 1 ] input named "IN", and produces an output +tensor "OUT" with the same shape as the input tensor. The input value indicates +the total number of responses to be generated and the output value indicates the +number of remaining responses. For example, if the request input has value 2, +the model will: + - Send a response with value 1. + - Release request with RESCHEDULE flag. + - When execute on the same request, send the last response with value 0. + - Release request with ALL flag. + +```python +import triton_python_backend_utils as pb_utils + +class TritonPythonModel: + ... 
+ + def execute(self, requests): + responses = [] + + for request in requests: + in_input = pb_utils.get_input_tensor_by_name(request, "IN").as_numpy() + + if self.reset_flag: + self.remaining_response = in_input[0] + self.reset_flag = False + + response_sender = request.get_response_sender() + + self.remaining_response -= 1 + + out_output = pb_utils.Tensor( + "OUT", np.array([self.remaining_response], np.int32) + ) + response = pb_utils.InferenceResponse(output_tensors=[out_output]) + + if self.remaining_response <= 0: + response_sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + self.reset_flag = True + else: + request.set_release_flags( + pb_utils.TRITONSERVER_REQUEST_RELEASE_RESCHEDULE + ) + response_sender.send(response) + + return None +``` + ### `finalize` Implementing `finalize` is optional. This function allows you to do any clean diff --git a/src/infer_request.cc b/src/infer_request.cc index 4c2d2575..d641526e 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -50,7 +50,7 @@ InferRequest::InferRequest( model_version_(model_version), parameters_(parameters), flags_(flags), timeout_(timeout), response_factory_address_(response_factory_address), request_address_(request_address), preferred_memory_(preferred_memory), - trace_(trace) + trace_(trace), request_release_flags_(TRITONSERVER_REQUEST_RELEASE_ALL) { for (auto& input : inputs) { if (!input) { @@ -175,6 +175,20 @@ InferRequest::Trace() return trace_; } +uint32_t +InferRequest::ReleaseFlags() +{ + request_release_flags_ = infer_request_shm_ptr_->request_release_flags; + return request_release_flags_; +} + +void +InferRequest::SetReleaseFlags(const uint32_t& flags) +{ + request_release_flags_ = flags; + infer_request_shm_ptr_->request_release_flags = request_release_flags_; +} + void InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) { @@ -201,6 +215,7 @@ InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) infer_request_shm_ptr_->timeout = timeout_; infer_request_shm_ptr_->preferred_memory = preferred_memory_; infer_request_shm_ptr_->trace = trace_; + infer_request_shm_ptr_->request_release_flags = request_release_flags_; output_names_handle_shm_ptr_ = reinterpret_cast( @@ -379,6 +394,7 @@ InferRequest::InferRequest( timeout_ = infer_request_shm_ptr_->timeout; preferred_memory_ = infer_request_shm_ptr_->preferred_memory; trace_ = infer_request_shm_ptr_->trace; + request_release_flags_ = infer_request_shm_ptr_->request_release_flags; #ifdef TRITON_PB_STUB pb_cancel_ = diff --git a/src/infer_request.h b/src/infer_request.h index bc6a2acf..3d81c5d2 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -73,6 +73,7 @@ struct InferRequestShm { int32_t timeout; PreferredMemory preferred_memory; InferenceTrace trace; + uint32_t request_release_flags; }; class InferRequest { @@ -104,6 +105,8 @@ class InferRequest { void SetIsDecoupled(const bool is_decoupled); PreferredMemory& GetPreferredMemory(); InferenceTrace& Trace(); + uint32_t ReleaseFlags(); + void SetReleaseFlags(const uint32_t& flags); #ifdef TRITON_PB_STUB std::shared_ptr Exec(const bool is_decoupled); @@ -161,6 +164,7 @@ class InferRequest { bool is_decoupled_; PreferredMemory preferred_memory_; InferenceTrace trace_; + uint32_t request_release_flags_; // Shared Memory Data Structures AllocatedSharedMemory infer_request_shm_; diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 123b2832..3d473101 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -793,26 +793,39 @@ Stub::ProcessRequests(RequestBatch* 
request_batch_shm_ptr) std::to_string(response_size) + "\n"; throw PythonBackendException(err); } - for (auto& response : responses) { + + for (size_t i = 0; i < response_size; i++) { // Check the return type of execute function. - if (!py::isinstance(response)) { - std::string str = py::str(response.get_type()); - throw PythonBackendException( - std::string("Expected an 'InferenceResponse' object in the execute " - "function return list, found type '") + - str + "'."); + InferRequest* infer_request = py_request_list[i].cast(); + if (infer_request->ReleaseFlags() == + TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) { + if (!py::isinstance(responses[i])) { + // When the request is rescheduled in non-decoupled model, the + // response must be None. + std::string str = py::str(responses[i].get_type()); + throw PythonBackendException( + "Expected a None object in the execute function return list for " + "reschduled request, " + "found type '" + + str + "'."); + } + } else { + if (!py::isinstance(responses[i])) { + std::string str = py::str(responses[i].get_type()); + throw PythonBackendException( + std::string( + "Expected an 'InferenceResponse' object in the execute " + "function return list, found type '") + + str + "'."); + } + InferResponse* infer_response = responses[i].cast(); + infer_response->PruneOutputTensors( + infer_request->RequestedOutputNames()); + ProcessResponse(infer_response); + responses_shm_handle[i] = infer_response->ShmHandle(); } } response_batch_shm_ptr->batch_size = response_size; - - for (size_t i = 0; i < batch_size; i++) { - InferResponse* infer_response = responses[i].cast(); - InferRequest* infer_request = py_request_list[i].cast(); - infer_response->PruneOutputTensors(infer_request->RequestedOutputNames()); - - ProcessResponse(infer_response); - responses_shm_handle[i] = infer_response->ShmHandle(); - } } catch (const PythonBackendException& pb_exception) { has_exception = true; @@ -1675,7 +1688,9 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) "requested_output_names", &InferRequest::RequestedOutputNames, py::return_value_policy::reference_internal) .def("get_response_sender", &InferRequest::GetResponseSender) - .def("is_cancelled", &InferRequest::IsCancelled); + .def("is_cancelled", &InferRequest::IsCancelled) + .def("set_release_flags", &InferRequest::SetReleaseFlags), + py::arg("flags").none(false); py::class_>(module, "Tensor") .def(py::init(&PbTensor::FromNumpy)) diff --git a/src/python_be.cc b/src/python_be.cc index 1f5a2e34..cec2d18a 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -271,12 +271,12 @@ ModelInstanceState::IsStubProcessAlive() TRITONSERVER_Error* ModelInstanceState::SaveRequestsToSharedMemory( TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector>& pb_inference_requests, + std::vector>& pb_infer_requests, AllocatedSharedMemory& request_batch, std::shared_ptr>& responses) { // Clear any existing items in the requests vector - pb_inference_requests.clear(); + pb_infer_requests.clear(); ModelState* model_state = reinterpret_cast(Model()); RETURN_IF_EXCEPTION( @@ -375,7 +375,22 @@ ModelInstanceState::SaveRequestsToSharedMemory( std::unique_ptr infer_request; if (model_state->IsDecoupled()) { TRITONBACKEND_ResponseFactory* factory_ptr; - RETURN_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request)); + // Reuse the response factory if there is already a response factory + // associated with the request + std::lock_guard guard{response_factory_map_mutex_}; + { + if 
(response_factory_map_.find(reinterpret_cast(request)) != + response_factory_map_.end()) { + factory_ptr = + response_factory_map_[reinterpret_cast(request)]; + } else { + RETURN_IF_ERROR( + TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request)); + response_factory_map_[reinterpret_cast(request)] = + factory_ptr; + } + } + infer_request = std::make_unique( id, correlation_id, pb_input_tensors, requested_output_names, model_state->Name(), model_state->Version(), parameters_string, flags, @@ -393,7 +408,7 @@ ModelInstanceState::SaveRequestsToSharedMemory( RETURN_IF_EXCEPTION(infer_request->SaveToSharedMemory(Stub()->ShmPool())); requests_shm[r] = infer_request->ShmHandle(); - pb_inference_requests.emplace_back(std::move(infer_request)); + pb_infer_requests.emplace_back(std::move(infer_request)); } return nullptr; // success @@ -1149,8 +1164,16 @@ ModelInstanceState::ResponseSendDecoupled( reinterpret_cast( send_message_payload->response_factory_address); if (send_message_payload->flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { - std::lock_guard guard{closed_requests_mutex_}; - closed_requests_.push_back(send_message_payload->request_address); + { + std::lock_guard guard{closed_requests_mutex_}; + closed_requests_.push_back(send_message_payload->request_address); + } + + // Clean up the response factory map. + { + std::lock_guard guard{response_factory_map_mutex_}; + response_factory_map_.erase(send_message_payload->request_address); + } } if (send_message_payload->response != 0) { @@ -1275,7 +1298,7 @@ ModelInstanceState::ResponseSendDecoupled( TRITONSERVER_Error* ModelInstanceState::ProcessRequestsDecoupled( TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector>& pb_inference_requests, + std::vector>& pb_infer_requests, PbMetricReporter& reporter) { NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); @@ -1301,8 +1324,7 @@ ModelInstanceState::ProcessRequestsDecoupled( std::shared_ptr> responses; RETURN_IF_ERROR(SaveRequestsToSharedMemory( - requests, request_count, pb_inference_requests, request_batch, - responses)); + requests, request_count, pb_infer_requests, request_batch, responses)); uint64_t compute_start_ns = 0; SET_TIMESTAMP(compute_start_ns); @@ -1342,6 +1364,11 @@ ModelInstanceState::ProcessRequestsDecoupled( TRITONSERVER_ERROR_INTERNAL, error->String().c_str()); } + // Reset the release flags for all the requests. + for (auto& infer_request : pb_infer_requests) { + infer_request->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); + } + return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, "Failed to process the requests."); } @@ -1352,6 +1379,7 @@ ModelInstanceState::ProcessRequestsDecoupled( void ModelInstanceState::ProcessRequests( TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector>& pb_infer_requests, bool& restart) { NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); @@ -1399,12 +1427,11 @@ ModelInstanceState::ProcessRequests( // Wait for all the pending BLS requests to be completed. 
ScopedDefer bls_defer([this] { WaitForBLSRequestsToFinish(); }); - std::vector> pb_inference_requests; AllocatedSharedMemory request_batch; RESPOND_ALL_AND_RETURN_IF_ERROR( responses, request_count, SaveRequestsToSharedMemory( - requests, request_count, pb_inference_requests, request_batch, + requests, request_count, pb_infer_requests, request_batch, responses)); std::shared_ptr ipc_message = @@ -1515,6 +1542,11 @@ ModelInstanceState::ProcessRequests( RespondErrorToAllRequests( error_message, responses, requests, request_count); } + + // Reset the release flags for all the requests. + for (auto& infer_request : pb_infer_requests) { + infer_request->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); + } return; } @@ -1542,6 +1574,15 @@ ModelInstanceState::ProcessRequests( shm_responses.emplace_back(nullptr); std::unique_ptr& infer_response = shm_responses.back(); try { + if (pb_infer_requests[r]->ReleaseFlags() == + TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) { + // For rescheduled requests, we do not need to send a response. + LOG_IF_ERROR( + TRITONBACKEND_ResponseDelete((*responses)[r]), + "failed to delete response"); + (*responses)[r] = nullptr; + continue; + } infer_response = InferResponse::LoadFromSharedMemory( Stub()->ShmPool(), response_shm_handle[r], false /* open_cuda_handle */); @@ -1557,6 +1598,9 @@ ModelInstanceState::ProcessRequests( TRITONSERVER_ErrorDelete(err); (*responses)[r] = nullptr; + // Reset the release flags for the request. + pb_infer_requests[r]->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); + // If has_error is true, we do not look at the response tensors. continue; } @@ -1570,6 +1614,10 @@ ModelInstanceState::ProcessRequests( "failed sending response"); TRITONSERVER_ErrorDelete(err); (*responses)[r] = nullptr; + + // Reset the release flags for the request. + pb_infer_requests[r]->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); + continue; } @@ -2385,8 +2433,10 @@ TRITONBACKEND_ModelInstanceExecute( bool restart = false; ModelState* model_state = reinterpret_cast(instance_state->Model()); + std::vector> infer_requests; if (!model_state->IsDecoupled()) { - instance_state->ProcessRequests(requests, request_count, restart); + instance_state->ProcessRequests( + requests, request_count, infer_requests, restart); if (restart) { LOG_MESSAGE( @@ -2404,10 +2454,12 @@ TRITONBACKEND_ModelInstanceExecute( err, "Failed to restart the stub process: failed to launch " "the stub process."); + // Reset the release flags for all the requests. + for (auto& infer_request : infer_requests) { + infer_request->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); + } } } else { - std::vector> infer_requests; - uint64_t exec_start_ns = 0; SET_TIMESTAMP(exec_start_ns); @@ -2456,11 +2508,34 @@ TRITONBACKEND_ModelInstanceExecute( } } + // The InferRequest object might not be created if an error occurs. Explicitly + // update the release flags here based on the number of InferRequest objects. 
+ std::vector request_release_flags( + request_count, TRITONSERVER_REQUEST_RELEASE_ALL); + for (size_t i = 0; i < infer_requests.size(); ++i) { + request_release_flags[i] = infer_requests[i]->ReleaseFlags(); + } + for (uint32_t r = 0; r < request_count; ++r) { TRITONBACKEND_Request* request = requests[r]; - LOG_IF_ERROR( - TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL), - "failed releasing request"); + try { + THROW_IF_TRITON_ERROR( + TRITONBACKEND_RequestRelease(request, request_release_flags[r])); + } + catch (const PythonBackendException& pb_exception) { + LOG_MESSAGE( + TRITONSERVER_LOG_ERROR, + (std::string("Failed to release request: ") + pb_exception.what()) + .c_str()); + if (request_release_flags[r] == TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) { + // If error occurs during request rescheduling, release the request with + // `TRITONSERVER_REQUEST_RELEASE_ALL` flag. + LOG_IF_ERROR( + TRITONBACKEND_RequestRelease( + request, TRITONSERVER_REQUEST_RELEASE_ALL), + "Failed to release request."); + } + } } LOG_MESSAGE( diff --git a/src/python_be.h b/src/python_be.h index f8ec8cfa..5504e0c9 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -288,6 +288,9 @@ class ModelInstanceState : public BackendModelInstance { std::unique_ptr thread_pool_; std::unordered_map> infer_payload_; std::unique_ptr request_executor_; + std::mutex response_factory_map_mutex_; + std::unordered_map + response_factory_map_; public: static TRITONSERVER_Error* Create( @@ -338,6 +341,7 @@ class ModelInstanceState : public BackendModelInstance { // Process all the requests obtained from Triton. void ProcessRequests( TRITONBACKEND_Request** requests, const uint32_t request_count, + std::vector>& pb_infer_requests, bool& restart); // Process all the requests in the decoupled mode. diff --git a/src/resources/triton_python_backend_utils.py b/src/resources/triton_python_backend_utils.py index b4732da6..de332cf7 100644 --- a/src/resources/triton_python_backend_utils.py +++ b/src/resources/triton_python_backend_utils.py @@ -1,4 +1,4 @@ -# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -606,3 +606,5 @@ def set_model_transaction_policy(self, transaction_policy_dict): TRITONSERVER_REQUEST_FLAG_SEQUENCE_START = 1 TRITONSERVER_REQUEST_FLAG_SEQUENCE_END = 2 TRITONSERVER_RESPONSE_COMPLETE_FINAL = 1 +TRITONSERVER_REQUEST_RELEASE_ALL = 1 +TRITONSERVER_REQUEST_RELEASE_RESCHEDULE = 2 From 6a53b8709a5b53fa9c3d3c05fdfd31461a593fd4 Mon Sep 17 00:00:00 2001 From: Neelay Shah Date: Mon, 20 Nov 2023 14:30:15 -0800 Subject: [PATCH 162/216] updated naming from generative to iterative --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 70ebbe18..9182ae37 100644 --- a/README.md +++ b/README.md @@ -630,14 +630,14 @@ Starting from 23.11, Python backend supports request rescheduling. By calling the `set_release_flags` function on the request object with the flag `pb_utils.TRITONSERVER_REQUEST_RELEASE_RESCHEDULE`, you can reschedule the request for further execution in a future batch. This feature is useful for -handling generative sequences. +handling iterative sequences. 
-The model config must be configured to enable generative sequence batching in +The model config must be configured to enable iterative sequence batching in order to use the request rescheduling API: ``` sequence_batching { - generative_sequence : true + iterative_sequence : true } ``` From ffbac67072c903210440b552333a2b8346de17db Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Mon, 4 Dec 2023 15:29:44 -0800 Subject: [PATCH 163/216] BLS Timeout Fix (#315) * Pass request timeout and increase size of timeout variable --- src/infer_request.cc | 4 ++-- src/infer_request.h | 8 ++++---- src/pb_stub.cc | 2 +- src/python_be.cc | 8 ++++++-- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index d641526e..da2a6b6c 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -42,7 +42,7 @@ InferRequest::InferRequest( const std::vector>& inputs, const std::set& requested_output_names, const std::string& model_name, const int64_t model_version, - const std::string& parameters, const uint32_t flags, const int32_t timeout, + const std::string& parameters, const uint32_t flags, const uint64_t timeout, const intptr_t response_factory_address, const intptr_t request_address, const PreferredMemory& preferred_memory, const InferenceTrace& trace) : request_id_(request_id), correlation_id_(correlation_id), inputs_(inputs), @@ -145,7 +145,7 @@ InferRequest::ShmHandle() return shm_handle_; } -int32_t +uint64_t InferRequest::Timeout() { return timeout_; diff --git a/src/infer_request.h b/src/infer_request.h index 3d81c5d2..38850c61 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -70,7 +70,7 @@ struct InferRequestShm { intptr_t address; intptr_t response_factory_address; bool is_decoupled; - int32_t timeout; + uint64_t timeout; PreferredMemory preferred_memory; InferenceTrace trace; uint32_t request_release_flags; @@ -84,7 +84,7 @@ class InferRequest { const std::set& requested_output_names, const std::string& model_name, const int64_t model_version, const std::string& parameters, const uint32_t flags = 0, - const int32_t timeout = 0, const intptr_t response_factory_address = 0, + const uint64_t timeout = 0, const intptr_t response_factory_address = 0, const intptr_t request_address = 0, const PreferredMemory& preferred_memory = PreferredMemory(PreferredMemory::DEFAULT, 0), @@ -100,7 +100,7 @@ class InferRequest { void SetFlags(uint32_t flags); const std::set& RequestedOutputNames(); bi::managed_external_buffer::handle_t ShmHandle(); - int32_t Timeout(); + uint64_t Timeout(); bool IsDecoupled(); void SetIsDecoupled(const bool is_decoupled); PreferredMemory& GetPreferredMemory(); @@ -158,7 +158,7 @@ class InferRequest { int64_t model_version_; std::string parameters_; uint32_t flags_; - int32_t timeout_; + uint64_t timeout_; intptr_t response_factory_address_; intptr_t request_address_; bool is_decoupled_; diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 3d473101..4c5e9ae7 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -1581,7 +1581,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) const std::vector& requested_output_names, const std::string& model_name, const int64_t model_version, const uint32_t flags, - const int32_t timeout, + const uint64_t timeout, const PreferredMemory& preferred_memory, const InferenceTrace& trace, const py::object& parameters_) { diff --git a/src/python_be.cc b/src/python_be.cc index cec2d18a..ccdae3e4 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -372,6 +372,10 @@ 
ModelInstanceState::SaveRequestsToSharedMemory( } InferenceTrace trace = InferenceTrace(triton_trace); + uint64_t request_timeout; + RETURN_IF_ERROR(TRITONBACKEND_InferenceRequestTimeoutMicroseconds( + request, &request_timeout)); + std::unique_ptr infer_request; if (model_state->IsDecoupled()) { TRITONBACKEND_ResponseFactory* factory_ptr; @@ -394,14 +398,14 @@ ModelInstanceState::SaveRequestsToSharedMemory( infer_request = std::make_unique( id, correlation_id, pb_input_tensors, requested_output_names, model_state->Name(), model_state->Version(), parameters_string, flags, - 0 /* BLS request timeout*/, reinterpret_cast(factory_ptr), + request_timeout, reinterpret_cast(factory_ptr), reinterpret_cast(request), PreferredMemory(PreferredMemory::DEFAULT, 0), trace); } else { infer_request = std::make_unique( id, correlation_id, pb_input_tensors, requested_output_names, model_state->Name(), model_state->Version(), parameters_string, flags, - 0 /* BLS request timeout*/, 0 /* response_factory_address */, + request_timeout, 0 /* response_factory_address */, reinterpret_cast(request), PreferredMemory(PreferredMemory::DEFAULT, 0), trace); } From 8b0fa4cc5daa4b1891cdc5b0b42079dbe2a60eae Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Fri, 8 Dec 2023 14:00:03 -0800 Subject: [PATCH 164/216] Fix BLS decoupled segfault and hang (#325) * Store InferPayload using the address of the object managed by the shared_ptr * Fix hang * Release GIL before sending message to the other process * Release GIL in the beginning --- src/infer_request.cc | 8 +++++++- src/python_be.cc | 4 ++-- src/python_be.h | 2 +- src/response_sender.cc | 7 +++++++ 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index da2a6b6c..c21feeaa 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -442,6 +442,13 @@ InferRequest::GetResponseSender() std::shared_ptr InferRequest::Exec(const bool is_decoupled) { + // Release the GIL. This avoids a potential deadlock situation in the parent + // process, where every thread in the thread pool is indirectly waiting for a + // function in the stub process that acquires the GIL. Meanwhile, the current + // thread, which holds the GIL, is also waiting for the parent side to have + // the next available thread to pick up the job during resource contention. + py::gil_scoped_release release; + // BLS should not be used in "initialize" or "finalize" function. std::unique_ptr& stub = Stub::GetOrCreateInstance(); if (!stub->IsInitialized() || stub->IsFinalizing()) { @@ -465,7 +472,6 @@ InferRequest::Exec(const bool is_decoupled) }); try { - py::gil_scoped_release release; ipc_message = IPCMessage::Create(shm_pool, true /* inline_response */); bool has_exception = false; PythonBackendException pb_exception(std::string{}); diff --git a/src/python_be.cc b/src/python_be.cc index ccdae3e4..6de5bcf3 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -752,7 +752,7 @@ ModelInstanceState::ExecuteBLSRequest( if (is_decoupled && (infer_response->Id() != nullptr)) { // Need to manage the lifetime of InferPayload object for bls // decoupled responses. 
- infer_payload_[reinterpret_cast(&infer_payload)] = + infer_payload_[reinterpret_cast(infer_payload.get())] = infer_payload; } @@ -943,7 +943,7 @@ ModelInstanceState::ProcessBLSCleanupRequest( reinterpret_cast(cleanup_request_message.data_.get()); void* id = cleanup_message_ptr->id; - infer_payload_.erase(id); + infer_payload_.erase(reinterpret_cast(id)); { bi::scoped_lock lock{*(message->ResponseMutex())}; diff --git a/src/python_be.h b/src/python_be.h index 5504e0c9..2fc755ca 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -286,7 +286,7 @@ class ModelInstanceState : public BackendModelInstance { std::unique_ptr received_message_; std::vector> futures_; std::unique_ptr thread_pool_; - std::unordered_map> infer_payload_; + std::unordered_map> infer_payload_; std::unique_ptr request_executor_; std::mutex response_factory_map_mutex_; std::unordered_map diff --git a/src/response_sender.cc b/src/response_sender.cc index 1e2e9b50..c6b8f788 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -50,6 +50,13 @@ void ResponseSender::Send( std::shared_ptr infer_response, const uint32_t flags) { + // Release the GIL. This avoids a potential deadlock situation in the parent + // process, where every thread in the thread pool is indirectly waiting for a + // function in the stub process that acquires the GIL. Meanwhile, the current + // thread, which holds the GIL, is also waiting for the parent side to have + // the next available thread to pick up the job during resource contention. + py::gil_scoped_release release; + if (closed_) { throw PythonBackendException( "Unable to send response. Response sender has been closed."); From c5f304decda609ab21a004c525436e58dd527190 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Thu, 14 Dec 2023 16:03:59 -0800 Subject: [PATCH 165/216] Fix segfault for decoupled models (#327) * Set release flags and clean up response factory map before returning error * Address comments * Move the cleanup function to the outside scope * Delete response factory when response sender goes out of scope --- src/infer_request.cc | 14 -------- src/infer_request.h | 4 --- src/ipc_message.h | 3 +- src/pb_response_iterator.cc | 2 +- src/pb_stub.cc | 18 ++++++---- src/pb_stub.h | 9 +++-- src/python_be.cc | 68 +++++++++---------------------------- src/python_be.h | 6 ++-- src/response_sender.cc | 7 ++++ src/response_sender.h | 1 + 10 files changed, 47 insertions(+), 85 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index c21feeaa..f18900d0 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -405,20 +405,6 @@ InferRequest::InferRequest( #endif } -#ifndef TRITON_PB_STUB -TRITONSERVER_Error* -InferRequest::DeleteResponseFactory() -{ - TRITONBACKEND_ResponseFactory* response_factory = - reinterpret_cast( - response_factory_address_); - TRITONSERVER_Error* error = - TRITONBACKEND_ResponseFactoryDelete(response_factory); - - return error; -} -#endif - #ifdef TRITON_PB_STUB bool InferRequest::IsCancelled() diff --git a/src/infer_request.h b/src/infer_request.h index 38850c61..b8dee87c 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -137,10 +137,6 @@ class InferRequest { intptr_t RequestAddress(); ~InferRequest() {} -#ifndef TRITON_PB_STUB - TRITONSERVER_Error* DeleteResponseFactory(); -#endif - private: InferRequest( AllocatedSharedMemory& infer_request_shm, diff --git a/src/ipc_message.h b/src/ipc_message.h index d720a84d..866070f6 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -54,7 +54,8 @@ typedef enum 
PYTHONSTUB_commandtype_enum { PYTHONSTUB_AutoCompleteRequest, PYTHONSTUB_AutoCompleteResponse, PYTHONSTUB_LogRequest, - PYTHONSTUB_CleanupRequest, + PYTHONSTUB_BLSDecoupledInferPayloadCleanup, + PYTHONSTUB_BLSDecoupledResponseFactoryCleanup, PYTHONSTUB_MetricFamilyRequestNew, PYTHONSTUB_MetricFamilyRequestDelete, PYTHONSTUB_MetricRequestNew, diff --git a/src/pb_response_iterator.cc b/src/pb_response_iterator.cc index 1e0d631a..9abf4997 100644 --- a/src/pb_response_iterator.cc +++ b/src/pb_response_iterator.cc @@ -133,7 +133,7 @@ void ResponseIterator::Clear() { std::unique_ptr& stub = Stub::GetOrCreateInstance(); - stub->EnqueueCleanupId(id_); + stub->EnqueueCleanupId(id_, PYTHONSTUB_BLSDecoupledInferPayloadCleanup); { std::lock_guard lock{mu_}; response_buffer_.push(DUMMY_MESSAGE); diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 4c5e9ae7..53a6c540 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -993,8 +993,12 @@ Stub::ServiceStubToParentRequests() stub_to_parent_buffer_.pop(); if (utils_msg_payload->command_type == PYTHONSTUB_LogRequest) { SendLogMessage(utils_msg_payload); - } else if (utils_msg_payload->command_type == PYTHONSTUB_CleanupRequest) { - SendCleanupId(utils_msg_payload); + } else if ( + (utils_msg_payload->command_type == + PYTHONSTUB_BLSDecoupledInferPayloadCleanup) || + (utils_msg_payload->command_type == + PYTHONSTUB_BLSDecoupledResponseFactoryCleanup)) { + SendCleanupId(utils_msg_payload, utils_msg_payload->command_type); } else if ( utils_msg_payload->command_type == PYTHONSTUB_IsRequestCancelled) { SendIsCancelled(utils_msg_payload); @@ -1040,7 +1044,9 @@ Stub::SendLogMessage(std::unique_ptr& utils_msg_payload) } void -Stub::SendCleanupId(std::unique_ptr& utils_msg_payload) +Stub::SendCleanupId( + std::unique_ptr& utils_msg_payload, + const PYTHONSTUB_CommandType& command_type) { void* id = utils_msg_payload->utils_message_ptr; { @@ -1050,7 +1056,7 @@ Stub::SendCleanupId(std::unique_ptr& utils_msg_payload) std::unique_ptr ipc_message = IPCMessage::Create(shm_pool_, true /* inline_response */); - ipc_message->Command() = PYTHONSTUB_CleanupRequest; + ipc_message->Command() = command_type; AllocatedSharedMemory cleanup_request_message = shm_pool_->Construct( sizeof(CleanupMessage) + @@ -1072,11 +1078,11 @@ Stub::SendCleanupId(std::unique_ptr& utils_msg_payload) } void -Stub::EnqueueCleanupId(void* id) +Stub::EnqueueCleanupId(void* id, const PYTHONSTUB_CommandType& command_type) { if (id != nullptr) { std::unique_ptr utils_msg_payload = - std::make_unique(PYTHONSTUB_CleanupRequest, id); + std::make_unique(command_type, id); EnqueueUtilsMessage(std::move(utils_msg_payload)); } } diff --git a/src/pb_stub.h b/src/pb_stub.h index 12b47abc..74a66b95 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -315,10 +315,13 @@ class Stub { std::shared_ptr infer_response); /// Send the id to the python backend for object cleanup - void SendCleanupId(std::unique_ptr& utils_msg_payload); + void SendCleanupId( + std::unique_ptr& utils_msg_payload, + const PYTHONSTUB_CommandType& command_type); - /// Add cleanup id to queue - void EnqueueCleanupId(void* id); + /// Add cleanup id to queue. This is used for cleaning up the infer_payload + /// and the response factory for BLS decoupled response. 
+ void EnqueueCleanupId(void* id, const PYTHONSTUB_CommandType& command_type); /// Add request cancellation query to queue void EnqueueIsCancelled(PbCancel* pb_cancel); diff --git a/src/python_be.cc b/src/python_be.cc index 6de5bcf3..8dfa72b1 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -379,21 +379,7 @@ ModelInstanceState::SaveRequestsToSharedMemory( std::unique_ptr infer_request; if (model_state->IsDecoupled()) { TRITONBACKEND_ResponseFactory* factory_ptr; - // Reuse the response factory if there is already a response factory - // associated with the request - std::lock_guard guard{response_factory_map_mutex_}; - { - if (response_factory_map_.find(reinterpret_cast(request)) != - response_factory_map_.end()) { - factory_ptr = - response_factory_map_[reinterpret_cast(request)]; - } else { - RETURN_IF_ERROR( - TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request)); - response_factory_map_[reinterpret_cast(request)] = - factory_ptr; - } - } + RETURN_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request)); infer_request = std::make_unique( id, correlation_id, pb_input_tensors, requested_output_names, @@ -843,7 +829,8 @@ ModelInstanceState::StubToParentMQMonitor() ProcessLogRequest(message); break; } - case PYTHONSTUB_CleanupRequest: { + case PYTHONSTUB_BLSDecoupledInferPayloadCleanup: + case PYTHONSTUB_BLSDecoupledResponseFactoryCleanup: { ProcessBLSCleanupRequest(message); break; } @@ -941,9 +928,17 @@ ModelInstanceState::ProcessBLSCleanupRequest( Stub()->ShmPool()->Load(message->Args()); CleanupMessage* cleanup_message_ptr = reinterpret_cast(cleanup_request_message.data_.get()); - - void* id = cleanup_message_ptr->id; - infer_payload_.erase(reinterpret_cast(id)); + intptr_t id = reinterpret_cast(cleanup_message_ptr->id); + if (message->Command() == PYTHONSTUB_BLSDecoupledInferPayloadCleanup) { + // Remove the InferPayload object from the map. + infer_payload_.erase(id); + } else if ( + message->Command() == PYTHONSTUB_BLSDecoupledResponseFactoryCleanup) { + // Delete response factory + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + response_factory(reinterpret_cast(id)); + } { bi::scoped_lock lock{*(message->ResponseMutex())}; @@ -1172,12 +1167,6 @@ ModelInstanceState::ResponseSendDecoupled( std::lock_guard guard{closed_requests_mutex_}; closed_requests_.push_back(send_message_payload->request_address); } - - // Clean up the response factory map. 
- { - std::lock_guard guard{response_factory_map_mutex_}; - response_factory_map_.erase(send_message_payload->request_address); - } } if (send_message_payload->response != 0) { @@ -1195,14 +1184,7 @@ ModelInstanceState::ResponseSendDecoupled( error_message); std::vector, void*>> gpu_output_buffers; - std::unique_ptr< - TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> - response_factory_ptr; GPUBuffersHelper gpu_buffer_helper; - if (send_message_payload->flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { - response_factory_ptr.reset( - reinterpret_cast(response_factory)); - } #ifdef TRITON_ENABLE_GPU for (auto& output_tensor : infer_response->OutputTensors()) { @@ -1289,13 +1271,6 @@ ModelInstanceState::ResponseSendDecoupled( response_factory, send_message_payload->flags); SetErrorForResponseSendMessage( send_message_payload, WrapTritonErrorInSharedPtr(error), error_message); - - if (send_message_payload->flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { - std::unique_ptr< - TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> - response_factory(reinterpret_cast( - send_message_payload->response_factory_address)); - } } } @@ -1368,11 +1343,6 @@ ModelInstanceState::ProcessRequestsDecoupled( TRITONSERVER_ERROR_INTERNAL, error->String().c_str()); } - // Reset the release flags for all the requests. - for (auto& infer_request : pb_infer_requests) { - infer_request->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); - } - return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, "Failed to process the requests."); } @@ -2499,15 +2469,9 @@ TRITONBACKEND_ModelInstanceExecute( } } - // We should only delete the response factory for the requests that have - // not been closed. for (auto& infer_request : infer_requests) { - if (!instance_state->ExistsInClosedRequests( - infer_request->RequestAddress())) { - LOG_IF_ERROR( - infer_request->DeleteResponseFactory(), - "Failed to delete the response factory."); - } + // Reset the release flags for all the requests. 
+ infer_request->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); } } } diff --git a/src/python_be.h b/src/python_be.h index 2fc755ca..e644e159 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -288,9 +288,6 @@ class ModelInstanceState : public BackendModelInstance { std::unique_ptr thread_pool_; std::unordered_map> infer_payload_; std::unique_ptr request_executor_; - std::mutex response_factory_map_mutex_; - std::unordered_map - response_factory_map_; public: static TRITONSERVER_Error* Create( @@ -403,7 +400,8 @@ class ModelInstanceState : public BackendModelInstance { std::unique_ptr* infer_response, bi::managed_external_buffer::handle_t* response_handle); - // Process the bls decoupled cleanup request + // Process the bls decoupled cleanup request for InferPayload and + // ResponseFactory void ProcessBLSCleanupRequest(const std::unique_ptr& message); // Process request cancellation query diff --git a/src/response_sender.cc b/src/response_sender.cc index c6b8f788..fe06e554 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -45,6 +45,13 @@ ResponseSender::ResponseSender( { } +ResponseSender::~ResponseSender() +{ + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + stub->EnqueueCleanupId( + reinterpret_cast(response_factory_address_), + PYTHONSTUB_BLSDecoupledResponseFactoryCleanup); +} void ResponseSender::Send( diff --git a/src/response_sender.h b/src/response_sender.h index fda0d5d3..d29a6ab6 100644 --- a/src/response_sender.h +++ b/src/response_sender.h @@ -38,6 +38,7 @@ class ResponseSender { intptr_t request_address, intptr_t response_factory_address, std::unique_ptr& shm_pool, const std::shared_ptr& pb_cancel); + ~ResponseSender(); void Send(std::shared_ptr response, const uint32_t flags); bool IsCancelled(); From 7551f036fead433ab29edc21dd58e6ccc10b2daa Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 18 Dec 2023 10:22:51 -0500 Subject: [PATCH 166/216] Fix warning for GPU tensors (#330) --- src/infer_response.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/infer_response.cc b/src/infer_response.cc index 09737b26..5a898a7e 100644 --- a/src/infer_response.cc +++ b/src/infer_response.cc @@ -211,6 +211,10 @@ InferResponse::Send( std::vector, void*>>& output_buffers, const std::set& requested_output_names) { +#ifdef TRITON_ENABLE_GPU + static bool log_warning = true; +#endif // TRITON_ENABLE_GPU + std::shared_ptr response_error = WrapTritonErrorInSharedPtr(nullptr); std::unique_ptr response_error_handling; @@ -249,11 +253,6 @@ InferResponse::Send( } bool cuda_copy = false; -#ifdef TRITON_ENABLE_GPU - // This variable is used to avoid printing the same message multiple times - // when the output tensor is failed to be allocated from the CUDA memory pool. - bool log_warning = true; -#endif // TRITON_ENABLE_GPU for (auto& output_tensor : OutputTensors()) { // FIXME: for decoupled models we will skip the requested output names. 
From 950c47f0f989ae757136ff7d6441d653d6009de1 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Mon, 18 Dec 2023 16:16:18 -0800 Subject: [PATCH 167/216] Update name of ipc message type (#329) --- src/ipc_message.h | 2 +- src/pb_stub.cc | 2 +- src/python_be.cc | 9 ++++----- src/python_be.h | 5 ++--- src/response_sender.cc | 2 +- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/ipc_message.h b/src/ipc_message.h index 866070f6..ac28238c 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -55,7 +55,7 @@ typedef enum PYTHONSTUB_commandtype_enum { PYTHONSTUB_AutoCompleteResponse, PYTHONSTUB_LogRequest, PYTHONSTUB_BLSDecoupledInferPayloadCleanup, - PYTHONSTUB_BLSDecoupledResponseFactoryCleanup, + PYTHONSTUB_DecoupledResponseFactoryCleanup, PYTHONSTUB_MetricFamilyRequestNew, PYTHONSTUB_MetricFamilyRequestDelete, PYTHONSTUB_MetricRequestNew, diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 53a6c540..d1f8f6fd 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -997,7 +997,7 @@ Stub::ServiceStubToParentRequests() (utils_msg_payload->command_type == PYTHONSTUB_BLSDecoupledInferPayloadCleanup) || (utils_msg_payload->command_type == - PYTHONSTUB_BLSDecoupledResponseFactoryCleanup)) { + PYTHONSTUB_DecoupledResponseFactoryCleanup)) { SendCleanupId(utils_msg_payload, utils_msg_payload->command_type); } else if ( utils_msg_payload->command_type == PYTHONSTUB_IsRequestCancelled) { diff --git a/src/python_be.cc b/src/python_be.cc index 8dfa72b1..3c9dd19d 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -830,8 +830,8 @@ ModelInstanceState::StubToParentMQMonitor() break; } case PYTHONSTUB_BLSDecoupledInferPayloadCleanup: - case PYTHONSTUB_BLSDecoupledResponseFactoryCleanup: { - ProcessBLSCleanupRequest(message); + case PYTHONSTUB_DecoupledResponseFactoryCleanup: { + ProcessCleanupRequest(message); break; } case PYTHONSTUB_IsRequestCancelled: { @@ -921,7 +921,7 @@ ModelInstanceState::ProcessLogRequest( } void -ModelInstanceState::ProcessBLSCleanupRequest( +ModelInstanceState::ProcessCleanupRequest( const std::unique_ptr& message) { AllocatedSharedMemory cleanup_request_message = @@ -932,8 +932,7 @@ ModelInstanceState::ProcessBLSCleanupRequest( if (message->Command() == PYTHONSTUB_BLSDecoupledInferPayloadCleanup) { // Remove the InferPayload object from the map. 
infer_payload_.erase(id); - } else if ( - message->Command() == PYTHONSTUB_BLSDecoupledResponseFactoryCleanup) { + } else if (message->Command() == PYTHONSTUB_DecoupledResponseFactoryCleanup) { // Delete response factory std::unique_ptr< TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> diff --git a/src/python_be.h b/src/python_be.h index e644e159..f5620d07 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -400,9 +400,8 @@ class ModelInstanceState : public BackendModelInstance { std::unique_ptr* infer_response, bi::managed_external_buffer::handle_t* response_handle); - // Process the bls decoupled cleanup request for InferPayload and - // ResponseFactory - void ProcessBLSCleanupRequest(const std::unique_ptr& message); + // Process the decoupled cleanup request for InferPayload and ResponseFactory + void ProcessCleanupRequest(const std::unique_ptr& message); // Process request cancellation query void ProcessIsRequestCancelled(const std::unique_ptr& message); diff --git a/src/response_sender.cc b/src/response_sender.cc index fe06e554..94e3f0c8 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -50,7 +50,7 @@ ResponseSender::~ResponseSender() std::unique_ptr& stub = Stub::GetOrCreateInstance(); stub->EnqueueCleanupId( reinterpret_cast(response_factory_address_), - PYTHONSTUB_BLSDecoupledResponseFactoryCleanup); + PYTHONSTUB_DecoupledResponseFactoryCleanup); } void From 2bdb14c03011c618ddd5e8080d70052c34b19a9f Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 8 Jan 2024 14:33:54 -0800 Subject: [PATCH 168/216] Move from jfrog artifactory to archives.boost.io to fix boost download (#334) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 54341e01..6fae6a00 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,7 +100,7 @@ FetchContent_MakeAvailable(dlpack) # ExternalProject_Add( boostorg - URL https://boostorg.jfrog.io/artifactory/main/release/1.79.0/source/boost_1_79_0.tar.gz + URL https://archives.boost.io/release/1.79.0/source/boost_1_79_0.tar.gz URL_HASH SHA256=273f1be93238a068aba4f9735a4a2b003019af067b9c183ed227780b8f36062c PREFIX "boost-src" CONFIGURE_COMMAND ${CMAKE_COMMAND} -E copy_directory From 4ee0fce531eb6e0aa793d895101846115518ea5c Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Thu, 11 Jan 2024 11:57:35 -0800 Subject: [PATCH 169/216] Clean up response iterator map properly (#335) --- src/pb_stub.cc | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/pb_stub.cc b/src/pb_stub.cc index d1f8f6fd..a7d39852 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -372,6 +372,14 @@ Stub::RunCommand() } break; case PYTHONSTUB_CommandType::PYTHONSTUB_FinalizeRequest: ipc_message->Command() = PYTHONSTUB_FinalizeResponse; + // Clean up response_iterator_map_ before sending sending message back to + // the parent process to make sure that the clean up message can be + // processed before the message queue is destroyed. 
+ { + std::lock_guard lock(response_iterator_map_mu_); + std::unordered_map>().swap( + response_iterator_map_); + } SendIPCMessage(ipc_message); return true; // Terminate the stub process case PYTHONSTUB_CommandType::PYTHONSTUB_LoadGPUBuffers: @@ -1049,7 +1057,7 @@ Stub::SendCleanupId( const PYTHONSTUB_CommandType& command_type) { void* id = utils_msg_payload->utils_message_ptr; - { + if (command_type == PYTHONSTUB_BLSDecoupledInferPayloadCleanup) { std::lock_guard lock(response_iterator_map_mu_); response_iterator_map_.erase(id); } From 980a5bb00c3b136e9464d7667718f462e083afb9 Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Thu, 11 Jan 2024 12:02:19 -0800 Subject: [PATCH 170/216] Bumping min required cxx standard to 17 (#332) --- CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6fae6a00..2b47df1d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,9 @@ cmake_minimum_required(VERSION 3.17) project(tritonpythonbackend LANGUAGES C CXX) +# Use C++17 standard as Triton's minimum required. +set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which features are requested to build this target.") + # # Options # @@ -231,14 +234,14 @@ add_library( TritonPythonBackend::triton-python-backend ALIAS triton-python-backend ) -target_compile_features(triton-python-backend PRIVATE cxx_std_11) +target_compile_features(triton-python-backend PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) target_compile_options( triton-python-backend PRIVATE $<$,$,$>: -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> ) -target_compile_features(triton-python-backend-stub PRIVATE cxx_std_11) +target_compile_features(triton-python-backend-stub PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) target_compile_options( triton-python-backend-stub PRIVATE $<$,$,$>: From 9d67dc39d2e42658c650525eccc836b2e991627b Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Thu, 18 Jan 2024 11:21:50 -0800 Subject: [PATCH 171/216] Changing cuda cxx flag (#338) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b47df1d..2be987cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,7 +119,7 @@ set(boostorg_INCLUDE_DIRS "${CMAKE_BINARY_DIR}/boost/") if(${TRITON_ENABLE_GPU}) find_package(CUDAToolkit REQUIRED) message(STATUS "Using CUDA ${CUDA_VERSION}") - set(CUDA_NVCC_FLAGS -std=c++11) + set(CUDA_NVCC_FLAGS -std=c++${TRITON_MIN_CXX_STANDARD}) elseif() message(WARNING "TRITON_ENABLE_GPU is OFF, GPU Tensor support will be disabled") endif() # TRITON_ENABLE_GPU From 37d29025f8da7c81cf9b6d88f5ff4d44e389a732 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Fri, 19 Jan 2024 15:33:58 -0800 Subject: [PATCH 172/216] Improve decoupled shm handling (#337) * [DO NOT MERGE] Add shm trace util * [DO NOT MERGE] Expand shm leak util naming to ipc load * Revert "[DO NOT MERGE] Expand shm leak util naming to ipc load" This reverts commit 68906f2dd32fa70fe247321391ce26967d04ec5a. * Revert "[DO NOT MERGE] Add shm trace util" This reverts commit 37824ce137b009e0ef13b46f440e1f94c865180e. 
* Fix decoupled shared memory leak --- src/python_be.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/python_be.cc b/src/python_be.cc index 3c9dd19d..a8dfab07 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -1328,6 +1328,7 @@ ModelInstanceState::ProcessRequestsDecoupled( AllocatedSharedMemory response_batch = Stub()->ShmPool()->Load(received_message_->Args()); + received_message_.reset(); uint64_t compute_end_ns = 0; SET_TIMESTAMP(compute_end_ns); From 0371eb8f9ffd6e1f50ba5ceeee5da0d3cb1f6888 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Thu, 1 Feb 2024 10:06:23 -0800 Subject: [PATCH 173/216] Add double parameter handling (#333) * Support Double-Type Infer/Response Parameters --- src/python_be.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/python_be.cc b/src/python_be.cc index a8dfab07..befdd593 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -340,6 +340,9 @@ ModelInstanceState::SaveRequestsToSharedMemory( } else if (type == TRITONSERVER_PARAMETER_STRING) { std::string string = reinterpret_cast(vvalue); RETURN_IF_ERROR(parameters_json.AddString(name, string)); + } else if (type == TRITONSERVER_PARAMETER_DOUBLE) { + RETURN_IF_ERROR(parameters_json.AddDouble( + name, *(reinterpret_cast(vvalue)))); } else { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, From ba616e26c256f11c41f7249c6a55220af8becee9 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Thu, 8 Feb 2024 11:28:10 -0800 Subject: [PATCH 174/216] Python Backend Windows Support (#294) * Base Python Backend Support for Windows --- CMakeLists.txt | 138 ++++++++++++------- src/infer_request.h | 2 +- src/metric_family.cc | 4 +- src/pb_env.cc | 48 ++++--- src/pb_env.h | 7 + src/pb_preferred_memory.h | 4 +- src/pb_stub.cc | 115 +++++++++++----- src/pb_stub.h | 32 ++--- src/pb_utils.cc | 120 +++++++++++------ src/pb_utils.h | 13 +- src/python_be.cc | 79 ++++++----- src/python_be.h | 23 +++- src/request_executor.cc | 6 +- src/shm_manager.h | 6 +- src/stub_launcher.cc | 272 ++++++++++++++++++++++++++++++++------ src/stub_launcher.h | 21 ++- 16 files changed, 629 insertions(+), 261 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2be987cd..bc5387ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,12 @@ option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON) option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON) option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF) +# FIXME: CI needs to enable the GPU flag. Python for window currently does not +# support GPU tensors. For simplicity, we will override this option here. 
+if(WIN32) + set(TRITON_ENABLE_GPU OFF CACHE BOOL "GPU disabled" FORCE) +endif() + set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo") set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") @@ -96,6 +102,9 @@ FetchContent_Declare( GIT_TAG "v0.8" GIT_SHALLOW ON ) +# Option must be set off so WIN32 build does not break +set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) +set(BUILD_MOCK OFF) FetchContent_MakeAvailable(dlpack) # @@ -129,7 +138,10 @@ if(${TRITON_ENABLE_NVTX}) endif() # TRITON_ENABLE_NVTX find_package(ZLIB REQUIRED) -find_package(Threads REQUIRED) + +if(NOT WIN32) + find_package(Threads REQUIRED) +endif() include_directories(${CMAKE_BINARY_DIR}) configure_file(src/libtriton_python.ldscript libtriton_python.ldscript COPYONLY) @@ -174,21 +186,21 @@ set( ) set( - PYTHON_BACKEND_SRCS - src/python_be.cc - src/python_be.h - src/pb_env.cc - src/pb_env.h - src/pb_metric_reporter.cc - src/pb_metric_reporter.h - src/memory_manager.cc - src/memory_manager.h - src/request_executor.cc - src/request_executor.h - src/stub_launcher.h - src/stub_launcher.cc - src/infer_payload.h - src/infer_payload.cc + PYTHON_BACKEND_SRCS + src/python_be.cc + src/python_be.h + src/pb_env.cc + src/pb_env.h + src/pb_metric_reporter.cc + src/pb_metric_reporter.h + src/memory_manager.cc + src/memory_manager.h + src/request_executor.cc + src/request_executor.h + src/stub_launcher.h + src/stub_launcher.cc + src/infer_payload.h + src/infer_payload.cc ) list(APPEND @@ -239,48 +251,82 @@ target_compile_options( triton-python-backend PRIVATE $<$,$,$>: -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> + $<$:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor> ) target_compile_features(triton-python-backend-stub PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) target_compile_options( triton-python-backend-stub PRIVATE $<$,$,$>: - -fvisibility=hidden -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> + -fvisibility=hidden -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> + $<$:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor> ) target_compile_definitions(triton-python-backend-stub PRIVATE TRITON_PB_STUB) -target_link_libraries( - triton-python-backend - PRIVATE +# For WIN32 do not link Threads and DL_LIBS +if(WIN32) + target_link_libraries( + triton-python-backend + PRIVATE + dlpack + triton-backend-utils # from repo-backend + -lrt # shared memory + triton-core-serverstub # from repo-core + ZLIB::ZLIB + -larchive + ) + + target_link_libraries( + triton-python-backend-stub + PRIVATE + dlpack + triton-backend-utils # from repo-backend + pybind11::embed + -lrt # shared memory + -larchive # libarchive + ) +else() + target_link_libraries( + triton-python-backend + PRIVATE + dlpack + Threads::Threads + triton-backend-utils # from repo-backend + ${CMAKE_DL_LIBS} # dlopen and dlclose + -lrt # shared memory + triton-core-serverstub # from repo-core + ZLIB::ZLIB + -larchive + ) + + target_link_libraries( + triton-python-backend-stub + PRIVATE dlpack Threads::Threads - triton-backend-utils # from repo-backend - ${CMAKE_DL_LIBS} # dlopen and dlclose - -lrt # shared memory - triton-core-serverstub # from repo-core - ZLIB::ZLIB - -larchive -) - -target_link_libraries( - triton-python-backend-stub - PRIVATE - dlpack - Threads::Threads - triton-backend-utils # from repo-backend - ${CMAKE_DL_LIBS} # dlopen and dlclose - pybind11::embed - -lrt # shared 
memory - -larchive # libarchive -) + triton-backend-utils # from repo-backend + ${CMAKE_DL_LIBS} # dlopen and dlclose + pybind11::embed + -lrt # shared memory + -larchive # libarchive + ) +endif() -set_target_properties( - triton-python-backend PROPERTIES - POSITION_INDEPENDENT_CODE ON - OUTPUT_NAME triton_python - LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_python.ldscript - LINK_FLAGS "-Wl,--version-script libtriton_python.ldscript" -) +if(WIN32) + set_target_properties( + triton-python-backend PROPERTIES + POSITION_INDEPENDENT_CODE ON + OUTPUT_NAME triton_python + ) +else() + set_target_properties( + triton-python-backend PROPERTIES + POSITION_INDEPENDENT_CODE ON + OUTPUT_NAME triton_python + LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_python.ldscript + LINK_FLAGS "-Wl,--version-script libtriton_python.ldscript" + ) +endif() add_subdirectory(./src/shm_monitor) diff --git a/src/infer_request.h b/src/infer_request.h index b8dee87c..ba586535 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -87,7 +87,7 @@ class InferRequest { const uint64_t timeout = 0, const intptr_t response_factory_address = 0, const intptr_t request_address = 0, const PreferredMemory& preferred_memory = - PreferredMemory(PreferredMemory::DEFAULT, 0), + PreferredMemory(PreferredMemory::kDefault, 0), const InferenceTrace& trace = InferenceTrace()); const std::vector>& Inputs(); diff --git a/src/metric_family.cc b/src/metric_family.cc index fb0fb93a..77e8aedf 100644 --- a/src/metric_family.cc +++ b/src/metric_family.cc @@ -201,9 +201,9 @@ TRITONSERVER_MetricKind MetricFamily::ToTritonServerMetricKind(const MetricKind& kind) { switch (kind) { - case COUNTER: + case kCounter: return TRITONSERVER_METRIC_KIND_COUNTER; - case GAUGE: + case kGauge: return TRITONSERVER_METRIC_KIND_GAUGE; default: throw PythonBackendException("Unknown metric kind"); diff --git a/src/pb_env.cc b/src/pb_env.cc index 0b6eb9ec..d9643a62 100644 --- a/src/pb_env.cc +++ b/src/pb_env.cc @@ -26,9 +26,11 @@ #include "pb_env.h" +#ifndef _WIN32 #include #include #include +#endif #include #include @@ -40,6 +42,29 @@ namespace triton { namespace backend { namespace python { +bool +FileExists(std::string& path) +{ + struct stat buffer; + return stat(path.c_str(), &buffer) == 0; +} + +void +LastModifiedTime(const std::string& path, time_t* last_modified_time) +{ + struct stat result; + if (stat(path.c_str(), &result) == 0) { + *last_modified_time = result.st_mtime; + } else { + throw PythonBackendException(std::string( + "LastModifiedTime() failed as file \'" + path + + std::string("\' does not exists."))); + } +} + +// FIXME: [DLIS-5969]: Develop platforom-agnostic functions +// to support custom python environments. 
+#ifndef _WIN32 void CopySingleArchiveEntry(archive* input_archive, archive* output_archive) { @@ -73,7 +98,6 @@ CopySingleArchiveEntry(archive* input_archive, archive* output_archive) } } - void ExtractTarFile(std::string& archive_path, std::string& dst_path) { @@ -153,27 +177,6 @@ ExtractTarFile(std::string& archive_path, std::string& dst_path) } } -bool -FileExists(std::string& path) -{ - struct stat buffer; - return stat(path.c_str(), &buffer) == 0; -} - -void -LastModifiedTime(const std::string& path, time_t* last_modified_time) -{ - struct stat result; - if (stat(path.c_str(), &result) == 0) { - *last_modified_time = result.st_mtime; - } else { - throw PythonBackendException(std::string( - "LastModifiedTime() failed as file \'" + path + - std::string("\' does not exists."))); - } -} - - void RecursiveDirectoryDelete(const char* dir) { @@ -326,5 +329,6 @@ EnvironmentManager::~EnvironmentManager() { RecursiveDirectoryDelete(base_path_); } +#endif }}} // namespace triton::backend::python diff --git a/src/pb_env.h b/src/pb_env.h index 09890ee8..04e01fa3 100644 --- a/src/pb_env.h +++ b/src/pb_env.h @@ -30,6 +30,11 @@ #include #include +#ifdef WIN32 +#include +#undef PATH_MAX +#define PATH_MAX MAX_PATH +#endif namespace triton { namespace backend { namespace python { void ExtractTarFile(std::string& archive_path, std::string& dst_path); @@ -39,6 +44,7 @@ bool FileExists(std::string& path); // // A class that manages Python environments // +#ifndef _WIN32 class EnvironmentManager { std::map> env_map_; char base_path_[PATH_MAX + 1]; @@ -52,5 +58,6 @@ class EnvironmentManager { std::string ExtractIfNotExtracted(std::string env_path); ~EnvironmentManager(); }; +#endif }}} // namespace triton::backend::python diff --git a/src/pb_preferred_memory.h b/src/pb_preferred_memory.h index 55f4db89..c28f1b87 100644 --- a/src/pb_preferred_memory.h +++ b/src/pb_preferred_memory.h @@ -30,10 +30,10 @@ namespace triton { namespace backend { namespace python { class PreferredMemory { public: - enum MemoryType { GPU, CPU, DEFAULT }; + enum MemoryType { kGPU, kCPU, kDefault }; PreferredMemory() - : preferred_memory_type_(MemoryType::DEFAULT), preferred_device_id_(0) + : preferred_memory_type_(MemoryType::kDefault), preferred_device_id_(0) { } diff --git a/src/pb_stub.cc b/src/pb_stub.cc index a7d39852..26003f71 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -28,7 +28,6 @@ #include #include -#include #include #include @@ -55,6 +54,13 @@ #include "shm_manager.h" #include "triton/common/nvtx.h" +#ifdef _WIN32 +#include // SIGINT & SIGTERM +#include +#else +#include +#endif + #ifdef TRITON_ENABLE_GPU #include #endif // TRITON_ENABLE_GPU @@ -148,6 +154,7 @@ Stub::Instantiate( // interfere with the shared library resolution of other executable and // binaries. 
if (ipc_control_->uses_env) { +#ifndef _WIN32 char* ld_library_path = std::getenv("LD_LIBRARY_PATH"); if (ld_library_path != nullptr) { @@ -173,6 +180,11 @@ Stub::Instantiate( "When using an execution environment, LD_LIBRARY_PATH variable " "cannot be empty."); } +#else + throw PythonBackendException( + "Custom execution environments are not currently supported on " + "Windows."); +#endif } } catch (const PythonBackendException& pb_exception) { @@ -1444,10 +1456,22 @@ Logger::Log( // and pass messages to cerr if (!BackendLoggingActive()) { std::string path(filename); - size_t pos = path.rfind('/'); + size_t pos = path.rfind(std::filesystem::path::preferred_separator); if (pos != std::string::npos) { path = path.substr(pos + 1, std::string::npos); } +#ifdef _WIN32 + std::stringstream ss; + SYSTEMTIME system_time; + GetSystemTime(&system_time); + ss << LeadingLogChar(level) << std::setfill('0') << std::setw(2) + << system_time.wMonth << std::setw(2) << system_time.wDay << ' ' + << std::setw(2) << system_time.wHour << ':' << std::setw(2) + << system_time.wMinute << ':' << std::setw(2) << system_time.wSecond + << '.' << std::setw(6) << system_time.wMilliseconds * 1000 << ' ' + << static_cast(GetCurrentProcessId()) << ' ' << path << ':' + << lineno << "] "; +#else std::stringstream ss; struct timeval tv; gettimeofday(&tv, NULL); @@ -1460,6 +1484,7 @@ Logger::Log( << std::setw(6) << tv.tv_usec << ' ' << static_cast(getpid()) << ' ' << path << ':' << lineno << "] "; std::cerr << ss.str() << " " << message << std::endl; +#endif } else { // Ensure we do not create a stub instance before it has initialized std::unique_ptr& stub = Stub::GetOrCreateInstance(); @@ -1471,37 +1496,37 @@ Logger::Log( void Logger::LogInfo(const std::string& message) { - Logger::Log(message, LogLevel::INFO); + Logger::Log(message, LogLevel::kInfo); } void Logger::LogWarn(const std::string& message) { - Logger::Log(message, LogLevel::WARNING); + Logger::Log(message, LogLevel::kWarning); } void Logger::LogError(const std::string& message) { - Logger::Log(message, LogLevel::ERROR); + Logger::Log(message, LogLevel::kError); } void Logger::LogVerbose(const std::string& message) { - Logger::Log(message, LogLevel::VERBOSE); + Logger::Log(message, LogLevel::kVerbose); } const std::string Logger::LeadingLogChar(const LogLevel& level) { switch (level) { - case LogLevel::WARNING: + case LogLevel::kWarning: return "W"; - case LogLevel::ERROR: + case LogLevel::kError: return "E"; - case LogLevel::INFO: - case LogLevel::VERBOSE: + case LogLevel::kInfo: + case LogLevel::kVerbose: default: return "I"; } @@ -1580,8 +1605,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::arg("preferred_device_id").none(false) = 0); py::enum_(module, "MemoryType") - .value("TRITONSERVER_MEMORY_GPU", PreferredMemory::MemoryType::GPU) - .value("TRITONSERVER_MEMORY_CPU", PreferredMemory::MemoryType::CPU) + .value("TRITONSERVER_MEMORY_GPU", PreferredMemory::MemoryType::kGPU) + .value("TRITONSERVER_MEMORY_CPU", PreferredMemory::MemoryType::kCPU) .export_values(); py::class_>( @@ -1637,7 +1662,7 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::arg("model_version").none(false) = -1, py::arg("flags").none(false) = 0, py::arg("timeout").none(false) = 0, py::arg("preferred_memory").none(false) = - PreferredMemory(PreferredMemory::DEFAULT, 0), + PreferredMemory(PreferredMemory::kDefault, 0), py::arg("trace").none(false) = InferenceTrace(), py::arg("parameters").none(true) = py::none()) .def( @@ -1758,14 +1783,14 @@ 
PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::class_ logger(module, "Logger"); py::enum_(logger, "LogLevel") - .value("INFO", LogLevel::INFO) - .value("WARNING", LogLevel::WARNING) - .value("ERROR", LogLevel::ERROR) - .value("VERBOSE", LogLevel::VERBOSE) + .value("INFO", LogLevel::kInfo) + .value("WARNING", LogLevel::kWarning) + .value("ERROR", LogLevel::kError) + .value("VERBOSE", LogLevel::kVerbose) .export_values(); logger.def_static( "log", py::overload_cast(&Logger::Log), - py::arg("message"), py::arg("level") = LogLevel::INFO); + py::arg("message"), py::arg("level") = LogLevel::kInfo); logger.def_static("log_info", &Logger::LogInfo, py::arg("message")); logger.def_static("log_warn", &Logger::LogWarn, py::arg("message")); logger.def_static("log_error", &Logger::LogError, py::arg("message")); @@ -1777,8 +1802,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) .def("value", &Metric::SendGetValueRequest); py::enum_(module, "MetricKind") - .value("COUNTER", MetricKind::COUNTER) - .value("GAUGE", MetricKind::GAUGE) + .value("COUNTER", MetricKind::kCounter) + .value("GAUGE", MetricKind::kGauge) .export_values(); py::class_>( @@ -1790,8 +1815,8 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) .def( "Metric", &MetricFamily::CreateMetric, py::arg("labels").none(true) = py::none()); - module.attr("MetricFamily").attr("COUNTER") = MetricKind::COUNTER; - module.attr("MetricFamily").attr("GAUGE") = MetricKind::GAUGE; + module.attr("MetricFamily").attr("COUNTER") = MetricKind::kCounter; + module.attr("MetricFamily").attr("GAUGE") = MetricKind::kGauge; module.def( "load_model", &LoadModel, py::arg("model_name").none(false), @@ -1819,12 +1844,13 @@ ModelContext::Init( const std::string& model_path, const std::string& runtime_modeldir, const std::string& triton_install_path, const std::string& model_version) { - type_ = ModelType::DEFAULT; + const char os_slash = std::filesystem::path::preferred_separator; + type_ = ModelType::kDefault; if (runtime_modeldir != "DEFAULT") { // For python based backends, existence of `model.py` in the corresponding // backend folder happens on the core side, so we can omit this check here. - python_model_path_ = runtime_modeldir + "/model.py"; - type_ = ModelType::BACKEND; + python_model_path_ = runtime_modeldir + os_slash + "model.py"; + type_ = ModelType::kBackend; } else { python_model_path_ = model_path; // Check if model file exists in this path. @@ -1835,7 +1861,7 @@ ModelContext::Init( } } - model_dir_ = model_path.substr(0, model_path.find_last_of("\\/")); + model_dir_ = model_path.substr(0, model_path.find_last_of(os_slash)); python_backend_folder_ = triton_install_path; model_version_ = model_version; runtime_modeldir_ = runtime_modeldir; @@ -1844,8 +1870,9 @@ ModelContext::Init( void ModelContext::StubSetup(py::module& sys) { + const char os_slash = std::filesystem::path::preferred_separator; std::string model_name = - python_model_path_.substr(python_model_path_.find_last_of("/") + 1); + python_model_path_.substr(python_model_path_.find_last_of(os_slash) + 1); // Model name without the .py extension auto dotpy_pos = model_name.find_last_of(".py"); @@ -1858,11 +1885,11 @@ ModelContext::StubSetup(py::module& sys) // returned by 'find_last_of'. Need to manually adjust the position. 
std::string model_name_trimmed = model_name.substr(0, dotpy_pos - 2); - if (type_ == ModelType::DEFAULT) { + if (type_ == ModelType::kDefault) { std::string model_path_parent = - python_model_path_.substr(0, python_model_path_.find_last_of("/")); + python_model_path_.substr(0, python_model_path_.find_last_of(os_slash)); std::string model_path_parent_parent = - model_path_parent.substr(0, model_path_parent.find_last_of("/")); + model_path_parent.substr(0, model_path_parent.find_last_of(os_slash)); sys.attr("path").attr("append")(model_path_parent); sys.attr("path").attr("append")(model_path_parent_parent); sys.attr("path").attr("append")(python_backend_folder_); @@ -1870,7 +1897,7 @@ ModelContext::StubSetup(py::module& sys) (std::string(model_version_) + "." + model_name_trimmed).c_str()); } else { std::string model_path_parent = - python_model_path_.substr(0, python_model_path_.find_last_of("/")); + python_model_path_.substr(0, python_model_path_.find_last_of(os_slash)); std::string backend_model_dir(model_path_parent); sys.attr("path").attr("append")(backend_model_dir); sys.attr("path").attr("append")(python_backend_folder_); @@ -1878,6 +1905,22 @@ ModelContext::StubSetup(py::module& sys) } } +#ifdef _WIN32 +bool +ParentProcessActive(DWORD parent_id) +{ + HANDLE parent = OpenProcess(PROCESS_ALL_ACCESS, FALSE, parent_id); + DWORD exit_code; + GetExitCodeProcess(parent, &exit_code); + return (exit_code == STILL_ACTIVE); +} +#else +bool +ParentProcessActive(pid_t parent_id) +{ + return (kill(parent_id, 0) == 0); +} +#endif extern "C" { @@ -1902,8 +1945,9 @@ main(int argc, char** argv) // Find the package name from model path. size_t prev = 0, pos = 0; + const char os_slash = std::filesystem::path::preferred_separator; do { - pos = model_path.find("/", prev); + pos = model_path.find(os_slash, prev); if (pos == std::string::npos) pos = model_path.length(); std::string token = model_path.substr(prev, pos - prev); @@ -1938,8 +1982,11 @@ main(int argc, char** argv) // Start the Python Interpreter py::scoped_interpreter guard{}; +#ifdef _WIN32 + DWORD parent_pid = (DWORD)std::stoul(argv[5]); +#else pid_t parent_pid = std::stoi(argv[5]); - +#endif std::atomic background_thread_running = {true}; std::thread background_thread = std::thread([&parent_pid, &background_thread_running, &stub, &logger] { @@ -1958,7 +2005,7 @@ main(int argc, char** argv) stub->UpdateHealth(); - if (kill(parent_pid, 0) != 0) { + if (!ParentProcessActive(parent_pid)) { // When unhealthy, we should stop attempting to send // messages to the backend ASAP. if (stub->StubToParentServiceActive()) { diff --git a/src/pb_stub.h b/src/pb_stub.h index 74a66b95..a51f25f5 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -30,18 +30,7 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include #include "infer_request.h" #include "infer_response.h" @@ -81,17 +70,17 @@ namespace triton { namespace backend { namespace python { } while (false) /// Macros that use current filename and line number. 
-#define LOG_INFO LOG_FL(__FILE__, __LINE__, LogLevel::INFO) -#define LOG_WARN LOG_FL(__FILE__, __LINE__, LogLevel::WARNING) -#define LOG_ERROR LOG_FL(__FILE__, __LINE__, LogLevel::ERROR) -#define LOG_VERBOSE LOG_FL(__FILE__, __LINE__, LogLevel::VERBOSE) +#define LOG_INFO LOG_FL(__FILE__, __LINE__, LogLevel::kInfo) +#define LOG_WARN LOG_FL(__FILE__, __LINE__, LogLevel::kWarning) +#define LOG_ERROR LOG_FL(__FILE__, __LINE__, LogLevel::kError) +#define LOG_VERBOSE LOG_FL(__FILE__, __LINE__, LogLevel::kVerbose) class Logger { public: Logger() { backend_logging_active_ = false; }; ~Logger() { log_instance_.reset(); }; /// Python client log function - static void Log(const std::string& message, LogLevel level = LogLevel::INFO); + static void Log(const std::string& message, LogLevel level = LogLevel::kInfo); /// Python client log info function static void LogInfo(const std::string& message); @@ -138,7 +127,8 @@ class LogMessage { LogMessage(const char* file, int line, LogLevel level) : level_(level) { std::string path(file); - size_t pos = path.rfind('/'); + const char os_slash = std::filesystem::path::preferred_separator; + size_t pos = path.rfind(os_slash); if (pos != std::string::npos) { path = path.substr(pos + 1, std::string::npos); } @@ -185,10 +175,10 @@ class ModelContext { // Triton supports python-based backends, // i.e. backends that provide common `model.py`, that can be re-used // between different models. `ModelType` helps to differentiate - // between models running with c++ python backend (ModelType::DEFAULT) - // and models running with python-based backend (ModelType::BACKEND) + // between models running with c++ python backend (ModelType::kDefault) + // and models running with python-based backend (ModelType::kBackend) // at the time of ModelContext::StubSetup to properly set up paths. - enum ModelType { DEFAULT, BACKEND }; + enum ModelType { kDefault, kBackend }; ModelType type_; }; diff --git a/src/pb_utils.cc b/src/pb_utils.cc index 5aa95b8b..7bc17fa4 100644 --- a/src/pb_utils.cc +++ b/src/pb_utils.cc @@ -26,27 +26,14 @@ #include "pb_utils.h" -#include -#include +#ifdef _WIN32 +#include + +#include +#else #include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "scoped_defer.h" +#endif + #ifdef TRITON_ENABLE_GPU #include @@ -59,42 +46,43 @@ namespace triton { namespace backend { namespace python { CUDAHandler::CUDAHandler() { - dl_open_handle_ = dlopen("libcuda.so", RTLD_LAZY); + dl_open_handle_ = LoadSharedObject("libcuda.so"); // If libcuda.so is successfully opened, it must be able to find // "cuPointerGetAttribute", "cuGetErrorString", and // "cuDevicePrimaryCtxGetState" symbols. if (dl_open_handle_ != nullptr) { - void* cu_pointer_get_attribute_fn = - dlsym(dl_open_handle_, "cuPointerGetAttribute"); + void* cu_pointer_get_attribute_fn = LocateSymbol("cuPointerGetAttribute"); if (cu_pointer_get_attribute_fn == nullptr) { throw PythonBackendException( - std::string("Failed to dlsym 'cuPointerGetAttribute'. Error: ") + - dlerror()); + std::string("Failed to locate 'cuPointerGetAttribute'. Error: ") + + LocateSymbolError()); } *((void**)&cu_pointer_get_attribute_fn_) = cu_pointer_get_attribute_fn; - void* cu_get_error_string_fn = dlsym(dl_open_handle_, "cuGetErrorString"); + void* cu_get_error_string_fn = LocateSymbol("cuGetErrorString"); if (cu_get_error_string_fn == nullptr) { throw PythonBackendException( - std::string("Failed to dlsym 'cuGetErrorString'. 
Error: ") + - dlerror()); + std::string("Failed to locate 'cuGetErrorString'. Error: ") + + LocateSymbolError()); } *((void**)&cu_get_error_string_fn_) = cu_get_error_string_fn; - void* cu_init_fn = dlsym(dl_open_handle_, "cuInit"); + void* cu_init_fn = LocateSymbol("cuInit"); if (cu_init_fn == nullptr) { throw PythonBackendException( - std::string("Failed to dlsym 'cuInit'. Error: ") + dlerror()); + std::string("Failed to locate 'cuInit'. Error: ") + + LocateSymbolError()); } *((void**)&cu_init_fn_) = cu_init_fn; void* cu_device_primary_ctx_get_state_fn = - dlsym(dl_open_handle_, "cuDevicePrimaryCtxGetState"); + LocateSymbol("cuDevicePrimaryCtxGetState"); if (cu_device_primary_ctx_get_state_fn == nullptr) { throw PythonBackendException( - std::string("Failed to dlsym 'cuDevicePrimaryCtxGetState'. Error: ") + - dlerror()); + std::string( + "Failed to locate 'cuDevicePrimaryCtxGetState'. Error: ") + + LocateSymbolError()); } *((void**)&cu_device_primary_ctx_get_state_fn_) = cu_device_primary_ctx_get_state_fn; @@ -105,10 +93,7 @@ CUDAHandler::CUDAHandler() const char* error_string; (*cu_get_error_string_fn_)(cuda_err, &error_string); error_str_ = std::string("failed to call cuInit: ") + error_string; - int status = dlclose(dl_open_handle_); - if (status != 0) { - throw PythonBackendException("Failed to close the libcuda handle."); - } + CloseLibrary(); dl_open_handle_ = nullptr; } } @@ -215,13 +200,58 @@ CUDAHandler::MaybeSetDevice(int device) CUDAHandler::~CUDAHandler() noexcept(false) { if (dl_open_handle_ != nullptr) { - int status = dlclose(dl_open_handle_); - if (status != 0) { - throw PythonBackendException("Failed to close the libcuda handle."); - } + CloseLibrary(); + } +} + +void* +CUDAHandler::LoadSharedObject(const char* filename) +{ +#ifdef _WIN32 + // NOTE: 'nvcuda.dll' is a placeholder library. Apparently, this should be the + // equivalent library for Windows, but need to verify. + return LoadLibraryA("nvcuda.dll"); +#else + return dlopen("libcuda.so", RTLD_LAZY); +#endif +} + +void* +CUDAHandler::LocateSymbol(const char* symbol) +{ +#ifdef _WIN32 + return GetProcAddress(static_cast(dl_open_handle_), symbol); +#else + return dlsym(dl_open_handle_, symbol); +#endif +} + + +std::string +CUDAHandler::LocateSymbolError() +{ +#ifdef _WIN32 + return std::to_string(GetLastError()); +#else + return dlerror(); +#endif +} + +void +CUDAHandler::CloseLibrary() +{ + bool successful = true; +#ifdef _WIN32 + successful = (FreeLibrary(static_cast(dl_open_handle_)) != 0); +#else + successful = (dlclose(dl_open_handle_) == 0); +#endif + if (!successful) { + throw PythonBackendException("Failed to close the cuda library handle."); } } + ScopedSetDevice::ScopedSetDevice(int device) { device_ = device; @@ -258,6 +288,14 @@ IsUsingCUDAPool( #endif // TRITON_ENABLE_GPU +// FIXME: [DLIS-6078]: We should not need this function. However, some paths are +// being retrieved from core that are not platform-agnostic. 
+void +SanitizePath(std::string& path) +{ + std::replace(path.begin(), path.end(), '/', '\\'); +} + #ifndef TRITON_PB_STUB std::shared_ptr WrapTritonErrorInSharedPtr(TRITONSERVER_Error* error) diff --git a/src/pb_utils.h b/src/pb_utils.h index 0873eb03..6d5f21ce 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -29,7 +29,6 @@ #ifdef TRITON_ENABLE_GPU #include #endif // TRITON_ENABLE_GPU -#include #include #include @@ -167,9 +166,9 @@ struct ResponseBatch : SendMessageBase { uint32_t response_size; }; -enum LogLevel { INFO = 0, WARNING, ERROR, VERBOSE }; +enum LogLevel { kInfo = 0, kWarning, kError, kVerbose }; -enum MetricKind { COUNTER, GAUGE }; +enum MetricKind { kCounter = 0, kGauge }; struct LogSendMessage : SendMessageBase { bi::managed_external_buffer::handle_t filename; @@ -294,6 +293,10 @@ class CUDAHandler { int64_t memory_type_id, cudaIpcMemHandle_t* cuda_mem_handle, void** data_ptr); void CloseCudaHandle(int64_t memory_type_id, void* data_ptr); + void* LoadSharedObject(const char* filename); + void* LocateSymbol(const char* symbol); + std::string LocateSymbolError(); + void CloseLibrary(); /// Set the device only if the primary context has already been created for /// this device. Inspired from PyTorch's MaybeSetDevice. @@ -323,6 +326,10 @@ bool IsUsingCUDAPool( #endif // TRITON_ENABLE_GPU +// FIXME: [DLIS-6078]: We should not need this function. However, some paths are +// being retrieved from core that are not platform-agnostic. +void SanitizePath(std::string& path); + #ifndef TRITON_PB_STUB std::shared_ptr WrapTritonErrorInSharedPtr( TRITONSERVER_Error* error); diff --git a/src/python_be.cc b/src/python_be.cc index befdd593..0fa318ff 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -25,6 +25,8 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "python_be.h" +#include + #include "gpu_buffers.h" #include "infer_payload.h" #include "model_loader.h" @@ -367,12 +369,15 @@ ModelInstanceState::SaveRequestsToSharedMemory( uint32_t flags; RETURN_IF_ERROR(TRITONBACKEND_RequestFlags(request, &flags)); + // Do not return if error in this case, because Triton core + // will return an error if tracing is disabled (see PYBE PR#295). 
TRITONSERVER_InferenceTrace* triton_trace; auto err = TRITONBACKEND_RequestTrace(request, &triton_trace); if (err != nullptr) { triton_trace = nullptr; TRITONSERVER_ErrorDelete(err); } + InferenceTrace trace = InferenceTrace(triton_trace); uint64_t request_timeout; @@ -389,14 +394,14 @@ ModelInstanceState::SaveRequestsToSharedMemory( model_state->Name(), model_state->Version(), parameters_string, flags, request_timeout, reinterpret_cast(factory_ptr), reinterpret_cast(request), - PreferredMemory(PreferredMemory::DEFAULT, 0), trace); + PreferredMemory(PreferredMemory::kDefault, 0), trace); } else { infer_request = std::make_unique( id, correlation_id, pb_input_tensors, requested_output_names, model_state->Name(), model_state->Version(), parameters_string, flags, request_timeout, 0 /* response_factory_address */, reinterpret_cast(request), - PreferredMemory(PreferredMemory::DEFAULT, 0), trace); + PreferredMemory(PreferredMemory::kDefault, 0), trace); } RETURN_IF_EXCEPTION(infer_request->SaveToSharedMemory(Stub()->ShmPool())); @@ -884,25 +889,25 @@ ModelInstanceState::ProcessLogRequest( LogLevel level = pb_log_message->Level(); switch (level) { - case LogLevel::INFO: { + case LogLevel::kInfo: { TRITONSERVER_LogMessage( TRITONSERVER_LOG_INFO, (filename.c_str()), line, (log_message.c_str())); break; } - case LogLevel::WARNING: { + case LogLevel::kWarning: { TRITONSERVER_LogMessage( TRITONSERVER_LOG_WARN, (filename.c_str()), line, (log_message.c_str())); break; } - case LogLevel::ERROR: { + case LogLevel::kError: { TRITONSERVER_LogMessage( TRITONSERVER_LOG_ERROR, (filename.c_str()), line, (log_message.c_str())); break; } - case LogLevel::VERBOSE: { + case LogLevel::kVerbose: { TRITONSERVER_LogMessage( TRITONSERVER_LOG_VERBOSE, (filename.c_str()), line, (log_message.c_str())); @@ -1422,7 +1427,7 @@ ModelInstanceState::ProcessRequests( // This means that the stub process has exited and Python // backend failed to restart the stub process. - if (Stub()->StubPid() == 0) { + if (!Stub()->StubActive()) { const char* error_message = "The stub process has exited unexpectedly."; RespondErrorToAllRequests( error_message, responses, requests, request_count); @@ -2056,7 +2061,7 @@ ModelState::SetModelConfig() extern "C" { -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) { const char* cname; @@ -2239,27 +2244,33 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) .c_str()); // Use BackendArtifacts to determine the location of Python files - const char* location; + const char* clocation; TRITONBACKEND_ArtifactType artifact_type; RETURN_IF_ERROR( - TRITONBACKEND_BackendArtifacts(backend, &artifact_type, &location)); - + TRITONBACKEND_BackendArtifacts(backend, &artifact_type, &clocation)); + + const char os_slash = std::filesystem::path::preferred_separator; + std::string location(clocation); +#ifdef _WIN32 + const std::string stub_executable_name = "triton_python_backend_stub.exe"; + SanitizePath(location); + SanitizePath(default_backend_dir_string); +#else + const std::string stub_executable_name = "triton_python_backend_stub"; +#endif // Check if `triton_python_backend_stub` and `triton_python_backend_utils.py` // are located under `location`. - // DLIS-5596: Add forward slash to be platform agnostic - // (i.e. For Windows, we need to use backward slash). 
std::string default_python_backend_dir = - default_backend_dir_string + "/python"; - std::string backend_stub_path = - std::string(location) + "/triton_python_backend_stub"; + default_backend_dir_string + os_slash + "python"; + std::string backend_stub_path = location + os_slash + stub_executable_name; std::string backend_utils = - std::string(location) + "/triton_python_backend_utils.py"; + location + os_slash + "triton_python_backend_utils.py"; // Both, stub and utils should be in the same location if (FileExists(backend_stub_path) && FileExists(backend_utils)) { backend_state->python_lib = location; // If `location` is default location of a python backend, // then we are using default python backend. - if (default_python_backend_dir == std::string(location)) { + if (default_python_backend_dir == location) { backend_state->runtime_modeldir = ""; } else { // If `location` is not default location of a python backend, @@ -2272,22 +2283,26 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) // then we are using a python backend based backend and stub and utils are // stored in the default python backend location. if (!default_backend_dir_string.empty()) { - std::string backend_stub_path = - default_backend_dir_string + "/python/triton_python_backend_stub"; + std::string backend_stub_path = default_backend_dir_string + os_slash + + "python" + os_slash + + stub_executable_name; if (!FileExists(backend_stub_path)) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_NOT_FOUND, - (std::string("triton_python_backend_stub") + - " is not found. Searched paths: " + default_backend_dir_string + - "/python and" + std::string(location)) + (stub_executable_name + " is not found. Searched paths: " + + default_backend_dir_string + os_slash + "python and " + location) .c_str()); } } backend_state->runtime_modeldir = location; - backend_state->python_lib = default_backend_dir_string + "/python"; + backend_state->python_lib = + default_backend_dir_string + os_slash + "python"; } - +// FIXME [DLIS-5969]: Enable for Windows when custom execution environments +// are supported. 
+#ifndef _WIN32 backend_state->env_manager = std::make_unique(); +#endif RETURN_IF_ERROR(TRITONBACKEND_BackendSetState( backend, reinterpret_cast(backend_state.get()))); @@ -2296,7 +2311,7 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend) { LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "TRITONBACKEND_Finalize: Start"); @@ -2308,7 +2323,7 @@ TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend) return nullptr; // success } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) { const char* cname; @@ -2335,7 +2350,7 @@ TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) { void* vstate; @@ -2351,7 +2366,7 @@ TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) { const char* cname; @@ -2394,7 +2409,7 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute( TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, const uint32_t request_count) @@ -2519,7 +2534,7 @@ TRITONBACKEND_ModelInstanceExecute( return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) { void* vstate; @@ -2536,7 +2551,7 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) return nullptr; } -TRITONSERVER_Error* +TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_GetBackendAttribute( TRITONBACKEND_Backend* backend, TRITONBACKEND_BackendAttribute* backend_attributes) diff --git a/src/python_be.h b/src/python_be.h index f5620d07..4430767c 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -26,12 +26,8 @@ #pragma once -#include #include #include -#include -#include -#include #include #include @@ -84,6 +80,14 @@ #include "triton/core/tritonbackend.h" #include "triton/core/tritonserver.h" +#ifdef _WIN32 +#define NOMINMAX +#include +#else +#include +#include +#endif + #define LOG_IF_EXCEPTION(X) \ do { \ try { \ @@ -217,7 +221,12 @@ struct BackendState { std::atomic number_of_instance_inits; std::string shared_memory_region_prefix; int64_t thread_pool_size; + +// FIXME [DLIS-5969]: Enable for Windows when custom execution environments +// are supported. +#ifndef _WIN32 std::unique_ptr env_manager; +#endif std::string runtime_modeldir; }; @@ -299,7 +308,8 @@ class ModelInstanceState : public BackendModelInstance { // Launch stub process. 
TRITONSERVER_Error* LaunchStubProcess(); - TRITONSERVER_Error* SendMessageToStub(off_t message); + TRITONSERVER_Error* SendMessageToStub( + bi::managed_external_buffer::handle_t message); void ResponseSendDecoupled(std::shared_ptr response_send_message); // Checks whether the stub process is live @@ -307,7 +317,8 @@ class ModelInstanceState : public BackendModelInstance { // Get a message from the stub process void SendMessageAndReceiveResponse( - off_t message, off_t& response, bool& restart, + bi::managed_external_buffer::handle_t message, + bi::managed_external_buffer::handle_t& response, bool& restart, std::shared_ptr>& responses, TRITONBACKEND_Request** requests, const uint32_t request_count); diff --git a/src/request_executor.cc b/src/request_executor.cc index 65f53710..d78972a5 100644 --- a/src/request_executor.cc +++ b/src/request_executor.cc @@ -48,10 +48,10 @@ MemoryTypeToTritonMemoryType( const PreferredMemory::MemoryType& memory_type) { switch (memory_type) { - case PreferredMemory::MemoryType::CPU: + case PreferredMemory::MemoryType::kCPU: *triton_memory_type = TRITONSERVER_MEMORY_CPU; break; - case PreferredMemory::MemoryType::GPU: + case PreferredMemory::MemoryType::kGPU: *triton_memory_type = TRITONSERVER_MEMORY_GPU; break; @@ -202,7 +202,7 @@ ResponseAlloc( ScopedDefer _([&shm_pool] { shm_pool.release(); }); if (p->preferred_memory.PreferredMemoryType() == - PreferredMemory::MemoryType::DEFAULT) { + PreferredMemory::MemoryType::kDefault) { *actual_memory_type = preferred_memory_type; *actual_memory_type_id = preferred_memory_type_id; } else { diff --git a/src/shm_manager.h b/src/shm_manager.h index 5063273b..25e04570 100644 --- a/src/shm_manager.h +++ b/src/shm_manager.h @@ -26,8 +26,6 @@ #pragma once -#include - #include #include #include @@ -92,9 +90,9 @@ struct AllocatedSharedMemory { // info is placed in the beginning and the actual object is placed after that // (i.e. 4 plus the aligned address is not 16-bytes aligned). The aligned memory // is required by semaphore otherwise it may lead to SIGBUS error on ARM. 
-struct AllocatedShmOwnership { +struct alignas(16) AllocatedShmOwnership { uint32_t ref_count_; -} __attribute__((aligned(16))); +}; class SharedMemoryManager { public: diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index b0627486..a9956b55 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -26,12 +26,18 @@ #include "stub_launcher.h" +#include + #include "python_be.h" +#ifdef _WIN32 +#include // getpid() +#endif + namespace triton { namespace backend { namespace python { StubLauncher::StubLauncher(const std::string stub_process_kind) - : parent_pid_(0), stub_pid_(0), is_initialized_(false), + : parent_pid_(0), is_initialized_(false), stub_process_kind_(stub_process_kind), model_instance_name_(""), device_id_(0), kind_("") { @@ -40,8 +46,7 @@ StubLauncher::StubLauncher(const std::string stub_process_kind) StubLauncher::StubLauncher( const std::string stub_process_kind, const std::string model_instance_name, const int32_t device_id, const std::string kind) - : parent_pid_(0), stub_pid_(0), is_initialized_(false), - stub_process_kind_(stub_process_kind), + : is_initialized_(false), stub_process_kind_(stub_process_kind), model_instance_name_(model_instance_name), device_id_(device_id), kind_(kind) { @@ -65,6 +70,13 @@ StubLauncher::Initialize(ModelState* model_state) if (runtime_modeldir_.empty()) { runtime_modeldir_ = "DEFAULT"; } +#ifdef _WIN32 + ZeroMemory(&startup_info_, sizeof(startup_info_)); + startup_info_.cb = sizeof(startup_info_); + ZeroMemory(&stub_pid_, sizeof(stub_pid_)); +#else + stub_pid_ = 0; +#endif // Atomically increase and read the stub process count to avoid shared memory // region name collision @@ -76,7 +88,8 @@ StubLauncher::Initialize(ModelState* model_state) model_version_ = model_state->Version(); std::stringstream ss; - ss << model_repository_path_ << "/" << model_version_ << "/"; + const char os_slash = std::filesystem::path::preferred_separator; + ss << model_repository_path_ << os_slash << model_version_ << os_slash; std::string artifact_name; RETURN_IF_ERROR(model_state->ModelConfig().MemberAsString( "default_model_filename", &artifact_name)); @@ -89,31 +102,20 @@ StubLauncher::Initialize(ModelState* model_state) model_path_ = ss.str(); - // Path to the extracted Python env - std::string python_execution_env = ""; + // FIXME [DLIS-5969]: Enable for Windows when custom execution environments + // are supported. if (python_execution_env_ != "") { - try { - python_execution_env = - model_state->StateForBackend()->env_manager->ExtractIfNotExtracted( - python_execution_env_); - } - catch (PythonBackendException& pb_exception) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); - } - - path_to_activate_ = python_execution_env + "/bin/activate"; - path_to_libpython_ = python_execution_env + "/lib"; - if (python_execution_env.length() > 0 && !FileExists(path_to_activate_)) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - ("Path " + path_to_activate_ + - " does not exist. 
The Python environment should contain an " - "'activate' script.") - .c_str()); - } +#ifndef _WIN32 + RETURN_IF_ERROR(GetPythonEnvironment(model_state)); +#else + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + "Custom execution environments are not currently supported on " + "Windows."); +#endif } + parent_pid_ = getpid(); return nullptr; @@ -195,6 +197,139 @@ StubLauncher::Setup() return nullptr; } +// FIXME: This should be merged with the Unix launch function once Windows +// CI and functionality are demonstrably stable. The goal of keeping the +// functions separate is to help debug Windows-specific issues without worrying +// about the impact to our Unix builds. +#ifdef _WIN32 +TRITONSERVER_Error* +StubLauncher::Launch() +{ + std::string stub_name; + if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { + stub_name = model_name_; + } else { + stub_name = model_instance_name_; + } + + const char os_slash = std::filesystem::path::preferred_separator; + + const std::string stub_executable_name = "triton_python_backend_stub.exe"; + SanitizePath(model_path_); + SanitizePath(model_repository_path_); + + // Default Python backend stub + std::string python_backend_stub = + python_lib_ + os_slash + stub_executable_name; + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Stub path ") + python_backend_stub).c_str()); + + // Path to alternative Python backend stub + std::string model_python_backend_stub = + std::string(model_repository_path_) + os_slash + stub_executable_name; + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Alt path ") + python_backend_stub).c_str()); + + // Check if file exists + // TODO: Integrate win32 and pb_env + if (FileExists(model_python_backend_stub)) { + python_backend_stub = model_python_backend_stub; + } + + std::string launch_command; + + std::stringstream ss; + ss << python_backend_stub << " " << model_path_ << " " << shm_region_name_ + << " " << shm_default_byte_size_ << " " << shm_growth_byte_size_ << " " + << parent_pid_ << " " << python_lib_ << " " << ipc_control_handle_ << " " + << stub_name << " " << runtime_modeldir_; + launch_command = ss.str(); + + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Starting Python backend stub: ") + launch_command).c_str()); + + LPSTR launch_command_lpstr = const_cast(launch_command.c_str()); + // Start the child process. Unlike fork(), the remainder of this + // function exists in the context of the parent, only. + if (!CreateProcess( + NULL, // No module name (use command line) + launch_command_lpstr, // Command line + NULL, // Process handle not inheritable + NULL, // Thread handle not inheritable + FALSE, // Set handle inheritance to FALSE + 0, // No creation flags + NULL, // Use parent's environment block + NULL, // Use parent's starting directory + &startup_info_, // Pointer to STARTUPINFO structure + &stub_pid_) // Pointer to PROCESS_INFORMATION structure + ) { + std::stringstream ss; + ss << "Failed to run python backend stub. Errno = " << errno << '\n' + << "Python backend stub path: " << python_backend_stub << '\n' + << "Shared Memory Region Name: " << shm_region_name_ << '\n' + << "Shared Memory Default Byte Size: " << shm_default_byte_size_ << '\n' + << "Shared Memory Growth Byte Size: " << shm_growth_byte_size_ << '\n'; + // Print the error message directly because the underlying mutexes in + // LOG_MESSAGE() could be forked when it is locked by other thread(s). 
+ std::cerr << '\n' << ss.str() << '\n'; + _Exit(1); + } + ScopedDefer _([&] { + // Push a dummy message to the message queue so that the stub + // process is notified that it can release the object stored in + // shared memory. + stub_message_queue_->Push(DUMMY_MESSAGE); + + // If the model is not initialized, wait for the stub process to exit. + if (!is_initialized_) { + stub_message_queue_.reset(); + parent_message_queue_.reset(); + memory_manager_.reset(); + WaitForStubProcess(); + } + }); + + // The stub process would send two messages to the parent process during the + // initialization. + // 1. When the stub process's health monitoring thread has started. + // 2. When the initialization is fully completed and the Python model is + // loaded. + // + // The reason it is broken into two steps is that creation of the health + // monitoring thread may take longer which can make the server process think + // that the stub process is unhealthy and return early. Waiting until the + // health thread is spawn would make sure would prevent this issue. + parent_message_queue_->Pop(); + + if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { + try { + AutocompleteStubProcess(); + } + catch (const PythonBackendException& ex) { + // Need to kill the stub process first + KillStubProcess(); + throw BackendModelException( + TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, ex.what())); + } + } else if (stub_process_kind_ == "MODEL_INSTANCE_STUB") { + RETURN_IF_ERROR(ModelInstanceStubProcess()); + } else { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("Unknown stub_process_kind: ") + stub_process_kind_) + .c_str()); + } + + is_initialized_ = true; + + return nullptr; +} +#else TRITONSERVER_Error* StubLauncher::Launch() { @@ -307,11 +442,10 @@ StubLauncher::Launch() // If the model is not initialized, wait for the stub process to exit. if (!is_initialized_) { - int status; stub_message_queue_.reset(); parent_message_queue_.reset(); memory_manager_.reset(); - waitpid(stub_pid_, &status, 0); + WaitForStubProcess(); } }); @@ -335,10 +469,7 @@ StubLauncher::Launch() } catch (const PythonBackendException& ex) { // Need to kill the stub process first - kill(stub_pid_, SIGKILL); - int status; - waitpid(stub_pid_, &status, 0); - stub_pid_ = 0; + KillStubProcess(); throw BackendModelException( TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, ex.what())); } @@ -357,6 +488,34 @@ StubLauncher::Launch() return nullptr; } +TRITONSERVER_Error* +StubLauncher::GetPythonEnvironment(ModelState* model_state) +{ + std::string python_execution_env = ""; + try { + python_execution_env = + model_state->StateForBackend()->env_manager->ExtractIfNotExtracted( + python_execution_env_); + } + catch (PythonBackendException& pb_exception) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); + } + + path_to_activate_ = python_execution_env + "/bin/activate"; + path_to_libpython_ = python_execution_env + "/lib"; + if (python_execution_env.length() > 0 && !FileExists(path_to_activate_)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + ("Path " + path_to_activate_ + + " does not exist. 
The Python environment should contain an " + "'activate' script.") + .c_str()); + } + return nullptr; +} +#endif + void StubLauncher::AutocompleteStubProcess() { @@ -473,6 +632,18 @@ StubLauncher::ModelInstanceStubProcess() return nullptr; } +bool +StubLauncher::StubActive() +{ +#ifdef _WIN32 + DWORD ec; + GetExitCodeProcess(stub_pid_.hProcess, &ec); + return (ec == STILL_ACTIVE); +#else + return (stub_pid_ != 0); +#endif +} + void StubLauncher::UpdateHealth() { @@ -483,9 +654,13 @@ StubLauncher::UpdateHealth() ipc_control_->stub_health = false; } - // Sleep 1 second so that the child process has a chance to change the - // health variable +// Sleep 1 second so that the child process has a chance to change the +// health variable +#ifdef _WIN32 + Sleep(1); +#else sleep(1); +#endif { bi::scoped_lock lock(*health_mutex_); @@ -515,11 +690,11 @@ StubLauncher::TerminateStub() force_kill = true; } - int status; if (force_kill) { - kill(stub_pid_, SIGKILL); + KillStubProcess(); + } else { + WaitForStubProcess(); } - waitpid(stub_pid_, &status, 0); } // First destroy the IPCControl. This makes sure that IPCControl is @@ -540,10 +715,16 @@ StubLauncher::ClearQueues() void StubLauncher::KillStubProcess() { +#ifdef _WIN32 + unsigned int exit_code; + TerminateProcess(stub_pid_.hProcess, exit_code); + CloseHandle(stub_pid_.hProcess); + CloseHandle(stub_pid_.hThread); +#else kill(stub_pid_, SIGKILL); - int status; - waitpid(stub_pid_, &status, 0); + WaitForStubProcess(); stub_pid_ = 0; +#endif } TRITONSERVER_Error* @@ -600,6 +781,19 @@ StubLauncher::ReceiveMessageFromStub( return nullptr; // success } +void +StubLauncher::WaitForStubProcess() +{ +#ifdef _WIN32 + WaitForSingleObject(stub_pid_.hProcess, INFINITE); + CloseHandle(stub_pid_.hProcess); + CloseHandle(stub_pid_.hThread); +#else + int status; + waitpid(stub_pid_, &status, 0); +#endif +} + #ifdef TRITON_ENABLE_GPU void StubLauncher::ShareCUDAMemoryPool( diff --git a/src/stub_launcher.h b/src/stub_launcher.h index fbbbdbad..6c8dd910 100644 --- a/src/stub_launcher.h +++ b/src/stub_launcher.h @@ -26,8 +26,6 @@ #pragma once -#include - #include #include #include @@ -79,8 +77,8 @@ class StubLauncher { // Model instance stub process TRITONSERVER_Error* ModelInstanceStubProcess(); - // Stub PID - pid_t StubPid() { return stub_pid_; } + // Check if Stub PID is active + bool StubActive(); // Health mutex bi::interprocess_mutex* HealthMutex() { return health_mutex_; } @@ -151,6 +149,14 @@ class StubLauncher { TRITONSERVER_Error* ReceiveMessageFromStub( bi::managed_external_buffer::handle_t& message); + // Wait for stub process + void WaitForStubProcess(); + +#ifndef _WIN32 + // FIXME [DLIS-5969]: Enable for Windows when custom execution environments + // are supported. 
+ TRITONSERVER_Error* GetPythonEnvironment(ModelState* model_state); +#endif #ifdef TRITON_ENABLE_GPU // Share CUDA memory pool with stub process void ShareCUDAMemoryPool( @@ -158,9 +164,14 @@ class StubLauncher { #endif // TRITON_ENABLE_GPU private: +#ifdef _WIN32 + STARTUPINFO startup_info_; + DWORD parent_pid_; + PROCESS_INFORMATION stub_pid_; +#else pid_t parent_pid_; pid_t stub_pid_; - +#endif bool is_initialized_; bool is_decoupled_; bool is_healthy_; From 34a4db57d971ab66bc2302a35f944ee9471508e2 Mon Sep 17 00:00:00 2001 From: Kyle McGill <101670481+nv-kmcgill53@users.noreply.github.com> Date: Wed, 6 Mar 2024 14:06:57 -0800 Subject: [PATCH 175/216] patching git repository parameterization from production branch 1 (#341) Co-authored-by: kyle --- CMakeLists.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bc5387ef..dacd0f9c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,6 +47,7 @@ if(WIN32) set(TRITON_ENABLE_GPU OFF CACHE BOOL "GPU disabled" FORCE) endif() +set(TRITON_REPO_ORGANIZATION "https://github.com/triton-inference-server" CACHE STRING "Git repository to pull from") set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo") set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") @@ -69,17 +70,17 @@ include(ExternalProject) FetchContent_Declare( repo-common - GIT_REPOSITORY https://github.com/triton-inference-server/common.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git GIT_TAG ${TRITON_COMMON_REPO_TAG} ) FetchContent_Declare( repo-core - GIT_REPOSITORY https://github.com/triton-inference-server/core.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git GIT_TAG ${TRITON_CORE_REPO_TAG} ) FetchContent_Declare( repo-backend - GIT_REPOSITORY https://github.com/triton-inference-server/backend.git + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git GIT_TAG ${TRITON_BACKEND_REPO_TAG} ) FetchContent_MakeAvailable(repo-common repo-core repo-backend) From 0413e46bdbaca09541afa181586c60924ff18ae1 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Thu, 7 Mar 2024 09:55:44 +0800 Subject: [PATCH 176/216] Remove $ (#343) --- examples/auto_complete/README.md | 10 +++--- examples/bls/README.md | 38 +++++++++++----------- examples/decoupled/README.md | 14 ++++---- examples/jax/README.md | 24 +++++++------- examples/preprocessing/README.md | 56 ++++++++++++++++++++++++-------- inferentia/README.md | 24 +++++++------- 6 files changed, 97 insertions(+), 69 deletions(-) diff --git a/examples/auto_complete/README.md b/examples/auto_complete/README.md index f530da3a..b07e065c 100644 --- a/examples/auto_complete/README.md +++ b/examples/auto_complete/README.md @@ -1,5 +1,5 @@ + # **Preprocessing Using Python Backend Example** This example shows how to preprocess your inputs using Python backend before it is passed to the TensorRT model for inference. This ensemble model includes an image preprocessing model (preprocess) and a TensorRT model (resnet50_trt) to do inference. @@ -5,39 +33,39 @@ This example shows how to preprocess your inputs using Python backend before it Run onnx_exporter.py to convert ResNet50 PyTorch model to ONNX format. Width and height dims are fixed at 224 but dynamic axes arguments for dynamic batching are used. Commands from the 2. and 3. subsections shall be executed within this Docker container. 
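For reference, the export boils down to a single `torch.onnx.export` call that keeps
the 3x224x224 image shape fixed and marks only the batch dimension as dynamic. The
snippet below is an illustrative sketch rather than the exact contents of
`onnx_exporter.py`; the output tensor name and the argument handling are assumptions,
while the `input` name matches the shapes passed to trtexec in step 3.

```
import argparse

import torch
import torchvision.models as models

parser = argparse.ArgumentParser()
parser.add_argument("--save", default="model.onnx", help="Path for the exported ONNX file")
args = parser.parse_args()

# Use pretrained=True instead of weights=... on torchvision releases older than 0.13.
model = models.resnet50(weights="IMAGENET1K_V1").eval()

# Height and width are fixed at 224; only the batch dimension is left dynamic.
dummy_input = torch.randn(1, 3, 224, 224)

torch.onnx.export(
    model,
    dummy_input,
    args.save,
    input_names=["input"],
    output_names=["output"],
    # The dynamic batch axis is what lets trtexec build the engine with
    # different --minShapes/--optShapes/--maxShapes batch sizes in step 3.
    dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},
)
```

Exporting with a dynamic `batch_size` axis is what allows the TensorRT engine built in
step 3 to serve requests whose batch size varies between the min and max shapes.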
- $ docker run -it --gpus=all -v $(pwd):/workspace nvcr.io/nvidia/pytorch:xx.yy-py3 bash - $ pip install numpy pillow torchvision - $ python onnx_exporter.py --save model.onnx + docker run -it --gpus=all -v $(pwd):/workspace nvcr.io/nvidia/pytorch:xx.yy-py3 bash + pip install numpy pillow torchvision + python onnx_exporter.py --save model.onnx **2. Create the model repository:** - $ mkdir -p model_repository/ensemble_python_resnet50/1 - $ mkdir -p model_repository/preprocess/1 - $ mkdir -p model_repository/resnet50_trt/1 + mkdir -p model_repository/ensemble_python_resnet50/1 + mkdir -p model_repository/preprocess/1 + mkdir -p model_repository/resnet50_trt/1 # Copy the Python model - $ cp model.py model_repository/preprocess/1 + cp model.py model_repository/preprocess/1 **3. Build a TensorRT engine for the ONNX model** Set the arguments for enabling fp16 precision --fp16. To enable dynamic shapes use --minShapes, --optShapes, and maxShapes with --explicitBatch: - $ trtexec --onnx=model.onnx --saveEngine=./model_repository/resnet50_trt/1/model.plan --explicitBatch --minShapes=input:1x3x224x224 --optShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --fp16 + trtexec --onnx=model.onnx --saveEngine=./model_repository/resnet50_trt/1/model.plan --explicitBatch --minShapes=input:1x3x224x224 --optShapes=input:1x3x224x224 --maxShapes=input:256x3x224x224 --fp16 **4. Run the command below to start the server container:** Under python_backend/examples/preprocessing, run this command to start the server docker container: - $ docker run --gpus=all -it --rm -p8000:8000 -p8001:8001 -p8002:8002 -v$(pwd):/workspace/ -v/$(pwd)/model_repository:/models nvcr.io/nvidia/tritonserver:xx.yy-py3 bash - $ pip install numpy pillow torchvision - $ tritonserver --model-repository=/models + docker run --gpus=all -it --rm -p8000:8000 -p8001:8001 -p8002:8002 -v$(pwd):/workspace/ -v/$(pwd)/model_repository:/models nvcr.io/nvidia/tritonserver:xx.yy-py3 bash + pip install numpy pillow torchvision + tritonserver --model-repository=/models **5. Start the client to test:** Under python_backend/examples/preprocessing, run the commands below to start the client Docker container: - $ wget https://raw.githubusercontent.com/triton-inference-server/server/main/qa/images/mug.jpg -O "mug.jpg" - $ docker run --rm --net=host -v $(pwd):/workspace/ nvcr.io/nvidia/tritonserver:xx.yy-py3-sdk python client.py --image mug.jpg - $ The result of classification is:COFFEE MUG + wget https://raw.githubusercontent.com/triton-inference-server/server/main/qa/images/mug.jpg -O "mug.jpg" + docker run --rm --net=host -v $(pwd):/workspace/ nvcr.io/nvidia/tritonserver:xx.yy-py3-sdk python client.py --image mug.jpg + The result of classification is:COFFEE MUG Here, since we input an image of "mug" and the inference result is "COFFEE MUG" which is correct. diff --git a/inferentia/README.md b/inferentia/README.md index 6a90740d..381c8ed8 100644 --- a/inferentia/README.md +++ b/inferentia/README.md @@ -60,18 +60,18 @@ or simply clone with https. Clone this repo with Github to home repo `/home/ubuntu`. 
``` - $chmod 777 /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh - $sudo /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh + chmod 777 /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh + sudo /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh ``` Then, start the Triton instance with: ``` - $docker run --device /dev/neuron0 -v /home/ubuntu/python_backend:/home/ubuntu/python_backend -v /lib/udev:/mylib/udev --shm-size=1g --ulimit memlock=-1 -p 8000:8000 -p 8001:8001 -p 8002:8002 --ulimit stack=67108864 -ti nvcr.io/nvidia/tritonserver:-py3 + docker run --device /dev/neuron0 -v /home/ubuntu/python_backend:/home/ubuntu/python_backend -v /lib/udev:/mylib/udev --shm-size=1g --ulimit memlock=-1 -p 8000:8000 -p 8001:8001 -p 8002:8002 --ulimit stack=67108864 -ti nvcr.io/nvidia/tritonserver:-py3 ``` Note 1: The user would need to list any neuron device to run during container initialization. For example, to use 4 neuron devices on an instance, the user would need to run with: ``` - $docker run --device /dev/neuron0 --device /dev/neuron1 --device /dev/neuron2 --device /dev/neuron3 ...` + docker run --device /dev/neuron0 --device /dev/neuron1 --device /dev/neuron2 --device /dev/neuron3 ...` ``` Note 2: `/mylib/udev` is used for Neuron parameter passing. @@ -81,7 +81,7 @@ Note 3: For Triton container version xx.yy, please refer to After starting the Triton container, go into the `python_backend` folder and run the setup script. ``` - $source /home/ubuntu/python_backend/inferentia/scripts/setup.sh + source /home/ubuntu/python_backend/inferentia/scripts/setup.sh ``` This script will: 1. Install necessary dependencies @@ -118,7 +118,7 @@ triton python model directory. An example invocation for the `gen_triton_model.py` for PyTorch model can look like: ``` - $python3 inferentia/scripts/gen_triton_model.py --model_type pytorch --triton_input INPUT__0,INT64,4x384 INPUT__1,INT64,4x384 INPUT__2,INT64,4x384 --triton_output OUTPUT__0,INT64,4x384 OUTPUT__1,INT64,4x384 --compiled_model /home/ubuntu/bert_large_mlperf_neuron_hack_bs1_dynamic.pt --neuron_core_range 0:3 --triton_model_dir bert-large-mlperf-bs1x4 + python3 inferentia/scripts/gen_triton_model.py --model_type pytorch --triton_input INPUT__0,INT64,4x384 INPUT__1,INT64,4x384 INPUT__2,INT64,4x384 --triton_output OUTPUT__0,INT64,4x384 OUTPUT__1,INT64,4x384 --compiled_model /home/ubuntu/bert_large_mlperf_neuron_hack_bs1_dynamic.pt --neuron_core_range 0:3 --triton_model_dir bert-large-mlperf-bs1x4 ``` In order for the script to treat the compiled model as TorchScript @@ -161,7 +161,7 @@ script to generate triton python model directory. An example invocation for the `gen_triton_model.py` for TensorFlow model can look like: ``` - $python3 gen_triton_model.py --model_type tensorflow --compiled_model /home/ubuntu/inferentia-poc-2.0/scripts-rn50-tf-native/resnet50_mlperf_opt_fp16_compiled_b5_nc1/1 --neuron_core_range 0:3 --triton_model_dir rn50-1neuroncores-bs1x1 + python3 gen_triton_model.py --model_type tensorflow --compiled_model /home/ubuntu/inferentia-poc-2.0/scripts-rn50-tf-native/resnet50_mlperf_opt_fp16_compiled_b5_nc1/1 --neuron_core_range 0:3 --triton_model_dir rn50-1neuroncores-bs1x1 ``` NOTE: Unlike TorchScript model, TensorFlow SavedModel stores sufficient @@ -215,7 +215,7 @@ a valid torchscript file or tensorflow savedmodel. 
Now, the server can be launched with the model as below: ``` - $tritonserver --model-repository + tritonserver --model-repository ``` Note: @@ -255,7 +255,7 @@ contains the necessary files to set up testing with a simple add_sub model. The requires an instance with more than 8 inferentia cores to run, eg:`inf1.6xlarge`. start the test, run ``` - $source /python_backend/inferentia/qa/setup_test_enviroment_and_test.sh + source /python_backend/inferentia/qa/setup_test_enviroment_and_test.sh ``` where `` is usually `/home/ubuntu`/. This script will pull the [server repo](https://github.com/triton-inference-server/server) @@ -265,7 +265,7 @@ Triton Server and Triton SDK. Note: If you would need to change some of the tests in the server repo, you would need to run ``` - $export TRITON_SERVER_REPO_TAG= + export TRITON_SERVER_REPO_TAG= ``` before running the script. @@ -273,8 +273,8 @@ before running the script. ## pytorch-neuronx and tensorflow-neuronx 1. Similar to the steps for inf1, change the argument to the pre-container and on-container setup scripts to include the `-inf2` or `-trn1`flags e.g., ``` - $chmod 777 /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh - $sudo /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh -inf2 + chmod 777 /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh + sudo /home/ubuntu/python_backend/inferentia/scripts/setup-pre-container.sh -inf2 ``` 2. On the container, followed by the `docker run` command, you can pass similar argument to the setup.sh script For Pytorch: From 8917c86a4f6face7b55319c6ca08dbd4378feef6 Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Tue, 12 Mar 2024 00:58:01 +0530 Subject: [PATCH 177/216] Add Correlation Id string support for BLS (#344) * Add correlation id string support for BLS --- CMakeLists.txt | 4 +- README.md | 8 ++- src/correlation_id.cc | 120 ++++++++++++++++++++++++++++++++++++++++ src/correlation_id.h | 93 +++++++++++++++++++++++++++++++ src/infer_request.cc | 82 +++++++++++---------------- src/infer_request.h | 15 +++-- src/pb_stub.cc | 32 +++++++++-- src/python_be.cc | 19 +++++-- src/request_executor.cc | 13 ++++- 9 files changed, 316 insertions(+), 70 deletions(-) create mode 100644 src/correlation_id.cc create mode 100644 src/correlation_id.h diff --git a/CMakeLists.txt b/CMakeLists.txt index dacd0f9c..92b785bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -149,6 +149,8 @@ configure_file(src/libtriton_python.ldscript libtriton_python.ldscript COPYONLY) set( COMMON_SRCS + src/correlation_id.cc + src/correlation_id.h src/infer_response.cc src/infer_response.h src/infer_request.cc diff --git a/README.md b/README.md index 9182ae37..1b94d6b7 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -# **Preprocessing Using Python Backend Example** +# Preprocessing Using Python Backend Example This example shows how to preprocess your inputs using Python backend before it is passed to the TensorRT model for inference. This ensemble model includes an image preprocessing model (preprocess) and a TensorRT model (resnet50_trt) to do inference. **1. 
Converting PyTorch Model to ONNX format:** diff --git a/inferentia/README.md b/inferentia/README.md index 381c8ed8..fb0de4f7 100644 --- a/inferentia/README.md +++ b/inferentia/README.md @@ -34,7 +34,7 @@ and the [Neuron Runtime](https://awsdocs-neuron.readthedocs-hosted.com/en/latest ## Table of Contents -- [Using Triton with Inferentia](#using-triton-with-inferentia) +- [Using Triton with Inferentia 1](#using-triton-with-inferentia-1) - [Table of Contents](#table-of-contents) - [Inferentia setup](#inferentia-setup) - [Setting up the Inferentia model](#setting-up-the-inferentia-model) From 4d4211151d716e2a534ab1b8e8413d3c66967723 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 8 Apr 2024 13:22:08 -0400 Subject: [PATCH 182/216] Randomize Python backend shared memory region naming (#351) * Fix deprecated client package * Randomize Python backend shared memory region naming * Update docs --- README.md | 12 +++++++----- examples/preprocessing/client.py | 2 +- src/pb_utils.cc | 11 ++++++++++- src/pb_utils.h | 5 +++++ src/python_be.cc | 1 - src/stub_launcher.cc | 5 +---- 6 files changed, 24 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 93fd212f..1bc9bd9b 100644 --- a/README.md +++ b/README.md @@ -1067,11 +1067,13 @@ will create additional threads instead of spawning separate processes. ## Running Multiple Instances of Triton Server -Python backend uses shared memory to transfer requests to the stub process. -When running multiple instances of Triton Server on the same machine that use -Python models, there would be shared memory region name conflicts that can -result in segmentation faults or hangs. In order to avoid this issue, you need -to specify different `shm-region-prefix-name` using the `--backend-config` flag. +Starting from 24.04 release, Python backend uses UUID to generate unique +names for Python backend shared memory regions so that multiple instances of +the server can run at the same time without any conflicts. + +If you're using a Python backend released before the 24.04 release, you need +to specify different `shm-region-prefix-name` using the `--backend-config` flag +to avoid conflicts between the shared memory regions. 
For example: ``` # Triton instance 1 diff --git a/examples/preprocessing/client.py b/examples/preprocessing/client.py index 202d411a..1ac107af 100644 --- a/examples/preprocessing/client.py +++ b/examples/preprocessing/client.py @@ -29,7 +29,7 @@ import sys import numpy as np -import tritongrpcclient +import tritonclient.grpc as tritongrpcclient def load_image(img_path: str): diff --git a/src/pb_utils.cc b/src/pb_utils.cc index 7bc17fa4..809531b8 100644 --- a/src/pb_utils.cc +++ b/src/pb_utils.cc @@ -314,4 +314,13 @@ WrapTritonErrorInSharedPtr(TRITONSERVER_Error* error) return response_error; } #endif // NOT TRITON_PB_STUB -}}} // namespace triton::backend::python + +std::string +GenerateUUID() +{ + static boost::uuids::random_generator generator; + boost::uuids::uuid uuid = generator(); + return boost::uuids::to_string(uuid); +} + +}}} // namespace triton::backend::python diff --git a/src/pb_utils.h b/src/pb_utils.h index 6d5f21ce..1a6c2d8b 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -32,6 +32,9 @@ #include #include +#include +#include +#include #include #include #include @@ -335,4 +338,6 @@ std::shared_ptr WrapTritonErrorInSharedPtr( TRITONSERVER_Error* error); #endif +std::string GenerateUUID(); + }}} // namespace triton::backend::python diff --git a/src/python_be.cc b/src/python_be.cc index 57e6cffd..b688fdfd 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -2131,7 +2131,6 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) backend_state->shm_growth_byte_size = 1 * 1024 * 1024; // 1 MB backend_state->stub_timeout_seconds = 30; backend_state->shm_message_queue_size = 1000; - backend_state->number_of_instance_inits = 0; backend_state->thread_pool_size = 32; // Initialize shared memory region prefix to include backend's name // to avoid collision between python backend and python-based backends. diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index a9956b55..9dc2a64a 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -78,12 +78,9 @@ StubLauncher::Initialize(ModelState* model_state) stub_pid_ = 0; #endif - // Atomically increase and read the stub process count to avoid shared memory - // region name collision - int num_init = ++model_state->StateForBackend()->number_of_instance_inits; shm_region_name_ = model_state->StateForBackend()->shared_memory_region_prefix + - std::to_string(num_init); + GenerateUUID(); model_version_ = model_state->Version(); From 0cdcaf3f0ff3fe2f0449c269a15b62899813ccd0 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Thu, 11 Apr 2024 10:55:39 -0700 Subject: [PATCH 183/216] Decoupled Async Execute (#350) * Add async decoupled execute * Enable decoupled bls async exec * Improve handling for async execute future object * Add docs for async execute for decoupled model * Fix link on docs * Improve docs wording * Improve destruction steps for async execute future object * Piggy back on GIL for protection * Document model should not modify event loop * Use Python add_done_callback * Protect infer_payload_ * Use traceback API that supports Python 3.8 and 3.9 * Update docs --- README.md | 21 +++++++++-- src/pb_stub.cc | 90 ++++++++++++++++++++++++++++++++++++++++++------ src/pb_stub.h | 7 +++- src/python_be.cc | 2 ++ src/python_be.h | 1 + 5 files changed, 107 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 1bc9bd9b..7f9c7027 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ any C++ code. 
- [Request Cancellation Handling](#request-cancellation-handling) - [Decoupled mode](#decoupled-mode) - [Use Cases](#use-cases) - - [Known Issues](#known-issues) + - [Async Execute](#async-execute) - [Request Rescheduling](#request-rescheduling) - [`finalize`](#finalize) - [Model Config File](#model-config-file) @@ -620,9 +620,24 @@ full power of what can be achieved from decoupled API. Read [Decoupled Backends and Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md) for more details on how to host a decoupled model. -##### Known Issues +##### Async Execute -* Currently, decoupled Python models can not make async infer requests. +Starting from 24.04, `async def execute(self, requests):` is supported for +decoupled Python models. Its coroutine will be executed by an AsyncIO event loop +shared with requests executing in the same model instance. The next request for +the model instance can start executing while the current request is waiting. + +This is useful for minimizing the number of model instances for models that +spend the majority of its time waiting, given requests can be executed +concurrently by AsyncIO. To take full advantage of the concurrency, it is vital +for the async execute function to not block the event loop from making progress +while it is waiting, i.e. downloading over the network. + +Notes: +* The model should not modify the running event loop, as this might cause +unexpected issues. +* The server/backend do not control how many requests are added to the event +loop by a model instance. #### Request Rescheduling diff --git a/src/pb_stub.cc b/src/pb_stub.cc index a9a910a1..b12e249d 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -104,6 +104,32 @@ PyDefaultArgumentToMutableType(const py::object& argument) std::string(py::str(argument.get_type()))); } +void +AsyncEventFutureDoneCallback(const py::object& py_future) +{ + // TODO: Why using `py_future.result()` with error hangs on exit? 
+ try { + py::object exception = py_future.attr("exception")(); + if (!py::isinstance(exception)) { + std::string err_msg = ""; + py::object traceback = py::module_::import("traceback") + .attr("TracebackException") + .attr("from_exception")(exception) + .attr("format")(); + for (py::handle line : traceback) { + err_msg += py::str(line); + } + LOG_ERROR << err_msg; + } + } + catch (const PythonBackendException& pb_exception) { + LOG_ERROR << pb_exception.what(); + } + catch (const py::error_already_set& error) { + LOG_ERROR << error.what(); + } +} + void Stub::Instantiate( int64_t shm_growth_size, int64_t shm_default_size, @@ -533,6 +559,8 @@ Stub::Initialize(bi::managed_external_buffer::handle_t map_handle) c_python_backend_utils.attr("InferenceResponse")); c_python_backend_utils.attr("shared_memory") = py::cast(shm_pool_.get()); + async_event_loop_ = py::none(); + py::object TritonPythonModel = sys.attr("TritonPythonModel"); deserialize_bytes_ = python_backend_utils.attr("deserialize_bytes_tensor"); serialize_bytes_ = python_backend_utils.attr("serialize_byte_tensor"); @@ -690,11 +718,18 @@ Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr) py::object execute_return = model_instance_.attr("execute")(py_request_list); - if (!py::isinstance(execute_return)) { - throw PythonBackendException( - "Python model '" + name_ + - "' is using the decoupled mode and the execute function must " - "return None."); + bool is_coroutine = py::module::import("asyncio") + .attr("iscoroutine")(execute_return) + .cast(); + if (is_coroutine) { + RunCoroutine(execute_return); + } else { + if (!py::isinstance(execute_return)) { + throw PythonBackendException( + "Python model '" + name_ + + "' is using the decoupled mode and the execute function must " + "return None."); + } } } } @@ -870,6 +905,35 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) } } +py::object +Stub::GetAsyncEventLoop() +{ + if (py::isinstance(async_event_loop_)) { + // Create the event loop if not already. + py::module asyncio = py::module_::import("asyncio"); + async_event_loop_ = asyncio.attr("new_event_loop")(); + asyncio.attr("set_event_loop")(async_event_loop_); + py::object py_thread = + py::module_::import("threading") + .attr("Thread")( + "target"_a = async_event_loop_.attr("run_forever"), + "daemon"_a = true); + py_thread.attr("start")(); + } + return async_event_loop_; +} + +void +Stub::RunCoroutine(py::object coroutine) +{ + py::object loop = GetAsyncEventLoop(); + py::object py_future = py::module_::import("asyncio").attr( + "run_coroutine_threadsafe")(coroutine, loop); + py_future.attr("add_done_callback")( + py::module_::import("c_python_backend_utils") + .attr("async_event_future_done_callback")); +} + void Stub::UpdateHealth() { @@ -881,6 +945,10 @@ void Stub::Finalize() { finalizing_ = true; + // Stop async event loop if created. + if (!py::isinstance(async_event_loop_)) { + async_event_loop_.attr("stop")(); + } // Call finalize if exists. 
if (initialized_ && py::hasattr(model_instance_, "finalize")) { try { @@ -943,6 +1011,7 @@ Stub::~Stub() { py::gil_scoped_acquire acquire; + async_event_loop_ = py::none(); model_instance_ = py::none(); } stub_instance_.reset(); @@ -1729,11 +1798,6 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) [](std::shared_ptr& infer_request, const bool decoupled) { std::unique_ptr& stub = Stub::GetOrCreateInstance(); - if (stub->IsDecoupled()) { - throw PythonBackendException( - "Async BLS request execution is not support in the decoupled " - "API."); - } py::object loop = py::module_::import("asyncio").attr("get_running_loop")(); py::cpp_function callback = [&stub, infer_request, decoupled]() { @@ -1860,6 +1924,12 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) "is_model_ready", &IsModelReady, py::arg("model_name").none(false), py::arg("model_version").none(false) = ""); + // This function is not part of the public API for Python backend. This is + // only used for internal callbacks. + module.def( + "async_event_future_done_callback", &AsyncEventFutureDoneCallback, + py::arg("py_future").none(false)); + // This class is not part of the public API for Python backend. This is only // used for internal testing purposes. py::class_(module, "SharedMemory") diff --git a/src/pb_stub.h b/src/pb_stub.h index a51f25f5..c9462fd0 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -255,6 +255,10 @@ class Stub { void ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr); + py::object GetAsyncEventLoop(); + + void RunCoroutine(py::object coroutine); + /// Get the memory manager message queue std::unique_ptr>& MemoryManagerQueue(); @@ -363,6 +367,7 @@ class Stub { py::object model_instance_; py::object deserialize_bytes_; py::object serialize_bytes_; + py::object async_event_loop_; std::unique_ptr> stub_message_queue_; std::unique_ptr> diff --git a/src/python_be.cc b/src/python_be.cc index b688fdfd..b95fb715 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -768,6 +768,7 @@ ModelInstanceState::ExecuteBLSRequest( if (is_decoupled && (infer_response->Id() != nullptr)) { // Need to manage the lifetime of InferPayload object for bls // decoupled responses. + std::lock_guard lock(infer_payload_mu_); infer_payload_[reinterpret_cast(infer_payload.get())] = infer_payload; } @@ -961,6 +962,7 @@ ModelInstanceState::ProcessCleanupRequest( intptr_t id = reinterpret_cast(cleanup_message_ptr->id); if (message->Command() == PYTHONSTUB_BLSDecoupledInferPayloadCleanup) { // Remove the InferPayload object from the map. 
+ std::lock_guard lock(infer_payload_mu_); infer_payload_.erase(id); } else if (message->Command() == PYTHONSTUB_DecoupledResponseFactoryCleanup) { // Delete response factory diff --git a/src/python_be.h b/src/python_be.h index 4430767c..9618204c 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -296,6 +296,7 @@ class ModelInstanceState : public BackendModelInstance { std::vector> futures_; std::unique_ptr thread_pool_; std::unordered_map> infer_payload_; + std::mutex infer_payload_mu_; std::unique_ptr request_executor_; public: From ad4a44014dda78c2df48b3209dd23bb016a24369 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Tue, 16 Apr 2024 11:31:26 -0700 Subject: [PATCH 184/216] Reset async_event_loop_ only if initialized (#354) --- src/pb_stub.cc | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/pb_stub.cc b/src/pb_stub.cc index b12e249d..56d466f5 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -945,17 +945,19 @@ void Stub::Finalize() { finalizing_ = true; - // Stop async event loop if created. - if (!py::isinstance(async_event_loop_)) { - async_event_loop_.attr("stop")(); - } - // Call finalize if exists. - if (initialized_ && py::hasattr(model_instance_, "finalize")) { - try { - model_instance_.attr("finalize")(); + if (initialized_) { + // Stop async event loop if created. + if (!py::isinstance(async_event_loop_)) { + async_event_loop_.attr("stop")(); } - catch (const py::error_already_set& e) { - LOG_INFO << e.what(); + // Call finalize if exists. + if (py::hasattr(model_instance_, "finalize")) { + try { + model_instance_.attr("finalize")(); + } + catch (const py::error_already_set& e) { + LOG_INFO << e.what(); + } } } #ifdef TRITON_ENABLE_GPU From b7a069083ecf16020d7144fa596a6ed8f36559b6 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Tue, 16 Apr 2024 18:12:05 -0400 Subject: [PATCH 185/216] Add vscode configurations to make development easier (#352) * Add vscode configurations to make development easier * Review comment * Fix merge conflict * Fix permission * Update dockerfile * Fix username * Review comments * Add link * Review edit --- .devcontainer/Dockerfile | 48 +++++++++++++++++++ .devcontainer/devcontainer.json | 26 ++++++++++ .gitignore | 1 - .vscode/tasks.json | 85 +++++++++++++++++++++++++++++++++ README.md | 12 +++++ 5 files changed, 171 insertions(+), 1 deletion(-) create mode 100644 .devcontainer/Dockerfile create mode 100644 .devcontainer/devcontainer.json create mode 100644 .vscode/tasks.json diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 00000000..737725bb --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,48 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +FROM nvcr.io/nvidia/tritonserver:24.03-py3 + +ARG USERNAME=triton-server + +RUN apt-get update \ + && apt-get install -y sudo + +RUN pip3 install transformers torch + +# Create the user +RUN apt-get update \ + && apt-get install -y sudo \ + && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \ + && chmod 0440 /etc/sudoers.d/$USERNAME + +RUN pip3 install pre-commit ipdb + +RUN mkhomedir_helper triton-server + +RUN apt-get install -y cmake rapidjson-dev + +USER ${USERNAME} diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 00000000..e1b8bd10 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,26 @@ +{ + "name": "Python Backend", + + "build": { + "dockerfile": "Dockerfile" + }, + "customizations": { + "vscode": { + "extensions": [ + "ms-python.vscode-pylance", + "ms-python.python", + "ms-vscode.cpptools-extension-pack", + "ms-vscode.cmake-tools", + "github.vscode-pull-request-github" + ] + } + }, + "postCreateCommand": "sudo chown -R triton-server:triton-server ~/.cache", + + "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined", "--gpus=all", "--shm-size=2g", "--ulimit", "stack=67108864" ], + "mounts": [ + "source=${localEnv:HOME}/.ssh,target=/home/triton-server/.ssh,type=bind,consistency=cached", + "source=${localEnv:HOME}/.cache/huggingface,target=/home/triton-server/.cache/huggingface,type=bind,consistency=cached" + ], + "remoteUser": "triton-server" +} diff --git a/.gitignore b/.gitignore index bf7e1686..293f6455 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ /build -/.vscode *.so builddir diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 00000000..597a746d --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,85 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "Configure", + "type": "shell", + "command": "cmake", + "args": [ + "-DCMAKE_INSTALL_PREFIX:STRING=/opt/tritonserver/", + "-DTRITON_COMMON_REPO_TAG:STRING=main", + "-DTRITON_BACKEND_REPO_TAG:STRING=main", + "-DTRITON_CORE_REPO_TAG:STRING=main", + "-DTRITON_ENABLE_GPU:STRING=ON", + "-DTRITON_ENABLE_NVTX:STRING=ON", + "-DCMAKE_INSTALL_PREFIX:STRING=${workspaceFolder}/build/install", + "-DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE", + "-DCMAKE_BUILD_TYPE:STRING=Debug", + "-DCMAKE_C_COMPILER:FILEPATH=/usr/bin/gcc", + "-DCMAKE_CXX_COMPILER:FILEPATH=/usr/bin/g++", + "-S${workspaceFolder}", + "-B${workspaceFolder}/build", + "-G", + "Unix Makefiles" + ], + "problemMatcher": [] + }, + { + "label": "Build", + "type": "shell", + "command": "cmake", + "args": [ + "--build", + "/${workspaceFolder}/build", + "--config", + "Debug", + "--target", + "all", + "-j", + "18", + "--" + ] + }, + { + "label": "Install", + "type": "shell", + 
"command": "cmake", + "args": [ + "--build", + "${workspaceFolder}/build", + "--config", + "Debug", + "--target", + "install", + "-j", + "18", + "--" + ] + }, + { + "label": "Move", + "type": "shell", + "command": "sudo", + "args": [ + "cp", + "-r", + "${workspaceFolder}/build/install/backends/python/*", + "/opt/tritonserver/backends/python" + ] + }, + { + "label": "Build Python Backend", + "dependsOrder": "sequence", + "dependsOn": [ + "Configure", + "Build", + "Install", + "Move" + ], + "group": { + "kind": "build", + "isDefault": true + } + } + ] +} diff --git a/README.md b/README.md index 7f9c7027..89b9213e 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,7 @@ any C++ code. - [Custom Metrics](#custom-metrics-1) - [Running with Inferentia](#running-with-inferentia) - [Logging](#logging) +- [Development with VSCode](#development-with-vscode) - [Reporting problems, asking questions](#reporting-problems-asking-questions) ## Quick Start @@ -1825,6 +1826,17 @@ def initialize(self, args): # Should print {'custom_key': {'string_value': 'custom_value'}} ``` +# Development with VSCode + +The repository includes a `.devcontainer` folder that contains a `Dockerfile` +and `devcontainer.json` file to help you develop the Python backend +using +[Visual Studio Code](https://code.visualstudio.com/docs/devcontainers/containers). + +In order to build the backend, you can execute the "Build Python Backend" task in the +[VSCode tasks](https://code.visualstudio.com/docs/editor/tasks). This will build +the Python backend and install the artifacts in +`/opt/tritonserver/backends/python`. # Reporting problems, asking questions From 9d2c513d41368d4932ea3e6207cbb248d5d8c9ee Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 8 May 2024 20:11:45 -0700 Subject: [PATCH 186/216] Add error handling in case of AutocompleteStub Failure for DLIS-5819 (#356) * DLIS-5819 * Guard WaitForStubProcess in case of failed auto-complete-config --- src/stub_launcher.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index 9dc2a64a..828228e6 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -787,7 +787,11 @@ StubLauncher::WaitForStubProcess() CloseHandle(stub_pid_.hThread); #else int status; - waitpid(stub_pid_, &status, 0); + if (stub_pid_ != 0) { + // Added this check to ensure server doesn't hang waiting after stub + // process has already be killed and cannot be waited on + waitpid(stub_pid_, &status, 0); + } #endif } From 27f04d10abb4e7d924ebb6ca4f97de923a2e4fa4 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Wed, 5 Jun 2024 17:53:58 -0700 Subject: [PATCH 187/216] Add support for response sender in the default mode (#364) * Add response sender to non-decoupled models and unify data pipelines (#360) * Add response sender to non-decoupled model and unify data pipelines * Rename variable and class name * Fix decoupled batch statistics to account for implicit batch size (#361) * Fix decoupled gpu output error handling (#362) * Fix decoupled gpu output error handling * Return full error string upon exception from model * Response sender to check for improper non-decoupled model usage (#363) * Response sender to check for improper non-decoupled model usage * Force close response sender on exception * Rename functions --- README.md | 6 + src/infer_request.cc | 19 +- src/infer_request.h | 5 +- src/pb_stub.cc | 255 ++++++------------ src/pb_stub.h | 7 +- src/python_be.cc | 570 
++--------------------------------------- src/python_be.h | 45 +--- src/response_sender.cc | 129 +++++++--- src/response_sender.h | 18 +- 9 files changed, 241 insertions(+), 813 deletions(-) diff --git a/README.md b/README.md index 89b9213e..30f2dd25 100644 --- a/README.md +++ b/README.md @@ -479,6 +479,12 @@ Upon return from the execute function all tensor data associated with the InferenceRequest objects passed to the function are deleted, and so InferenceRequest objects should not be retained by the Python model. +Starting from 24.06, models may choose to send the response using the +`InferenceResponseSender` as illustrated on [Decoupled mode](#decoupled-mode). +Since the model is in default mode, it must send exactly one response per +request. The `pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL` flag must be sent +either with the response or as a flag only response afterward. + #### Error Handling In case one of the requests has an error, you can use the `TritonError` object diff --git a/src/infer_request.cc b/src/infer_request.cc index 31182281..57ea6cf1 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -74,7 +74,7 @@ InferRequest::InferRequest( pb_cancel_ = std::make_shared(response_factory_address_, request_address_); response_sender_ = std::make_shared( - request_address_, response_factory_address_, + request_address_, response_factory_address_, nullptr /* is_decoupled */, Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_); #endif } @@ -272,7 +272,8 @@ InferRequest::SaveToSharedMemory(std::unique_ptr& shm_pool) std::unique_ptr InferRequest::LoadFromSharedMemory( std::unique_ptr& shm_pool, - bi::managed_external_buffer::handle_t request_handle, bool open_cuda_handle) + bi::managed_external_buffer::handle_t request_handle, bool open_cuda_handle, + bool const* is_model_decoupled) { AllocatedSharedMemory infer_request_shm = shm_pool->Load(request_handle); @@ -328,7 +329,7 @@ InferRequest::LoadFromSharedMemory( return std::unique_ptr(new InferRequest( infer_request_shm, request_id_shm, correlation_id_shm, requested_output_names_shm, model_name_shm, input_tensors, parameters_shm, - infer_trace_shm)); + infer_trace_shm, is_model_decoupled)); } InferRequest::InferRequest( @@ -339,7 +340,8 @@ InferRequest::InferRequest( std::unique_ptr& model_name_shm, std::vector>& input_tensors, std::unique_ptr& parameters_shm, - std::unique_ptr& infer_trace_shm) + std::unique_ptr& infer_trace_shm, + bool const* is_model_decoupled) : infer_request_shm_(std::move(infer_request_shm)), request_id_shm_(std::move(request_id_shm)), requested_output_names_shm_(std::move(requested_output_names_shm)), @@ -387,7 +389,7 @@ InferRequest::InferRequest( pb_cancel_ = std::make_shared(response_factory_address_, request_address_); response_sender_ = std::make_shared( - request_address_, response_factory_address_, + request_address_, response_factory_address_, is_model_decoupled, Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_); #endif } @@ -402,13 +404,6 @@ InferRequest::IsCancelled() std::shared_ptr InferRequest::GetResponseSender() { - std::unique_ptr& stub = Stub::GetOrCreateInstance(); - if (!stub->IsDecoupled()) { - throw PythonBackendException( - "'get_response_sender' function must be called only when the model is " - "using the decoupled transaction policy."); - } - return response_sender_; } diff --git a/src/infer_request.h b/src/infer_request.h index e0887624..c67e2fb0 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -118,7 +118,7 @@ class InferRequest { static 
std::unique_ptr LoadFromSharedMemory( std::unique_ptr& shm_pool, bi::managed_external_buffer::handle_t request_handle, - bool open_cuda_handle); + bool open_cuda_handle, bool const* is_model_decoupled); /// Disallow copying the inference request object. DISALLOW_COPY_AND_ASSIGN(InferRequest); @@ -135,7 +135,8 @@ class InferRequest { std::unique_ptr& model_name_shm, std::vector>& input_tensors, std::unique_ptr& parameters_shm, - std::unique_ptr& infer_trace_shm); + std::unique_ptr& infer_trace_shm, + bool const* is_model_decoupled); std::string request_id_; CorrelationId correlation_id_; diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 56d466f5..87410a70 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -402,11 +402,7 @@ Stub::RunCommand() shm_pool_->Load(ipc_message->Args()); RequestBatch* request_batch_shm_ptr = reinterpret_cast(request_batch.data_.get()); - if (!ipc_control_->decoupled) { - ProcessRequests(request_batch_shm_ptr); - } else { - ProcessRequestsDecoupled(request_batch_shm_ptr); - } + ProcessRequests(request_batch_shm_ptr); } break; case PYTHONSTUB_CommandType::PYTHONSTUB_FinalizeRequest: @@ -597,18 +593,6 @@ Stub::Initialize(bi::managed_external_buffer::handle_t map_handle) initialized_ = true; } -void -Stub::ProcessResponse(InferResponse* response) -{ - response->SaveToSharedMemory(shm_pool_, false /* copy_gpu */); - - for (auto& output_tensor : response->OutputTensors()) { - if (!output_tensor->IsCPU()) { - gpu_tensors_.push_back(output_tensor); - } - } -} - void Stub::LoadGPUBuffers(std::unique_ptr& ipc_message) { @@ -674,7 +658,8 @@ Stub::LoadRequestsFromSharedMemory(RequestBatch* request_batch_shm_ptr) for (size_t i = 0; i < batch_size; i++) { std::shared_ptr infer_request = InferRequest::LoadFromSharedMemory( - shm_pool_, request_shm_handle[i], true /* open_cuda_handle */); + shm_pool_, request_shm_handle[i], true /* open_cuda_handle */, + &ipc_control_->decoupled /* is_model_decoupled */); py_request_list.append(infer_request); } @@ -682,7 +667,7 @@ Stub::LoadRequestsFromSharedMemory(RequestBatch* request_batch_shm_ptr) } void -Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr) +Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) { py::list py_request_list = LoadRequestsFromSharedMemory(request_batch_shm_ptr); @@ -718,18 +703,21 @@ Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr) py::object execute_return = model_instance_.attr("execute")(py_request_list); + bool is_coroutine = py::module::import("asyncio") .attr("iscoroutine")(execute_return) .cast(); if (is_coroutine) { - RunCoroutine(execute_return); - } else { - if (!py::isinstance(execute_return)) { - throw PythonBackendException( - "Python model '" + name_ + - "' is using the decoupled mode and the execute function must " - "return None."); + if (IsDecoupled()) { + // Do not wait for async decoupled execute to return. 
+ RunCoroutine(execute_return, true /* in_background */); + } else { + py::object coroutine_return = + RunCoroutine(execute_return, false /* in_background */); + ProcessReturnedResponses(py_request_list, coroutine_return); } + } else { + ProcessReturnedResponses(py_request_list, execute_return); } } } @@ -748,160 +736,77 @@ Stub::ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr) "Failed to process the request(s) for model '" + name_ + "', message: ") + error_string; - LOG_INFO << err_message.c_str(); + LOG_ERROR << err_message.c_str(); response_batch_shm_ptr->has_error = true; - error_string_shm = PbString::Create(shm_pool_, error_string); + error_string_shm = PbString::Create(shm_pool_, err_message); response_batch_shm_ptr->error = error_string_shm->ShmHandle(); response_batch_shm_ptr->is_error_set = true; + // Once the error is sent to the backend, the backend is supposed to close + // all response factories if not already closed, so closing all response + // senders if not already closed to prevent the model from sending more + // responses after the factories are closed. + for (py::handle py_request : py_request_list) { + InferRequest* request = py_request.cast(); + request->GetResponseSender()->Close(); + } } } void -Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) +Stub::ProcessReturnedResponses( + py::list py_requests, py::object py_responses_obj) { - std::unique_ptr execute_response = - IPCMessage::Create(shm_pool_, false /* Inline response */); - execute_response->Command() = PYTHONSTUB_ExecuteResponse; - - AllocatedSharedMemory response_batch = shm_pool_->Construct( - request_batch_shm_ptr->batch_size * - sizeof(bi::managed_external_buffer::handle_t) + - sizeof(ResponseBatch)); - ResponseBatch* response_batch_shm_ptr = - reinterpret_cast(response_batch.data_.get()); - - std::unique_ptr error_string_shm; - py::list inference_responses; - - bi::managed_external_buffer::handle_t* responses_shm_handle = - reinterpret_cast( - response_batch.data_.get() + sizeof(ResponseBatch)); - - py::list responses; - - // Notifying the stub should be after responses. - ScopedDefer execute_finalize([this] { stub_message_queue_->Pop(); }); - ScopedDefer _( - [this, &execute_response] { SendIPCMessage(execute_response); }); - - execute_response->Args() = response_batch.handle_; - - bool has_exception = false; - std::string error_string; - try { - response_batch_shm_ptr->has_error = false; - response_batch_shm_ptr->is_error_set = false; - - uint32_t batch_size = request_batch_shm_ptr->batch_size; - - if (batch_size == 0) { - return; - } - - py::list py_request_list = - LoadRequestsFromSharedMemory(request_batch_shm_ptr); - - if (!py::hasattr(model_instance_, "execute")) { - std::string message = "Python model " + model_context_.PythonModelPath() + - " does not implement `execute` method."; - throw PythonBackendException(message); - } - - py::object request_list = py_request_list; - py::module asyncio = py::module::import("asyncio"); - - // Execute Response - py::object execute_return; - py::object responses_obj; - bool is_coroutine; - - { - NVTX_RANGE(nvtx_, "PyExecute " + name_); - execute_return = model_instance_.attr("execute")(request_list); - is_coroutine = asyncio.attr("iscoroutine")(execute_return).cast(); - } - - if (is_coroutine) { - responses_obj = asyncio.attr("run")(execute_return); - } else { - responses_obj = execute_return; - } - - // Check the return type of execute function. 
- if (!py::isinstance(responses_obj)) { - std::string str = py::str(execute_return.get_type()); - throw PythonBackendException( - std::string("Expected a list in the execute return, found type '") + - str + "'."); - } - - responses = responses_obj; - size_t response_size = py::len(responses); - - // If the number of request objects do not match the number of - // response objects throw an error. - if (response_size != batch_size) { - std::string err = - "Number of InferenceResponse objects do not match the number " - "of " - "InferenceRequest objects. InferenceRequest(s) size is:" + - std::to_string(batch_size) + ", and InferenceResponse(s) size is:" + - std::to_string(response_size) + "\n"; - throw PythonBackendException(err); - } - - for (size_t i = 0; i < response_size; i++) { - // Check the return type of execute function. - InferRequest* infer_request = py_request_list[i].cast(); - if (infer_request->ReleaseFlags() == - TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) { - if (!py::isinstance(responses[i])) { - // When the request is rescheduled in non-decoupled model, the - // response must be None. - std::string str = py::str(responses[i].get_type()); - throw PythonBackendException( - "Expected a None object in the execute function return list for " - "reschduled request, " - "found type '" + - str + "'."); - } - } else { - if (!py::isinstance(responses[i])) { - std::string str = py::str(responses[i].get_type()); - throw PythonBackendException( - std::string( - "Expected an 'InferenceResponse' object in the execute " - "function return list, found type '") + - str + "'."); - } - InferResponse* infer_response = responses[i].cast(); - infer_response->PruneOutputTensors( - infer_request->RequestedOutputNames()); - ProcessResponse(infer_response); - responses_shm_handle[i] = infer_response->ShmHandle(); - } - } - response_batch_shm_ptr->batch_size = response_size; + // Return if there is nothing to process. + if (py::isinstance(py_responses_obj)) { + return; } - catch (const PythonBackendException& pb_exception) { - has_exception = true; - error_string = pb_exception.what(); + // Only non-decoupled may return responses. + if (IsDecoupled()) { + throw PythonBackendException( + "Python model '" + name_ + + "' is using the decoupled mode and the execute function must return " + "None."); } - catch (const py::error_already_set& error) { - has_exception = true; - error_string = error.what(); + // Check responses is a list. + if (!py::isinstance(py_responses_obj)) { + throw PythonBackendException( + "Expected a list in the execute return, found type '" + + std::string(py::str(py_responses_obj.get_type())) + "'."); + } + py::list py_responses = py_responses_obj; + // Responses and requests length must match. + size_t requests_size = py::len(py_requests); + size_t responses_size = py::len(py_responses); + if (requests_size != responses_size) { + throw PythonBackendException( + "Number of InferenceResponse objects do not match the number of " + "InferenceRequest objects. 
InferenceRequest(s) size is:" + + std::to_string(requests_size) + ", and InferenceResponse(s) size is:" + + std::to_string(responses_size) + "\n"); } - if (has_exception) { - std::string err_message = - std::string( - "Failed to process the request(s) for model '" + name_ + - "', message: ") + - error_string; - error_string_shm = PbString::Create(shm_pool_, error_string); - response_batch_shm_ptr->has_error = true; - response_batch_shm_ptr->is_error_set = true; - response_batch_shm_ptr->error = error_string_shm->ShmHandle(); + for (size_t i = 0; i < responses_size; i++) { + if (!py::isinstance(py_responses[i])) { + InferRequest* request = py_requests[i].cast(); + // Response must be None if rescheduled. + if (request->ReleaseFlags() == TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) { + throw PythonBackendException( + "Expected a None object in the execute function return list for " + "reschduled request, found type '" + + std::string(py::str(py_responses[i].get_type())) + "'."); + } + // Send the response. + if (!py::isinstance(py_responses[i])) { + throw PythonBackendException( + "Expected an 'InferenceResponse' object in the execute function " + "return list, found type '" + + std::string(py::str(py_responses[i].get_type())) + "'."); + } + std::shared_ptr response = + py_responses[i].cast>(); + request->GetResponseSender()->Send( + response, TRITONSERVER_RESPONSE_COMPLETE_FINAL); + } } } @@ -923,15 +828,19 @@ Stub::GetAsyncEventLoop() return async_event_loop_; } -void -Stub::RunCoroutine(py::object coroutine) +py::object +Stub::RunCoroutine(py::object coroutine, bool in_background) { py::object loop = GetAsyncEventLoop(); py::object py_future = py::module_::import("asyncio").attr( "run_coroutine_threadsafe")(coroutine, loop); - py_future.attr("add_done_callback")( - py::module_::import("c_python_backend_utils") - .attr("async_event_future_done_callback")); + if (in_background) { + py_future.attr("add_done_callback")( + py::module_::import("c_python_backend_utils") + .attr("async_event_future_done_callback")); + return py::none(); + } + return py_future.attr("result")(); } void diff --git a/src/pb_stub.h b/src/pb_stub.h index c9462fd0..10e7606a 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -253,11 +253,12 @@ class Stub { /// Execute a batch of requests. 
void ProcessRequests(RequestBatch* request_batch_shm_ptr); - void ProcessRequestsDecoupled(RequestBatch* request_batch_shm_ptr); + void ProcessReturnedResponses( + py::list py_requests, py::object py_responses_obj); py::object GetAsyncEventLoop(); - void RunCoroutine(py::object coroutine); + py::object RunCoroutine(py::object coroutine, bool in_background); /// Get the memory manager message queue std::unique_ptr>& MemoryManagerQueue(); @@ -265,8 +266,6 @@ class Stub { /// Get the shared memory pool std::unique_ptr& ShmPool() { return shm_pool_; } - void ProcessResponse(InferResponse* response); - void ProcessBLSResponseDecoupled(std::unique_ptr& ipc_message); void LoadGPUBuffers(std::unique_ptr& ipc_message); diff --git a/src/python_be.cc b/src/python_be.cc index b95fb715..cd31e79e 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -153,124 +153,6 @@ ModelInstanceState::SetErrorForResponseSendMessage( } } -void -ModelInstanceState::SendMessageAndReceiveResponse( - bi::managed_external_buffer::handle_t message, - bi::managed_external_buffer::handle_t& response, bool& restart, - std::shared_ptr>& responses, - TRITONBACKEND_Request** requests, const uint32_t request_count) -{ - auto error = SendMessageToStub(message); - if (error != nullptr) { - restart = true; - RespondErrorToAllRequests( - TRITONSERVER_ErrorMessage(error), responses, requests, request_count); - - return; - } - - bi::managed_external_buffer::handle_t response_message; - error = Stub()->ReceiveMessageFromStub(response_message); - if (error != nullptr) { - restart = true; - RespondErrorToAllRequests( - TRITONSERVER_ErrorMessage(error), responses, requests, request_count); - - return; - } - - response = response_message; -} - -TRITONSERVER_Error* -ModelInstanceState::SendMessageToStub( - bi::managed_external_buffer::handle_t message) -{ - bool success = false; - while (!success) { - uint64_t timeout_miliseconds = 1000; - { - boost::posix_time::ptime timeout = - boost::get_system_time() + - boost::posix_time::milliseconds(timeout_miliseconds); - - bi::scoped_lock lock( - *(Stub()->HealthMutex()), timeout); - - // Check if lock has been acquired. - if (lock) { - Stub()->IpcControl()->stub_health = false; - } else { - // If it failed to obtain the lock, it means that the stub has been - // stuck or exited while holding the health mutex lock. 
- return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, "Failed to obtain the health mutex."); - } - } - - Stub()->StubMessageQueue()->Push( - message, timeout_miliseconds /* duration ms */, success); - - if (!success && !IsStubProcessAlive()) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, "Stub process is not healthy."); - } - } - - return nullptr; // success -} - -void -ModelInstanceState::RespondErrorToAllRequests( - const char* message, - std::shared_ptr>& responses, - TRITONBACKEND_Request** requests, const uint32_t request_count) -{ - for (uint32_t r = 0; r < request_count; ++r) { - if ((*responses)[r] == nullptr) - continue; - - std::string err_message = - std::string( - "Failed to process the request(s) for model instance '" + Name() + - "', message: ") + - message; - - TRITONSERVER_Error* err = - TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, err_message.c_str()); - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), - "failed sending response"); - - (*responses)[r] = nullptr; - TRITONSERVER_ErrorDelete(err); - } -} - -void -ModelInstanceState::WaitForBLSRequestsToFinish() -{ - futures_.clear(); -} - -bool -ModelInstanceState::IsStubProcessAlive() -{ - boost::posix_time::ptime timeout = - boost::get_system_time() + boost::posix_time::seconds(1); - bi::scoped_lock lock(*Stub()->HealthMutex(), timeout); - - // Check if lock has been acquired. - if (lock) { - return Stub()->IpcControl()->stub_health; - } else { - // If It failed to obtain the lock, it means that the stub has been - // stuck or exited while holding the health mutex lock. - return false; - } -} - TRITONSERVER_Error* ModelInstanceState::SaveRequestsToSharedMemory( TRITONBACKEND_Request** requests, const uint32_t request_count, @@ -408,24 +290,15 @@ ModelInstanceState::SaveRequestsToSharedMemory( request, &request_timeout)); std::unique_ptr infer_request; - if (model_state->IsDecoupled()) { - TRITONBACKEND_ResponseFactory* factory_ptr; - RETURN_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request)); - - infer_request = std::make_unique( - id, correlation_id, pb_input_tensors, requested_output_names, - model_state->Name(), model_state->Version(), parameters_string, flags, - request_timeout, reinterpret_cast(factory_ptr), - reinterpret_cast(request), - PreferredMemory(PreferredMemory::kDefault, 0), trace); - } else { - infer_request = std::make_unique( - id, correlation_id, pb_input_tensors, requested_output_names, - model_state->Name(), model_state->Version(), parameters_string, flags, - request_timeout, 0 /* response_factory_address */, - reinterpret_cast(request), - PreferredMemory(PreferredMemory::kDefault, 0), trace); - } + TRITONBACKEND_ResponseFactory* factory_ptr; + RETURN_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request)); + + infer_request = std::make_unique( + id, correlation_id, pb_input_tensors, requested_output_names, + model_state->Name(), model_state->Version(), parameters_string, flags, + request_timeout, reinterpret_cast(factory_ptr), + reinterpret_cast(request), + PreferredMemory(PreferredMemory::kDefault, 0), trace); RETURN_IF_EXCEPTION(infer_request->SaveToSharedMemory(Stub()->ShmPool())); requests_shm[r] = infer_request->ShmHandle(); pb_infer_requests.emplace_back(std::move(infer_request)); @@ -449,11 +322,8 @@ ModelInstanceState::LaunchStubProcess() thread_pool_ = std::make_unique( model_state->StateForBackend()->thread_pool_size); - if (model_state->IsDecoupled()) { - decoupled_thread_ = 
true; - decoupled_monitor_ = - std::thread(&ModelInstanceState::DecoupledMessageQueueMonitor, this); - } + queue_monitor_thread_ = true; + queue_monitor_ = std::thread(&ModelInstanceState::MessageQueueMonitor, this); request_executor_ = std::make_unique( Stub()->ShmPool(), model_state->TritonServer()); @@ -700,7 +570,8 @@ ModelInstanceState::ExecuteBLSRequest( reinterpret_cast( request_batch.data_.get() + sizeof(RequestBatch)); infer_request = InferRequest::LoadFromSharedMemory( - Stub()->ShmPool(), *request_handle, false /* open_cuda_handle */); + Stub()->ShmPool(), *request_handle, false /* open_cuda_handle */, + nullptr /* is_model_decoupled */); // If the BLS inputs are in GPU an additional round trip between the // stub process and the main process is required. The reason is that we @@ -806,9 +677,9 @@ ModelInstanceState::ExecuteBLSRequest( } void -ModelInstanceState::DecoupledMessageQueueMonitor() +ModelInstanceState::MessageQueueMonitor() { - while (decoupled_thread_) { + while (queue_monitor_thread_) { bi::managed_external_buffer::handle_t handle = Stub()->ParentMessageQueue()->Pop(); if (handle == DUMMY_MESSAGE) { @@ -1306,7 +1177,7 @@ ModelInstanceState::ResponseSendDecoupled( } TRITONSERVER_Error* -ModelInstanceState::ProcessRequestsDecoupled( +ModelInstanceState::ProcessRequests( TRITONBACKEND_Request** requests, const uint32_t request_count, std::vector>& pb_infer_requests, PbMetricReporter& reporter) @@ -1365,7 +1236,7 @@ ModelInstanceState::ProcessRequestsDecoupled( uint64_t compute_end_ns = 0; SET_TIMESTAMP(compute_end_ns); reporter.SetComputeEndNs(compute_end_ns); - reporter.SetBatchStatistics(request_count); + reporter.SetBatchStatistics(total_batch_size); if (response_batch.data_->has_error) { if (response_batch.data_->is_error_set) { @@ -1382,364 +1253,6 @@ ModelInstanceState::ProcessRequestsDecoupled( return nullptr; // success } -void -ModelInstanceState::ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector>& pb_infer_requests, - bool& restart) -{ - NVTX_RANGE(nvtx_, "ProcessRequests " + Name()); - ModelState* model_state = reinterpret_cast(Model()); - std::string name = model_state->Name(); - - LOG_MESSAGE( - TRITONSERVER_LOG_VERBOSE, - (std::string("model ") + model_state->Name() + ", instance " + Name() + - ", executing " + std::to_string(request_count) + " requests") - .c_str()); - - uint64_t exec_start_ns = 0; - SET_TIMESTAMP(exec_start_ns); - - // We take the responsibility of the responses. - std::shared_ptr> responses( - new std::vector()); - responses->reserve(request_count); - PbMetricReporter reporter( - TritonModelInstance(), requests, request_count, responses); - reporter.SetExecStartNs(exec_start_ns); - - for (size_t i = 0; i < request_count; i++) { - TRITONBACKEND_Response* response; - auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); - if (err == nullptr) { - responses->emplace_back(response); - } else { - responses->emplace_back(nullptr); - LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); - TRITONSERVER_ErrorDelete(err); - } - } - - size_t total_batch_size = 0; - RESPOND_ALL_AND_RETURN_IF_ERROR( - responses, request_count, - CheckIncomingRequests(requests, request_count, total_batch_size)); - - // No request to process - if (total_batch_size == 0) { - return; - } - - // Wait for all the pending BLS requests to be completed. 
- ScopedDefer bls_defer([this] { WaitForBLSRequestsToFinish(); }); - AllocatedSharedMemory request_batch; - RESPOND_ALL_AND_RETURN_IF_ERROR( - responses, request_count, - SaveRequestsToSharedMemory( - requests, request_count, pb_infer_requests, request_batch, - responses)); - - std::shared_ptr ipc_message = - IPCMessage::Create(Stub()->ShmPool(), false /*inline_response*/); - ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_ExecuteRequest; - ipc_message->Args() = request_batch.handle_; - - uint64_t compute_start_ns = 0; - SET_TIMESTAMP(compute_start_ns); - reporter.SetComputeStartNs(compute_start_ns); - - // This means that the stub process has exited and Python - // backend failed to restart the stub process. - if (!Stub()->StubActive()) { - const char* error_message = "The stub process has exited unexpectedly."; - RespondErrorToAllRequests( - error_message, responses, requests, request_count); - return; - } - - bi::managed_external_buffer::handle_t response_message; - { - NVTX_RANGE(nvtx_, "StubProcessing " + Name()); - SendMessageAndReceiveResponse( - ipc_message->ShmHandle(), response_message, restart, responses, - requests, request_count); - } - - ScopedDefer execute_finalize([this, &restart] { - // Push a dummy message to the message queue so that - // the stub process is notified that it can release - // the object stored in shared memory. - NVTX_RANGE(nvtx_, "RequestExecuteFinalize " + Name()); - if (!restart) - // Push a dummy message to signal the thread to terminate. - Stub()->StubMessageQueue()->Push(DUMMY_MESSAGE); - }); - if (restart) { - return; - } - - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - responses, request_count, - ipc_message = IPCMessage::LoadFromSharedMemory( - Stub()->ShmPool(), response_message)); - - // If the stub command is no longer PYTHONSTUB_InferExecRequest, it indicates - // that inference request execution has finished and there are no more BLS - // requests to execute. Otherwise, the Python backend will continuously - // execute BLS requests pushed to the message queue. - while (ipc_message->Command() == - PYTHONSTUB_CommandType::PYTHONSTUB_InferExecRequest || - ipc_message->Command() == - PYTHONSTUB_CommandType::PYTHONSTUB_InferStreamExecRequest) { - std::packaged_task task([this, ipc_message] { - ExecuteBLSRequest( - ipc_message, - (ipc_message->Command() == - PYTHONSTUB_CommandType::PYTHONSTUB_InferStreamExecRequest)); - }); - std::future future = - boost::asio::post(*thread_pool_, std::move(task)); - futures_.emplace_back(std::move(future)); - - auto error = Stub()->ReceiveMessageFromStub(response_message); - if (error != nullptr) { - restart = true; - RespondErrorToAllRequests( - TRITONSERVER_ErrorMessage(error), responses, requests, request_count); - return; - } - - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - responses, request_count, - ipc_message = IPCMessage::LoadFromSharedMemory( - Stub()->ShmPool(), response_message)); - } - - uint64_t compute_end_ns = 0; - SET_TIMESTAMP(compute_end_ns); - reporter.SetComputeEndNs(compute_end_ns); - - // Parsing the request response - AllocatedSharedMemory response_batch; - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - responses, request_count, - response_batch = Stub()->ShmPool()->Load(ipc_message->Args())); - - ResponseBatch* response_batch_shm_ptr = - reinterpret_cast(response_batch.data_.get()); - - // If inference fails, release all the requests and send an error response. 
- // If inference fails at this stage, it usually indicates a bug in the model - // code - if (response_batch_shm_ptr->has_error) { - if (response_batch_shm_ptr->is_error_set) { - std::unique_ptr error_message_shm; - RESPOND_ALL_AND_RETURN_IF_EXCEPTION( - responses, request_count, - error_message_shm = PbString::LoadFromSharedMemory( - Stub()->ShmPool(), response_batch_shm_ptr->error)); - RespondErrorToAllRequests( - error_message_shm->String().c_str(), responses, requests, - request_count); - } else { - const char* error_message = - "Failed to fetch the error in response batch."; - RespondErrorToAllRequests( - error_message, responses, requests, request_count); - } - - // Reset the release flags for all the requests. - for (auto& infer_request : pb_infer_requests) { - infer_request->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); - } - return; - } - - bi::managed_external_buffer::handle_t* response_shm_handle = - reinterpret_cast( - response_batch.data_.get() + sizeof(ResponseBatch)); - - // If the output provided by the model is in GPU, we will pass the list of - // buffers provided by Triton to the stub process. - bool has_gpu_output = false; - std::vector requires_deferred_callback; - - std::vector> shm_responses; - std::vector, void*>>> - gpu_output_buffers(request_count); - GPUBuffersHelper gpu_buffer_helper; - - for (uint32_t r = 0; r < request_count; ++r) { - NVTX_RANGE(nvtx_, "LoadingResponse " + Name()); - TRITONBACKEND_Response* response = (*responses)[r]; - TRITONBACKEND_Request* request = requests[r]; - uint32_t requested_output_count = 0; - requires_deferred_callback.push_back(false); - - shm_responses.emplace_back(nullptr); - std::unique_ptr& infer_response = shm_responses.back(); - try { - if (pb_infer_requests[r]->ReleaseFlags() == - TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) { - // For rescheduled requests, we do not need to send a response. - LOG_IF_ERROR( - TRITONBACKEND_ResponseDelete((*responses)[r]), - "failed to delete response"); - (*responses)[r] = nullptr; - continue; - } - infer_response = InferResponse::LoadFromSharedMemory( - Stub()->ShmPool(), response_shm_handle[r], - false /* open_cuda_handle */); - if (infer_response->HasError()) { - TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( - infer_response->Error()->Code(), - infer_response->Error()->Message().c_str()); - - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), - "failed sending response"); - TRITONSERVER_ErrorDelete(err); - (*responses)[r] = nullptr; - - // Reset the release flags for the request. - pb_infer_requests[r]->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); - - // If has_error is true, we do not look at the response tensors. - continue; - } - } - catch (const PythonBackendException& pb_exception) { - TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); - LOG_IF_ERROR( - TRITONBACKEND_ResponseSend( - (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), - "failed sending response"); - TRITONSERVER_ErrorDelete(err); - (*responses)[r] = nullptr; - - // Reset the release flags for the request. 
- pb_infer_requests[r]->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); - - continue; - } - - GUARDED_RESPOND_IF_ERROR( - responses, r, - TRITONBACKEND_RequestOutputCount(request, &requested_output_count)); - - std::set requested_output_names; - for (size_t j = 0; j < requested_output_count; ++j) { - const char* output_name; - GUARDED_RESPOND_IF_ERROR( - responses, r, - TRITONBACKEND_RequestOutputName(request, j, &output_name)); - requested_output_names.insert(output_name); - } - - bool require_deferred_callback = false; - -#ifdef TRITON_ENABLE_GPU - for (auto& output_tensor : infer_response->OutputTensors()) { - if (output_tensor->MemoryType() == TRITONSERVER_MEMORY_GPU) { - // Attempt to use the cuda shared memory pool for GPU tensor. - ShareCUDAMemoryPool(output_tensor->MemoryTypeId()); - } - } -#endif // TRITON_ENABLE_GPU - - gpu_output_buffers[r] = - std::vector, void*>>{}; - infer_response->Send( - response, CudaStream(), require_deferred_callback, - TRITONSERVER_RESPONSE_COMPLETE_FINAL, Stub()->ShmPool(), - gpu_buffer_helper, gpu_output_buffers[r], requested_output_names); - - requires_deferred_callback[r] = require_deferred_callback; - - if (requires_deferred_callback[r]) { - has_gpu_output = true; - } - } - - // Finalize the execute. - execute_finalize.Complete(); - - // If the output tensor is in GPU, there will be a second round trip - // required for filling the GPU buffers provided by the main process. - if (has_gpu_output) { - ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_LoadGPUBuffers; - gpu_buffer_helper.Complete(Stub()->ShmPool()); - ipc_message->Args() = gpu_buffer_helper.ShmHandle(); - SendMessageAndReceiveResponse( - ipc_message->ShmHandle(), response_message, restart, responses, - requests, 0); - - bool cuda_copy = false; - - uint32_t response_index = 0; - for (auto& gpu_output_buffer : gpu_output_buffers) { - for (auto& buffer_memory_pair : gpu_output_buffer) { - auto& pb_memory = buffer_memory_pair.first; - void* pointer = buffer_memory_pair.second; - bool cuda_used = false; - - if (pb_memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { - GUARDED_RESPOND_IF_ERROR( - responses, response_index, - CopyBuffer( - "Failed to copy the output tensor to buffer.", - TRITONSERVER_MEMORY_CPU, 0, TRITONSERVER_MEMORY_CPU, 0, - pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, - CudaStream(), &cuda_used)); - cuda_copy |= cuda_used; - } else if ( - (pb_memory->MemoryType() == TRITONSERVER_MEMORY_GPU) && - pb_memory->UseCUDASharedPool() && - (pb_memory->DataPtr() != pointer)) { - // If the data pointer from pb_memory is not the same as the pointer, - // it means that the Triton-provided buffer is not used during tensor - // transfer. Instead, an intermediate buffer that uses CUDA shared - // memory pool is used. In this case, we need to copy the data - // from the intermediate buffer back to the Triton-provided buffer. 
- GUARDED_RESPOND_IF_ERROR( - responses, response_index, - CopyBuffer( - "Failed to copy the output tensor to buffer.", - TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), - TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), - pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, - CudaStream(), &cuda_used)); - cuda_copy |= cuda_used; - } - } - response_index++; -#ifdef TRITON_ENABLE_GPU - if (cuda_copy) { - cudaStreamSynchronize(stream_); - } -#endif // TRITON_ENABLE_GPU - } - } - - bls_defer.Complete(); - for (uint32_t r = 0; r < request_count; ++r) { - if (requires_deferred_callback[r]) { - shm_responses[r]->DeferredSendCallback(); - } - } - - uint64_t exec_end_ns = 0; - SET_TIMESTAMP(exec_end_ns); - reporter.SetExecEndNs(exec_end_ns); - reporter.SetBatchStatistics(total_batch_size); - - return; -} - void ModelInstanceState::PrepareResponseBatch( ResponseBatch** response_batch, @@ -1873,18 +1386,13 @@ ModelInstanceState::ShareCUDAMemoryPool(const int32_t device_id) ModelInstanceState::~ModelInstanceState() { - ModelState* model_state = reinterpret_cast(Model()); Stub()->UpdateHealth(); if (Stub()->IsHealthy()) { - if (model_state->IsDecoupled()) { - // Wait for all the pending tasks to finish. - thread_pool_->wait(); - // Push a dummy message to signal the thread to terminate. - Stub()->ParentMessageQueue()->Push(DUMMY_MESSAGE); - decoupled_monitor_.join(); - } else { - thread_pool_->wait(); - } + // Wait for all the pending tasks to finish. + thread_pool_->wait(); + // Push a dummy message to signal the thread to terminate. + Stub()->ParentMessageQueue()->Push(DUMMY_MESSAGE); + queue_monitor_.join(); } // Terminate stub first to allow any last messages to be received by the back // end before deallocating the queue memory @@ -2445,36 +1953,10 @@ TRITONBACKEND_ModelInstanceExecute( // If restart is equal to true, it indicates that the stub process is // unhealthy and needs a restart. - bool restart = false; - ModelState* model_state = - reinterpret_cast(instance_state->Model()); - std::vector> infer_requests; - if (!model_state->IsDecoupled()) { - instance_state->ProcessRequests( - requests, request_count, infer_requests, restart); + // TODO: Implement restart on decoupled - if (restart) { - LOG_MESSAGE( - TRITONSERVER_LOG_ERROR, - "Stub process is unhealthy and it will be restarted."); - instance_state->TerminateMonitor(); - instance_state->Stub()->KillStubProcess(); - TRITONSERVER_Error* err = instance_state->Stub()->Setup(); - if (err == nullptr) { - instance_state->StartMonitor(); - } - LOG_IF_ERROR(err, "Failed to restart the stub process."); - err = instance_state->Stub()->Launch(); - LOG_IF_ERROR( - err, - "Failed to restart the stub process: failed to launch " - "the stub process."); - // Reset the release flags for all the requests. - for (auto& infer_request : infer_requests) { - infer_request->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); - } - } - } else { + std::vector> infer_requests; + { uint64_t exec_start_ns = 0; SET_TIMESTAMP(exec_start_ns); @@ -2483,7 +1965,7 @@ TRITONBACKEND_ModelInstanceExecute( nullptr); reporter.SetExecStartNs(exec_start_ns); - error = instance_state->ProcessRequestsDecoupled( + error = instance_state->ProcessRequests( requests, request_count, infer_requests, reporter); uint64_t exec_end_ns = 0; diff --git a/src/python_be.h b/src/python_be.h index 9618204c..59660fc4 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -1,4 +1,4 @@ -// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -287,9 +287,9 @@ class ModelInstanceState : public BackendModelInstance { std::thread stub_to_parent_queue_monitor_; bool stub_to_parent_thread_; - // Decoupled monitor thread - std::thread decoupled_monitor_; - bool decoupled_thread_; + // Queue monitor thread + std::thread queue_monitor_; + bool queue_monitor_thread_; std::mutex mu_; std::condition_variable cv_; std::unique_ptr received_message_; @@ -309,30 +309,12 @@ class ModelInstanceState : public BackendModelInstance { // Launch stub process. TRITONSERVER_Error* LaunchStubProcess(); - TRITONSERVER_Error* SendMessageToStub( - bi::managed_external_buffer::handle_t message); void ResponseSendDecoupled(std::shared_ptr response_send_message); - // Checks whether the stub process is live - bool IsStubProcessAlive(); - - // Get a message from the stub process - void SendMessageAndReceiveResponse( - bi::managed_external_buffer::handle_t message, - bi::managed_external_buffer::handle_t& response, bool& restart, - std::shared_ptr>& responses, - TRITONBACKEND_Request** requests, const uint32_t request_count); - - // Responds to all the requests with an error message. - void RespondErrorToAllRequests( - const char* message, - std::shared_ptr>& responses, - TRITONBACKEND_Request** requests, const uint32_t request_count); - - // In the decoupled mode, the parent message queue is monitored only by this - // function during the execute phase. No other thread should pop any message - // from the message queue in the decoupled mode. - void DecoupledMessageQueueMonitor(); + // The parent message queue is monitored only by this function during the + // execute phase. No other thread should pop any message from the message + // queue. + void MessageQueueMonitor(); // This function is executed on a separate thread and monitors the queue for // message sent from stub to parent process. @@ -347,14 +329,8 @@ class ModelInstanceState : public BackendModelInstance { TRITONBACKEND_Request* request, std::shared_ptr>& responses); - // Process all the requests obtained from Triton. - void ProcessRequests( - TRITONBACKEND_Request** requests, const uint32_t request_count, - std::vector>& pb_infer_requests, - bool& restart); - // Process all the requests in the decoupled mode. - TRITONSERVER_Error* ProcessRequestsDecoupled( + TRITONSERVER_Error* ProcessRequests( TRITONBACKEND_Request** requests, const uint32_t request_count, std::vector>& pb_infer_requests, PbMetricReporter& pb_metric_reporter); @@ -368,9 +344,6 @@ class ModelInstanceState : public BackendModelInstance { // Cleanup BLS responses void CleanupBLSResponses(); - // Wait for BLS requests to complete - void WaitForBLSRequestsToFinish(); - // Check the incoming requests for errors TRITONSERVER_Error* CheckIncomingRequests( TRITONBACKEND_Request** requests, const uint32_t request_count, diff --git a/src/response_sender.cc b/src/response_sender.cc index 94e3f0c8..74914ab4 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -1,4 +1,4 @@ -// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -35,13 +35,31 @@

 namespace triton { namespace backend { namespace python {

+void
+CheckResponseSenderArguments(
+    const std::shared_ptr<InferResponse>& response, const uint32_t flags)
+{
+  // Check the correctness of the provided flags.
+  if (flags != TRITONSERVER_RESPONSE_COMPLETE_FINAL && flags != 0) {
+    throw PythonBackendException(
+        "Unable to send response. Unsupported flag provided.");
+  }
+
+  if (flags == 0 && response == nullptr) {
+    throw PythonBackendException(
+        "Inference Response object must be provided when the response flags are "
+        "set to zero.");
+  }
+}
+
 ResponseSender::ResponseSender(
     intptr_t request_address, intptr_t response_factory_address,
-    std::unique_ptr<SharedMemoryManager>& shm_pool,
+    bool const* is_decoupled, std::unique_ptr<SharedMemoryManager>& shm_pool,
     const std::shared_ptr<PbCancel>& pb_cancel)
     : request_address_(request_address),
-      response_factory_address_(response_factory_address), shm_pool_(shm_pool),
-      closed_(false), pb_cancel_(pb_cancel)
+      response_factory_address_(response_factory_address),
+      is_decoupled_(is_decoupled), shm_pool_(shm_pool), pb_cancel_(pb_cancel),
+      closed_(false), number_of_response_sent_(0)
 {
 }

@@ -54,15 +72,32 @@ ResponseSender::~ResponseSender()
 }

 void
-ResponseSender::Send(
-    std::shared_ptr<InferResponse> infer_response, const uint32_t flags)
+ResponseSender::UpdateStateAndCounters(
+    const std::shared_ptr<InferResponse>& response, const uint32_t flags)
 {
-  // Release the GIL. This avoids a potential deadlock situation in the parent
-  // process, where every thread in the thread pool is indirectly waiting for a
-  // function in the stub process that acquires the GIL. Meanwhile, the current
-  // thread, which holds the GIL, is also waiting for the parent side to have
-  // the next available thread to pick up the job during resource contention.
-  py::gil_scoped_release release;
+  if (is_decoupled_ == nullptr) {
+    // TODO: Can a model access the response sender on a BLS infer request?
+    throw PythonBackendException(
+        "Unable to send response. Response sender has no reference to the "
+        "decoupled state of the model.");
+  }
+  bool is_decoupled = *is_decoupled_;
+
+  std::lock_guard<std::mutex> lk(mu_);
+
+  if (!is_decoupled) {
+    if (response != nullptr && number_of_response_sent_ > 0) {
+      throw PythonBackendException(
+          "Unable to send response. Non-decoupled model cannot send more than "
+          "one response.");
+    }
+    if (response == nullptr && flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL &&
+        number_of_response_sent_ == 0) {
+      throw PythonBackendException(
+          "Unable to send response. Non-decoupled model cannot send complete "
+          "final before sending a response.");
+    }
+  }

   if (closed_) {
     throw PythonBackendException(
@@ -72,18 +107,22 @@ ResponseSender::Send(
   if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) {
     closed_ = true;
   }
+  number_of_response_sent_++;
+}

-  // Check the correctness of the provided flags.
-  if (flags != TRITONSERVER_RESPONSE_COMPLETE_FINAL && flags != 0) {
-    throw PythonBackendException(
-        "Unable to send response. Unsupported flag provided.");
-  }
+void
+ResponseSender::Send(
+    std::shared_ptr<InferResponse> infer_response, const uint32_t flags)
+{
+  // Release the GIL. This avoids a potential deadlock situation in the parent
+  // process, where every thread in the thread pool is indirectly waiting for a
+  // function in the stub process that acquires the GIL.
Meanwhile, the current + // thread, which holds the GIL, is also waiting for the parent side to have + // the next available thread to pick up the job during resource contention. + py::gil_scoped_release release; - if (flags == 0 && infer_response == nullptr) { - throw PythonBackendException( - "Inference Response object must be provided when the response flags is " - "set to zero."); - } + CheckResponseSenderArguments(infer_response, flags); + UpdateStateAndCounters(infer_response, flags); std::unique_ptr& stub = Stub::GetOrCreateInstance(); @@ -147,9 +186,26 @@ ResponseSender::Send( } if (has_gpu_output) { + ScopedDefer _([send_message_payload] { + bi::scoped_lock guard{send_message_payload->mu}; + send_message_payload->is_stub_turn = false; + send_message_payload->cv.notify_one(); + while (!send_message_payload->is_stub_turn) { + // Wait for the stub process to send the response and populate error + // message if any. + send_message_payload->cv.wait(guard); + } + }); + AllocatedSharedMemory gpu_buffers_handle = shm_pool_->Load( send_message_payload->gpu_buffers_handle); + if (!gpu_buffers_handle.data_->success) { + std::unique_ptr error = PbString::LoadFromSharedMemory( + shm_pool_, gpu_buffers_handle.data_->error); + throw PythonBackendException( + "Failed to load GPU buffers: " + error->String()); + } AllocatedSharedMemory gpu_buffers_handle_shm = @@ -157,12 +213,11 @@ ResponseSender::Send( gpu_buffers_handle.data_->buffers); uint64_t gpu_buffer_count = gpu_buffers_handle.data_->buffer_count; if (gpu_tensors.size() != gpu_buffer_count) { - LOG_ERROR - << (std::string( - "GPU buffers size does not match the provided buffers: ") + - std::to_string(gpu_tensors.size()) + - " != " + std::to_string(gpu_buffer_count)); - return; + throw PythonBackendException( + std::string( + "GPU buffers size does not match the provided buffers: ") + + std::to_string(gpu_tensors.size()) + + " != " + std::to_string(gpu_buffer_count)); } std::vector> dst_buffers; @@ -175,17 +230,6 @@ ResponseSender::Send( std::shared_ptr& src_buffer = gpu_tensors[i]; PbMemory::CopyBuffer(dst_buffers[i], src_buffer->Memory()); } - - { - bi::scoped_lock guard{send_message_payload->mu}; - send_message_payload->is_stub_turn = false; - send_message_payload->cv.notify_one(); - while (!send_message_payload->is_stub_turn) { - // Wait for the stub process to send the response and populate error - // message if any. - send_message_payload->cv.wait(guard); - } - } } if (send_message_payload->has_error) { @@ -206,4 +250,11 @@ ResponseSender::IsCancelled() return pb_cancel_->IsCancelled(); } +void +ResponseSender::Close() +{ + std::lock_guard lk(mu_); + closed_ = true; +} + }}} // namespace triton::backend::python diff --git a/src/response_sender.h b/src/response_sender.h index d29a6ab6..1b57508e 100644 --- a/src/response_sender.h +++ b/src/response_sender.h @@ -1,4 +1,4 @@ -// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,6 +26,8 @@ #pragma once +#include + #include "infer_response.h" #include "pb_cancel.h" #include "shm_manager.h" @@ -36,17 +38,27 @@ class ResponseSender { public: ResponseSender( intptr_t request_address, intptr_t response_factory_address, - std::unique_ptr& shm_pool, + bool const* is_decoupled, std::unique_ptr& shm_pool, const std::shared_ptr& pb_cancel); ~ResponseSender(); void Send(std::shared_ptr response, const uint32_t flags); bool IsCancelled(); + // Can be useful at stopping the model from sending any more responses. + void Close(); + private: + void UpdateStateAndCounters( + const std::shared_ptr& response, const uint32_t flags); + intptr_t request_address_; intptr_t response_factory_address_; + bool const* is_decoupled_; std::unique_ptr& shm_pool_; - bool closed_; std::shared_ptr pb_cancel_; + + std::mutex mu_; + bool closed_; + size_t number_of_response_sent_; }; }}} // namespace triton::backend::python From ebc8c6cd5d9a04981b1d24dc9e5db9e2d5a81974 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Fri, 7 Jun 2024 16:01:24 -0700 Subject: [PATCH 188/216] fix: [precaution fix] Capture Python futures while running in the background (#365) * Capture futures while running in background * Scoped defer background future removal * Use pybind11 provided python set --- src/pb_stub.cc | 61 +++++++++++++++++++++++++++++++++----------------- src/pb_stub.h | 3 +++ 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 87410a70..2a6be556 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -107,27 +107,8 @@ PyDefaultArgumentToMutableType(const py::object& argument) void AsyncEventFutureDoneCallback(const py::object& py_future) { - // TODO: Why using `py_future.result()` with error hangs on exit? 
- try { - py::object exception = py_future.attr("exception")(); - if (!py::isinstance(exception)) { - std::string err_msg = ""; - py::object traceback = py::module_::import("traceback") - .attr("TracebackException") - .attr("from_exception")(exception) - .attr("format")(); - for (py::handle line : traceback) { - err_msg += py::str(line); - } - LOG_ERROR << err_msg; - } - } - catch (const PythonBackendException& pb_exception) { - LOG_ERROR << pb_exception.what(); - } - catch (const py::error_already_set& error) { - LOG_ERROR << error.what(); - } + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + stub->BackgroundFutureDone(py_future); } void @@ -556,6 +537,7 @@ Stub::Initialize(bi::managed_external_buffer::handle_t map_handle) c_python_backend_utils.attr("shared_memory") = py::cast(shm_pool_.get()); async_event_loop_ = py::none(); + background_futures_ = py::set(); py::object TritonPythonModel = sys.attr("TritonPythonModel"); deserialize_bytes_ = python_backend_utils.attr("deserialize_bytes_tensor"); @@ -838,11 +820,47 @@ Stub::RunCoroutine(py::object coroutine, bool in_background) py_future.attr("add_done_callback")( py::module_::import("c_python_backend_utils") .attr("async_event_future_done_callback")); + background_futures_.attr("add")(py_future); return py::none(); } return py_future.attr("result")(); } +void +Stub::BackgroundFutureDone(const py::object& py_future) +{ + ScopedDefer _([this, &py_future] { + // Remove future from background + try { + background_futures_.attr("remove")(py_future); + } + catch (const py::error_already_set& error) { + LOG_ERROR << "Cannot remove future from background; " << error.what(); + } + }); + // TODO: Why using `py_future.result()` with error hangs on exit? + try { + py::object exception = py_future.attr("exception")(); + if (!py::isinstance(exception)) { + std::string err_msg = ""; + py::object traceback = py::module_::import("traceback") + .attr("TracebackException") + .attr("from_exception")(exception) + .attr("format")(); + for (py::handle line : traceback) { + err_msg += py::str(line); + } + LOG_ERROR << err_msg; + } + } + catch (const PythonBackendException& pb_exception) { + LOG_ERROR << pb_exception.what(); + } + catch (const py::error_already_set& error) { + LOG_ERROR << error.what(); + } +} + void Stub::UpdateHealth() { @@ -923,6 +941,7 @@ Stub::~Stub() { py::gil_scoped_acquire acquire; async_event_loop_ = py::none(); + background_futures_ = py::none(); model_instance_ = py::none(); } stub_instance_.reset(); diff --git a/src/pb_stub.h b/src/pb_stub.h index 10e7606a..9ed74d9a 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -260,6 +260,8 @@ class Stub { py::object RunCoroutine(py::object coroutine, bool in_background); + void BackgroundFutureDone(const py::object& py_future); + /// Get the memory manager message queue std::unique_ptr>& MemoryManagerQueue(); @@ -367,6 +369,7 @@ class Stub { py::object deserialize_bytes_; py::object serialize_bytes_; py::object async_event_loop_; + py::object background_futures_; std::unique_ptr> stub_message_queue_; std::unique_ptr> From bfabfdbf4aa1e3db36aaf9e640b1ce5e0a720f48 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Wed, 12 Jun 2024 09:35:43 -0700 Subject: [PATCH 189/216] fix: Models should filter outputs based on requested outputs (#366) * Prune non requested outputs from non-decoupled models * Prune non requested outputs from decoupled models * [chore] Remove redundant copy --- src/infer_request.cc | 8 ++++---- src/response_sender.cc | 12 +++++++++--- 
src/response_sender.h | 5 ++++- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index 57ea6cf1..8a95b524 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -68,14 +68,13 @@ InferRequest::InferRequest( } } - inputs_ = inputs; - requested_output_names_ = requested_output_names; #ifdef TRITON_PB_STUB pb_cancel_ = std::make_shared(response_factory_address_, request_address_); response_sender_ = std::make_shared( request_address_, response_factory_address_, nullptr /* is_decoupled */, - Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_); + RequestedOutputNames(), Stub::GetOrCreateInstance()->SharedMemory(), + pb_cancel_); #endif } @@ -390,7 +389,8 @@ InferRequest::InferRequest( std::make_shared(response_factory_address_, request_address_); response_sender_ = std::make_shared( request_address_, response_factory_address_, is_model_decoupled, - Stub::GetOrCreateInstance()->SharedMemory(), pb_cancel_); + RequestedOutputNames(), Stub::GetOrCreateInstance()->SharedMemory(), + pb_cancel_); #endif } diff --git a/src/response_sender.cc b/src/response_sender.cc index 74914ab4..1831601f 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -54,12 +54,15 @@ CheckResponseSenderArguments( ResponseSender::ResponseSender( intptr_t request_address, intptr_t response_factory_address, - bool const* is_decoupled, std::unique_ptr& shm_pool, + bool const* is_decoupled, + const std::set& requested_output_names, + std::unique_ptr& shm_pool, const std::shared_ptr& pb_cancel) : request_address_(request_address), response_factory_address_(response_factory_address), - is_decoupled_(is_decoupled), shm_pool_(shm_pool), pb_cancel_(pb_cancel), - closed_(false), number_of_response_sent_(0) + is_decoupled_(is_decoupled), + requested_output_names_(requested_output_names), shm_pool_(shm_pool), + pb_cancel_(pb_cancel), closed_(false), number_of_response_sent_(0) { } @@ -123,6 +126,9 @@ ResponseSender::Send( CheckResponseSenderArguments(infer_response, flags); UpdateStateAndCounters(infer_response, flags); + if (infer_response) { + infer_response->PruneOutputTensors(requested_output_names_); + } std::unique_ptr& stub = Stub::GetOrCreateInstance(); diff --git a/src/response_sender.h b/src/response_sender.h index 1b57508e..f274f5b4 100644 --- a/src/response_sender.h +++ b/src/response_sender.h @@ -38,7 +38,9 @@ class ResponseSender { public: ResponseSender( intptr_t request_address, intptr_t response_factory_address, - bool const* is_decoupled, std::unique_ptr& shm_pool, + bool const* is_decoupled, + const std::set& requested_output_names, + std::unique_ptr& shm_pool, const std::shared_ptr& pb_cancel); ~ResponseSender(); void Send(std::shared_ptr response, const uint32_t flags); @@ -54,6 +56,7 @@ class ResponseSender { intptr_t request_address_; intptr_t response_factory_address_; bool const* is_decoupled_; + std::set requested_output_names_; std::unique_ptr& shm_pool_; std::shared_ptr pb_cancel_; From c8b188f26a4e80c7204baaf73e27f11c33f52f57 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 14 Jun 2024 22:53:49 -0700 Subject: [PATCH 190/216] Add windows typedef for ssize_t (#368) --- src/pb_tensor.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc index d9d47784..0915c1d9 100644 --- a/src/pb_tensor.cc +++ b/src/pb_tensor.cc @@ -35,6 +35,11 @@ namespace py = pybind11; #endif #include "pb_tensor.h" +// WAR for undefined ssize_t on Windows: https://stackoverflow.com/a/35368387 +#if 
defined(_MSC_VER) +#include +typedef SSIZE_T ssize_t; +#endif namespace triton { namespace backend { namespace python { From 2b12abeba3e612633483093dcfc09a771bcedfaa Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 30 Jul 2024 14:14:06 -0700 Subject: [PATCH 191/216] feat: Add BF16 tensor support via dlpack (#371) --- README.md | 4 ++++ src/pb_stub_utils.cc | 17 ++++++++++++++++- src/pb_tensor.cc | 24 ++++++++++++++++++------ 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 30f2dd25..eee6af39 100644 --- a/README.md +++ b/README.md @@ -1557,6 +1557,10 @@ input0 = pb_utils.Tensor.from_dlpack("INPUT0", pytorch_tensor) This method only supports contiguous Tensors that are in C-order. If the tensor is not C-order contiguous an exception will be raised. +For python models with input or output tensors of type BFloat16 (BF16), the +`as_numpy()` method is not supported, and the `from_dlpack` and `to_dlpack` +methods must be used instead. + ## `pb_utils.Tensor.is_cpu() -> bool` This function can be used to check whether a tensor is placed in CPU or not. diff --git a/src/pb_stub_utils.cc b/src/pb_stub_utils.cc index c9ffd661..9e05feae 100644 --- a/src/pb_stub_utils.cc +++ b/src/pb_stub_utils.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -168,6 +168,8 @@ triton_to_pybind_dtype(TRITONSERVER_DataType data_type) dtype_numpy = py::dtype(py::format_descriptor::format()); break; case TRITONSERVER_TYPE_BF16: + // NOTE: Currently skipping this call via `if (BF16)` check, but may + // want to better handle this or set some default/invalid dtype. throw PythonBackendException("TYPE_BF16 not currently supported."); case TRITONSERVER_TYPE_INVALID: throw PythonBackendException("Dtype is invalid."); @@ -240,6 +242,10 @@ triton_to_dlpack_type(TRITONSERVER_DataType triton_dtype) case TRITONSERVER_TYPE_BYTES: throw PythonBackendException( "TYPE_BYTES tensors cannot be converted to DLPack."); + case TRITONSERVER_TYPE_BF16: + dl_code = DLDataTypeCode::kDLBfloat; + dt_size = 16; + break; default: throw PythonBackendException( @@ -301,6 +307,15 @@ dlpack_to_triton_type(const DLDataType& data_type) } } + if (data_type.code == DLDataTypeCode::kDLBfloat) { + if (data_type.bits != 16) { + throw PythonBackendException( + "Expected BF16 tensor to have 16 bits, but had: " + + std::to_string(data_type.bits)); + } + return TRITONSERVER_TYPE_BF16; + } + return TRITONSERVER_TYPE_INVALID; } }}} // namespace triton::backend::python diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc index 0915c1d9..1ab95144 100644 --- a/src/pb_tensor.cc +++ b/src/pb_tensor.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -152,7 +152,10 @@ PbTensor::PbTensor( #ifdef TRITON_PB_STUB if (memory_type_ == TRITONSERVER_MEMORY_CPU || memory_type_ == TRITONSERVER_MEMORY_CPU_PINNED) { - if (dtype != TRITONSERVER_TYPE_BYTES) { + if (dtype == TRITONSERVER_TYPE_BF16) { + // No native numpy representation for BF16. DLPack should be used instead. 
+ numpy_array_ = py::none(); + } else if (dtype != TRITONSERVER_TYPE_BYTES) { py::object numpy_array = py::array(triton_to_pybind_dtype(dtype_), dims_, (void*)memory_ptr_); numpy_array_ = numpy_array.attr("view")(triton_to_numpy_type(dtype_)); @@ -512,12 +515,18 @@ PbTensor::Name() const const py::array* PbTensor::AsNumpy() const { - if (IsCPU()) { - return &numpy_array_; - } else { + if (!IsCPU()) { throw PythonBackendException( "Tensor is stored in GPU and cannot be converted to NumPy."); } + + if (dtype_ == TRITONSERVER_TYPE_BF16) { + throw PythonBackendException( + "Tensor dtype is BF16 and cannot be converted to NumPy. Use " + "to_dlpack() and from_dlpack() instead."); + } + + return &numpy_array_; } #endif // TRITON_PB_STUB @@ -643,7 +652,10 @@ PbTensor::PbTensor( #ifdef TRITON_PB_STUB if (memory_type_ == TRITONSERVER_MEMORY_CPU || memory_type_ == TRITONSERVER_MEMORY_CPU_PINNED) { - if (dtype_ != TRITONSERVER_TYPE_BYTES) { + if (dtype_ == TRITONSERVER_TYPE_BF16) { + // No native numpy representation for BF16. DLPack should be used instead. + numpy_array_ = py::none(); + } else if (dtype_ != TRITONSERVER_TYPE_BYTES) { py::object numpy_array = py::array(triton_to_pybind_dtype(dtype_), dims_, (void*)memory_ptr_); numpy_array_ = numpy_array.attr("view")(triton_to_numpy_type(dtype_)); From 4d469a904f34440d2ba90f775088ad4637b46c0c Mon Sep 17 00:00:00 2001 From: Yingge He <157551214+yinggeh@users.noreply.github.com> Date: Wed, 31 Jul 2024 09:02:29 -0700 Subject: [PATCH 192/216] refactor: Refactor string input checks (#370) Refactor string input tensor checks --- src/python_be.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/python_be.cc b/src/python_be.cc index cd31e79e..2212176d 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -424,6 +424,15 @@ ModelInstanceState::GetInputTensor( RETURN_IF_ERROR(backend::ReadInputTensor( request, input_name, input_buffer, &byte_size)); } + + if (input_dtype == TRITONSERVER_TYPE_BYTES) { + const char* content = reinterpret_cast(input_tensor->DataPtr()); + size_t content_byte_size = input_tensor->ByteSize(); + const size_t request_element_cnt = GetElementCount(input_tensor->Dims()); + RETURN_IF_ERROR(ValidateStringBuffer( + content, content_byte_size, request_element_cnt, input_name, + nullptr /* str_list */)); + } } else { #ifdef TRITON_ENABLE_GPU // Attempt to use the cuda shared memory pool for GPU tensor. 
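The README change in the BF16 patch above states the rule (use `from_dlpack` and `to_dlpack`, never `as_numpy()`) without showing it in context. The following is a minimal sketch of a model `execute` function that follows that rule; it is not part of these patches, and it assumes PyTorch is installed in the stub's Python environment and that the model config declares a BF16 input named `INPUT0` and a BF16 output named `OUTPUT0`.

```python
import torch
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        responses = []
        for request in requests:
            # "INPUT0"/"OUTPUT0" are assumed tensor names from the model config.
            in0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            # as_numpy() is unsupported for BF16, so move the data over DLPack.
            torch_in0 = torch.from_dlpack(in0.to_dlpack())
            torch_out0 = torch_in0 * 2  # placeholder BF16 computation
            # from_dlpack accepts the PyTorch tensor directly; the tensor must
            # be contiguous and in C order.
            out0 = pb_utils.Tensor.from_dlpack("OUTPUT0", torch_out0)
            responses.append(pb_utils.InferenceResponse(output_tensors=[out0]))
        return responses
```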
From 2203a5bcb729b56fc56ef1b3b77e527e0e9faa93 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Tue, 6 Aug 2024 16:40:00 -0700 Subject: [PATCH 193/216] Delete response factory after sending complete final (#373) --- src/response_sender.cc | 24 +++++++++++++++++++----- src/response_sender.h | 4 ++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/response_sender.cc b/src/response_sender.cc index 1831601f..0a88fb6b 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -62,16 +62,14 @@ ResponseSender::ResponseSender( response_factory_address_(response_factory_address), is_decoupled_(is_decoupled), requested_output_names_(requested_output_names), shm_pool_(shm_pool), - pb_cancel_(pb_cancel), closed_(false), number_of_response_sent_(0) + pb_cancel_(pb_cancel), closed_(false), number_of_response_sent_(0), + response_factory_deleted_(false) { } ResponseSender::~ResponseSender() { - std::unique_ptr& stub = Stub::GetOrCreateInstance(); - stub->EnqueueCleanupId( - reinterpret_cast(response_factory_address_), - PYTHONSTUB_DecoupledResponseFactoryCleanup); + DeleteResponseFactory(); } void @@ -248,6 +246,10 @@ ResponseSender::Send( "An error occurred while sending a response."); } } + + if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { + DeleteResponseFactory(); + } } bool @@ -263,4 +265,16 @@ ResponseSender::Close() closed_ = true; } +void +ResponseSender::DeleteResponseFactory() +{ + bool already_deleted = response_factory_deleted_.exchange(true); + if (!already_deleted) { + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + stub->EnqueueCleanupId( + reinterpret_cast(response_factory_address_), + PYTHONSTUB_DecoupledResponseFactoryCleanup); + } +} + }}} // namespace triton::backend::python diff --git a/src/response_sender.h b/src/response_sender.h index f274f5b4..69f416c2 100644 --- a/src/response_sender.h +++ b/src/response_sender.h @@ -26,6 +26,7 @@ #pragma once +#include #include #include "infer_response.h" @@ -52,6 +53,7 @@ class ResponseSender { private: void UpdateStateAndCounters( const std::shared_ptr& response, const uint32_t flags); + void DeleteResponseFactory(); intptr_t request_address_; intptr_t response_factory_address_; @@ -63,5 +65,7 @@ class ResponseSender { std::mutex mu_; bool closed_; size_t number_of_response_sent_; + + std::atomic response_factory_deleted_; }; }}} // namespace triton::backend::python From 1393d6e1866c28a051253a08cf7c928bcbd1cad3 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Wed, 7 Aug 2024 10:57:10 -0700 Subject: [PATCH 194/216] Release GIL during cancellation check (#372) --- src/pb_cancel.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/pb_cancel.cc b/src/pb_cancel.cc index 4c9b926b..0774261d 100644 --- a/src/pb_cancel.cc +++ b/src/pb_cancel.cc @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -57,6 +57,9 @@ PbCancel::ShmPayload() bool PbCancel::IsCancelled() { + // Release the GIL. Python objects are not accessed during the check. + py::gil_scoped_release gil_release; + std::unique_lock lk(mu_); // The cancelled flag can only move from false to true, not the other way, so // it is checked on each query until cancelled and then implicitly cached. 
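The two patches above change behavior that is easiest to see from the model side: `is_cancelled()` now releases the GIL while the cancellation state is checked, and sending `TRITONSERVER_RESPONSE_COMPLETE_FINAL` now also releases the response factory held by the sender. The sketch below is a decoupled model exercising both; it is not taken from these patches, and the output name `OUTPUT0`, the three-iteration loop, and a model configured with the decoupled transaction policy are assumptions made only for illustration.

```python
import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        for request in requests:
            sender = request.get_response_sender()
            for i in range(3):  # placeholder for a real streaming loop
                # Cancellation check; after the patch above this call releases
                # the GIL while it queries the backend.
                if sender.is_cancelled():
                    break
                out = pb_utils.Tensor("OUTPUT0", np.array([i], dtype=np.float32))
                sender.send(pb_utils.InferenceResponse(output_tensors=[out]))
            # Closing the stream; the final flag also triggers the response
            # factory cleanup added above.
            sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
        return None
```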
From 958c8c9c54ddf3350520e53a71c9b2369f67011f Mon Sep 17 00:00:00 2001 From: Yingge He <157551214+yinggeh@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:23:04 -0700 Subject: [PATCH 195/216] feat: Add new histogram metric type (#374) --- README.md | 6 +-- src/ipc_message.h | 3 +- src/metric.cc | 106 +++++++++++++++++++++++++++++++++++++++---- src/metric.h | 26 +++++++++-- src/metric_family.cc | 32 +++++++++++-- src/metric_family.h | 11 +++-- src/pb_stub.cc | 6 ++- src/pb_utils.h | 4 +- src/python_be.cc | 4 +- 9 files changed, 169 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index eee6af39..913034a8 100644 --- a/README.md +++ b/README.md @@ -1656,12 +1656,12 @@ import triton_python_backend_utils as pb_utils class TritonPythonModel: def initialize(self, args): # Create a MetricFamily object to report the latency of the model - # execution. The 'kind' parameter must be either 'COUNTER' or - # 'GAUGE'. + # execution. The 'kind' parameter must be either 'COUNTER', + # 'GAUGE' or 'HISTOGRAM'. self.metric_family = pb_utils.MetricFamily( name="preprocess_latency_ns", description="Cumulative time spent pre-processing requests", - kind=pb_utils.MetricFamily.COUNTER # or pb_utils.MetricFamily.GAUGE + kind=pb_utils.MetricFamily.COUNTER ) # Create a Metric object under the MetricFamily object. The 'labels' diff --git a/src/ipc_message.h b/src/ipc_message.h index ac28238c..8e762b8f 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -63,6 +63,7 @@ typedef enum PYTHONSTUB_commandtype_enum { PYTHONSTUB_MetricRequestValue, PYTHONSTUB_MetricRequestIncrement, PYTHONSTUB_MetricRequestSet, + PYTHONSTUB_MetricRequestObserve, PYTHONSTUB_LoadModelRequest, PYTHONSTUB_UnloadModelRequest, PYTHONSTUB_ModelReadinessRequest, diff --git a/src/metric.cc b/src/metric.cc index f67c55bf..7796b161 100644 --- a/src/metric.cc +++ b/src/metric.cc @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -32,9 +32,12 @@ namespace triton { namespace backend { namespace python { -Metric::Metric(const std::string& labels, void* metric_family_address) - : labels_(labels), operation_value_(0), metric_address_(nullptr), - metric_family_address_(metric_family_address), is_cleared_(false) +Metric::Metric( + const std::string& labels, std::optional> buckets, + void* metric_family_address) + : labels_(labels), buckets_(buckets), operation_value_(0), + metric_address_(nullptr), metric_family_address_(metric_family_address), + is_cleared_(false) { #ifdef TRITON_PB_STUB SendCreateMetricRequest(); @@ -62,6 +65,20 @@ Metric::SaveToSharedMemory(std::unique_ptr& shm_pool) custom_metric_shm_ptr_->metric_family_address = metric_family_address_; custom_metric_shm_ptr_->metric_address = metric_address_; + // Histogram specific case + if (buckets_.has_value()) { + auto buckets_size = buckets_.value().size() * sizeof(double); + std::unique_ptr buckets_shm = PbMemory::Create( + shm_pool, TRITONSERVER_MemoryType::TRITONSERVER_MEMORY_CPU, 0, + buckets_size, reinterpret_cast(buckets_.value().data()), + false /* copy_gpu */); + custom_metric_shm_ptr_->buckets_shm_handle = buckets_shm->ShmHandle(); + buckets_shm_ = std::move(buckets_shm); + } else { + custom_metric_shm_ptr_->buckets_shm_handle = 0; + buckets_shm_ = nullptr; + } + // Save the references to shared memory. custom_metric_shm_ = std::move(custom_metric_shm); labels_shm_ = std::move(labels_shm); @@ -80,17 +97,40 @@ Metric::LoadFromSharedMemory( std::unique_ptr labels_shm = PbString::LoadFromSharedMemory( shm_pool, custom_metric_shm_ptr->labels_shm_handle); - return std::unique_ptr(new Metric(custom_metric_shm, labels_shm)); + std::unique_ptr buckets_shm = nullptr; + if (custom_metric_shm_ptr->buckets_shm_handle != 0) { + buckets_shm = PbMemory::LoadFromSharedMemory( + shm_pool, custom_metric_shm_ptr->buckets_shm_handle, + false /* open_cuda_handle */); + } + + return std::unique_ptr( + new Metric(custom_metric_shm, labels_shm, buckets_shm)); } Metric::Metric( AllocatedSharedMemory& custom_metric_shm, - std::unique_ptr& labels_shm) + std::unique_ptr& labels_shm, + std::unique_ptr& buckets_shm) : custom_metric_shm_(std::move(custom_metric_shm)), - labels_shm_(std::move(labels_shm)) + labels_shm_(std::move(labels_shm)), buckets_shm_(std::move(buckets_shm)) { custom_metric_shm_ptr_ = custom_metric_shm_.data_.get(); + + // FIXME: This constructor is called during each + // set/increment/observe/get_value call. It only needs the pointers. 
labels_ = labels_shm_->String(); + if (buckets_shm_ != nullptr) { // Histogram + size_t bucket_size = buckets_shm_->ByteSize() / sizeof(double); + std::vector buckets; + buckets.reserve(bucket_size); + for (size_t i = 0; i < bucket_size; ++i) { + buckets.emplace_back( + reinterpret_cast(buckets_shm_->DataPtr())[i]); + } + buckets_ = std::move(buckets); + } + operation_value_ = custom_metric_shm_ptr_->operation_value; metric_family_address_ = custom_metric_shm_ptr_->metric_family_address; metric_address_ = custom_metric_shm_ptr_->metric_address; @@ -161,6 +201,24 @@ Metric::SendSetValueRequest(const double& value) } } +void +Metric::SendObserveRequest(const double& value) +{ + try { + CheckIfCleared(); + std::unique_ptr& stub = Stub::GetOrCreateInstance(); + operation_value_ = value; + SaveToSharedMemory(stub->ShmPool()); + AllocatedSharedMemory custom_metrics_shm; + stub->SendMessage( + custom_metrics_shm, PYTHONSTUB_MetricRequestObserve, shm_handle_); + } + catch (const PythonBackendException& pb_exception) { + throw PythonBackendException( + "Failed to observe metric value: " + std::string(pb_exception.what())); + } +} + double Metric::SendGetValueRequest() { @@ -222,14 +280,35 @@ Metric::InitializeTritonMetric() { std::vector labels_params; ParseLabels(labels_params, labels_); + TRITONSERVER_MetricKind kind; + THROW_IF_TRITON_ERROR(TRITONSERVER_GetMetricFamilyKind( + reinterpret_cast(metric_family_address_), + &kind)); + TRITONSERVER_MetricArgs* args = nullptr; + switch (kind) { + case TRITONSERVER_METRIC_KIND_COUNTER: + case TRITONSERVER_METRIC_KIND_GAUGE: + break; + case TRITONSERVER_METRIC_KIND_HISTOGRAM: { + const std::vector& buckets = buckets_.value(); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricArgsNew(&args)); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricArgsSetHistogram( + args, buckets.data(), buckets.size())); + break; + } + default: + break; + } + TRITONSERVER_Metric* triton_metric = nullptr; - THROW_IF_TRITON_ERROR(TRITONSERVER_MetricNew( + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricNewWithArgs( &triton_metric, reinterpret_cast(metric_family_address_), - labels_params.data(), labels_params.size())); + labels_params.data(), labels_params.size(), args)); for (const auto label : labels_params) { TRITONSERVER_ParameterDelete(const_cast(label)); } + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricArgsDelete(args)); return reinterpret_cast(triton_metric); } @@ -262,6 +341,8 @@ Metric::HandleMetricOperation( Increment(operation_value_); } else if (command_type == PYTHONSTUB_MetricRequestSet) { SetValue(operation_value_); + } else if (command_type == PYTHONSTUB_MetricRequestObserve) { + Observe(operation_value_); } else { throw PythonBackendException("Unknown metric operation"); } @@ -281,6 +362,13 @@ Metric::SetValue(const double& value) THROW_IF_TRITON_ERROR(TRITONSERVER_MetricSet(triton_metric, value)); } +void +Metric::Observe(const double& value) +{ + auto triton_metric = reinterpret_cast(metric_address_); + THROW_IF_TRITON_ERROR(TRITONSERVER_MetricObserve(triton_metric, value)); +} + double Metric::GetValue() { diff --git a/src/metric.h b/src/metric.h index 197e8ce9..cd54ca54 100644 --- a/src/metric.h +++ b/src/metric.h @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,9 +26,11 @@ #pragma once +#include #include #include "ipc_message.h" +#include "pb_memory.h" #include "pb_string.h" #include "pb_utils.h" @@ -47,6 +49,8 @@ namespace triton { namespace backend { namespace python { struct MetricShm { // The shared memory handle of the labels in PbString format. bi::managed_external_buffer::handle_t labels_shm_handle; + // The shared memory handle of the buckets in PbMemory format. + bi::managed_external_buffer::handle_t buckets_shm_handle; // The value used for incrementing or setting the metric. double operation_value; // The address of the TRITONSERVER_Metric object. @@ -58,7 +62,10 @@ struct MetricShm { class Metric { public: - Metric(const std::string& labels, void* metric_family_address); + Metric( + const std::string& labels, + std::optional> buckets, + void* metric_family_address); ~Metric(); @@ -97,6 +104,10 @@ class Metric { /// \param value The value to set the metric to. void SendSetValueRequest(const double& value); + /// Send the request to the parent process to observe the value to the metric. + /// \param value The value to set the metric to. + void SendObserveRequest(const double& value); + /// Send the request to the parent process to get the value of the metric. /// \return Returns the value of the metric. double SendGetValueRequest(); @@ -132,6 +143,10 @@ class Metric { /// \param value The value to set the metric to. void SetValue(const double& value); + /// Use Triton C API to sample the observation to the metric. + /// \param value The value to sample observation to the metric. + void Observe(const double& value); + /// Use Triton C API to get the value of the metric. double GetValue(); @@ -146,10 +161,14 @@ class Metric { // The private constructor for creating a Metric object from shared memory. Metric( AllocatedSharedMemory& custom_metric_shm, - std::unique_ptr& labels_shm); + std::unique_ptr& labels_shm, + std::unique_ptr& buckets); // The labels of the metric, which is the identifier of the metric. std::string labels_; + // Monotonically increasing values representing bucket boundaries for creating + // histogram metric. + std::optional> buckets_; // The value used for incrementing or setting the metric. double operation_value_; // The address of the TRITONSERVER_Metric object. @@ -168,6 +187,7 @@ class Metric { MetricShm* custom_metric_shm_ptr_; bi::managed_external_buffer::handle_t shm_handle_; std::unique_ptr labels_shm_; + std::unique_ptr buckets_shm_; }; }}}; // namespace triton::backend::python diff --git a/src/metric_family.cc b/src/metric_family.cc index 77e8aedf..222a0e23 100644 --- a/src/metric_family.cc +++ b/src/metric_family.cc @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -166,19 +166,39 @@ MetricFamily::SendCreateMetricFamilyRequest() } std::shared_ptr -MetricFamily::CreateMetric(const py::object& labels) +MetricFamily::CreateMetric(const py::object& labels, const py::object& buckets) { if (!labels.is_none()) { if (!py::isinstance(labels)) { throw PythonBackendException( - "Failed to create metric. Labels must be a " - "dictionary."); + "Failed to create metric. 
Labels must be a dictionary."); } } py::module json = py::module_::import("json"); std::string labels_str = std::string(py::str(json.attr("dumps")(labels))); - auto metric = std::make_shared(labels_str, metric_family_address_); + + std::optional> buckets_vec; + if (!buckets.is_none()) { + if (!py::isinstance(buckets)) { + throw PythonBackendException( + "Failed to create metric. Buckets must be a list."); + } + if (kind_ == kCounter || kind_ == kGauge) { + throw PythonBackendException( + "Failed to create metric. Unexpected buckets found."); + } + buckets_vec = buckets.cast>(); + } else { + if (kind_ == kHistogram) { + throw PythonBackendException( + "Failed to create metric. Missing required buckets."); + } + buckets_vec = std::nullopt; + } + + auto metric = + std::make_shared(labels_str, buckets_vec, metric_family_address_); { std::lock_guard lock(metric_map_mu_); metric_map_.insert({metric->MetricAddress(), metric}); @@ -205,6 +225,8 @@ MetricFamily::ToTritonServerMetricKind(const MetricKind& kind) return TRITONSERVER_METRIC_KIND_COUNTER; case kGauge: return TRITONSERVER_METRIC_KIND_GAUGE; + case kHistogram: + return TRITONSERVER_METRIC_KIND_HISTOGRAM; default: throw PythonBackendException("Unknown metric kind"); } diff --git a/src/metric_family.h b/src/metric_family.h index 04374a68..2b5f86ab 100644 --- a/src/metric_family.h +++ b/src/metric_family.h @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -97,8 +97,11 @@ class MetricFamily { /// Create a metric from the metric family and store it in the metric map. /// \param labels The labels of the metric. + /// \param buckets Monotonically increasing values representing bucket + /// boundaries for creating histogram metric. /// \return Returns the shared pointer to the created metric. - std::shared_ptr CreateMetric(const py::object& labels); + std::shared_ptr CreateMetric( + const py::object& labels, const py::object& buckets); #else /// Initialize the TRITONSERVER_MetricFamily object. /// \return Returns the address of the TRITONSERVER_MetricFamily object. @@ -128,8 +131,8 @@ class MetricFamily { std::string name_; // The description of the metric family. std::string description_; - // The metric kind of the metric family. Currently only supports GAUGE and - // COUNTER. + // The metric kind of the metric family. Currently only supports GAUGE, + // COUNTER and HISTOGRAM. MetricKind kind_; // The address of the TRITONSERVER_MetricFamily object. 
void* metric_family_address_; diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 2a6be556..007e7f29 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -1824,11 +1824,13 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::class_>(module, "Metric") .def("increment", &Metric::SendIncrementRequest) .def("set", &Metric::SendSetValueRequest) + .def("observe", &Metric::SendObserveRequest) .def("value", &Metric::SendGetValueRequest); py::enum_(module, "MetricKind") .value("COUNTER", MetricKind::kCounter) .value("GAUGE", MetricKind::kGauge) + .value("HISTOGRAM", MetricKind::kHistogram) .export_values(); py::class_>( @@ -1839,9 +1841,11 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module) py::arg("kind").none(false)) .def( "Metric", &MetricFamily::CreateMetric, - py::arg("labels").none(true) = py::none()); + py::arg("labels").none(true) = py::none(), + py::arg("buckets").none(true) = py::none()); module.attr("MetricFamily").attr("COUNTER") = MetricKind::kCounter; module.attr("MetricFamily").attr("GAUGE") = MetricKind::kGauge; + module.attr("MetricFamily").attr("HISTOGRAM") = MetricKind::kHistogram; module.def( "load_model", &LoadModel, py::arg("model_name").none(false), diff --git a/src/pb_utils.h b/src/pb_utils.h index 1a6c2d8b..e68cfb0f 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -171,7 +171,7 @@ struct ResponseBatch : SendMessageBase { enum LogLevel { kInfo = 0, kWarning, kError, kVerbose }; -enum MetricKind { kCounter = 0, kGauge }; +enum MetricKind { kCounter = 0, kGauge, kHistogram }; struct LogSendMessage : SendMessageBase { bi::managed_external_buffer::handle_t filename; diff --git a/src/python_be.cc b/src/python_be.cc index 2212176d..761abdbf 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -758,7 +758,8 @@ ModelInstanceState::StubToParentMQMonitor() case PYTHONSTUB_MetricRequestDelete: case PYTHONSTUB_MetricRequestValue: case PYTHONSTUB_MetricRequestIncrement: - case PYTHONSTUB_MetricRequestSet: { + case PYTHONSTUB_MetricRequestSet: + case PYTHONSTUB_MetricRequestObserve: { ProcessMetricRequest(message); break; } @@ -978,6 +979,7 @@ ModelInstanceState::ProcessMetricRequest( } case PYTHONSTUB_MetricRequestIncrement: case PYTHONSTUB_MetricRequestSet: + case PYTHONSTUB_MetricRequestObserve: case PYTHONSTUB_MetricRequestValue: { metric->HandleMetricOperation(metrics_message_ptr, command); break; From d84bb57b06c985162fe537b46593281589bfb4d5 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Wed, 28 Aug 2024 13:50:43 -0400 Subject: [PATCH 196/216] Improve the documentation for custom Python backend stubs (#377) * Improve the documentation for custom Python backend stubs * Review comment --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 913034a8..ffcef26c 100644 --- a/README.md +++ b/README.md @@ -809,8 +809,8 @@ Python version is different from Python 3.10 which is shipped by default in the Triton containers.** Python backend uses a *stub* process to connect your `model.py` file to the -Triton C++ core. This stub process has an embedded Python interpreter with -a fixed Python version. If you intend to use a Python interpreter with +Triton C++ core. 
This stub process dynamically links to a specific +`libpython..so` version. If you intend to use a Python interpreter with different version from the default Python backend stub, you need to compile your own Python backend stub by following the steps below: From 35a1c1fad5104c9c4149dd7fee69585d99bb6009 Mon Sep 17 00:00:00 2001 From: Yingge He <157551214+yinggeh@users.noreply.github.com> Date: Sat, 21 Sep 2024 13:16:39 -0700 Subject: [PATCH 197/216] perf: vLLM metrics optimization (#379) --- src/metric.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/metric.cc b/src/metric.cc index 7796b161..4c055910 100644 --- a/src/metric.cc +++ b/src/metric.cc @@ -167,6 +167,7 @@ Metric::SendCreateMetricRequest() void Metric::SendIncrementRequest(const double& value) { + py::gil_scoped_release release; try { CheckIfCleared(); std::unique_ptr& stub = Stub::GetOrCreateInstance(); @@ -204,6 +205,7 @@ Metric::SendSetValueRequest(const double& value) void Metric::SendObserveRequest(const double& value) { + py::gil_scoped_release release; try { CheckIfCleared(); std::unique_ptr& stub = Stub::GetOrCreateInstance(); From a2564eae71bd7b090c3694921036cac67bd018f2 Mon Sep 17 00:00:00 2001 From: Kris Hung Date: Tue, 8 Oct 2024 12:59:28 -0700 Subject: [PATCH 198/216] Add back 24.05 response sending path to fix performance (#381) * Add back 24.05 response sender path * Improve perf * Fix cleanup * Review comments * Fix up * Fix up * Fix response factory cleanup * Fix segfault * Fix error handling * Remove extra logs * Fix up, add comments * Address comment * Fix up --------- Co-authored-by: Iman Tabrizian --- src/infer_request.cc | 2 +- src/infer_request.h | 1 + src/ipc_message.cc | 23 +++ src/ipc_message.h | 9 + src/pb_stub.cc | 146 +++++++++++--- src/pb_stub.h | 5 +- src/pb_utils.h | 3 + src/python_be.cc | 438 +++++++++++++++++++++++++++++++++++------ src/python_be.h | 21 +- src/response_sender.cc | 23 ++- src/response_sender.h | 5 +- 11 files changed, 579 insertions(+), 97 deletions(-) diff --git a/src/infer_request.cc b/src/infer_request.cc index 8a95b524..e5733662 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -484,7 +484,7 @@ InferRequest::Exec(const bool is_decoupled) { bi::scoped_lock lock{ *(ipc_message->ResponseMutex())}; - stub->SendIPCMessage(ipc_message); + stub->SendIPCUtilsMessage(ipc_message); ipc_message->ResponseCondition()->wait(lock); } diff --git a/src/infer_request.h b/src/infer_request.h index c67e2fb0..f368d692 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -96,6 +96,7 @@ class InferRequest { InferenceTrace& GetTrace(); uint32_t ReleaseFlags(); void SetReleaseFlags(const uint32_t& flags); + intptr_t GetResponseFactoryAddress() { return response_factory_address_; } #ifdef TRITON_PB_STUB std::shared_ptr Exec(const bool is_decoupled); diff --git a/src/ipc_message.cc b/src/ipc_message.cc index ea1dc5b0..2fa13ba3 100644 --- a/src/ipc_message.cc +++ b/src/ipc_message.cc @@ -56,6 +56,21 @@ IPCMessage::Create( new IPCMessage(ipc_message_shm, response_mutex_shm, response_cond_shm)); } +std::unique_ptr +IPCMessage::Create( + IPCMessageShm* ipc_message_shm, + bi::managed_external_buffer::handle_t& message_handle) +{ + return std::unique_ptr( + new IPCMessage(ipc_message_shm, message_handle)); +} + +AllocatedSharedMemory& +IPCMessage::GetAllocatedSharedMemory() +{ + return ipc_message_shm_; +} + std::unique_ptr IPCMessage::LoadFromSharedMemory( std::unique_ptr& shm_pool, @@ -133,4 +148,12 @@ IPCMessage::IPCMessage( ipc_message_handle_ = ipc_message_shm_.handle_; } 
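// The overload below builds an IPCMessage view over an IPCMessageShm that
// the caller has already placed in shared memory, together with its handle,
// instead of allocating a fresh region. The stub-side execute path in this
// patch uses it to lay out the IPCMessageShm header and the ResponseBatch
// in a single shared-memory allocation (see the pb_stub.cc hunk further
// down in this commit).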
+IPCMessage::IPCMessage( + IPCMessageShm* ipc_message_shm, + bi::managed_external_buffer::handle_t& handle) +{ + ipc_message_handle_ = handle; + ipc_message_shm_ptr_ = ipc_message_shm; +} + }}}; // namespace triton::backend::python diff --git a/src/ipc_message.h b/src/ipc_message.h index 8e762b8f..c3d1472e 100644 --- a/src/ipc_message.h +++ b/src/ipc_message.h @@ -97,6 +97,10 @@ class IPCMessage { static std::unique_ptr Create( const std::unique_ptr& shm_pool, bool inline_response); + + static std::unique_ptr Create( + IPCMessageShm* ipc_message_shm, + bi::managed_external_buffer::handle_t& message_handle); static std::unique_ptr LoadFromSharedMemory( std::unique_ptr& shm_pool, bi::managed_external_buffer::handle_t message_handle); @@ -108,6 +112,7 @@ class IPCMessage { bi::interprocess_mutex* ResponseMutex(); bi::managed_external_buffer::handle_t& Args(); bi::managed_external_buffer::handle_t ShmHandle(); + AllocatedSharedMemory& GetAllocatedSharedMemory(); private: AllocatedSharedMemory ipc_message_shm_; @@ -129,6 +134,10 @@ class IPCMessage { AllocatedSharedMemory& ipc_message_shm, AllocatedSharedMemory& response_mutex_shm, AllocatedSharedMemory& response_cond_shm); + + IPCMessage( + IPCMessageShm* ipc_message_shm, + bi::managed_external_buffer::handle_t& handle); }; }}}; // namespace triton::backend::python diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 007e7f29..a26719d2 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -653,27 +653,20 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) { py::list py_request_list = LoadRequestsFromSharedMemory(request_batch_shm_ptr); - std::unique_ptr execute_response = - IPCMessage::Create(shm_pool_, false /* Inline response */); - execute_response->Command() = PYTHONSTUB_ExecuteResponse; + std::unique_ptr execute_response; - AllocatedSharedMemory response_batch = - shm_pool_->Construct(); - ResponseBatch* response_batch_shm_ptr = - reinterpret_cast(response_batch.data_.get()); - execute_response->Args() = response_batch.handle_; + std::optional> response_batch; bool has_exception = false; std::string error_string; std::unique_ptr error_string_shm; + std::string err_message; ScopedDefer execute_finalize([this] { stub_message_queue_->Pop(); }); ScopedDefer _( [this, &execute_response] { SendIPCMessage(execute_response); }); - + py::object execute_return; + py::object coroutine_return; try { - response_batch_shm_ptr->has_error = false; - response_batch_shm_ptr->is_error_set = false; - if (!py::hasattr(model_instance_, "execute")) { std::string message = "Python model " + model_context_.PythonModelPath() + " does not implement `execute` method."; @@ -683,8 +676,7 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) { NVTX_RANGE(nvtx_, "PyExecute " + name_); - py::object execute_return = - model_instance_.attr("execute")(py_request_list); + execute_return = model_instance_.attr("execute")(py_request_list); bool is_coroutine = py::module::import("asyncio") .attr("iscoroutine")(execute_return) @@ -694,12 +686,14 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) // Do not wait for async decoupled execute to return. 
RunCoroutine(execute_return, true /* in_background */); } else { - py::object coroutine_return = + coroutine_return = RunCoroutine(execute_return, false /* in_background */); - ProcessReturnedResponses(py_request_list, coroutine_return); + ProcessReturnedResponses( + py_request_list, coroutine_return, response_batch); } } else { - ProcessReturnedResponses(py_request_list, execute_return); + ProcessReturnedResponses( + py_request_list, execute_return, response_batch); } } } @@ -713,16 +707,36 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) } if (has_exception) { - std::string err_message = - std::string( - "Failed to process the request(s) for model '" + name_ + - "', message: ") + - error_string; + err_message = std::string( + "Failed to process the request(s) for model '" + name_ + + "', message: ") + + error_string; LOG_ERROR << err_message.c_str(); + if (!response_batch) { + response_batch = shm_pool_->Construct( + sizeof(ResponseBatch) + sizeof(IPCMessageShm)); + } + ResponseBatch* response_batch_shm_ptr = reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + + // The backend will clean up the response factory if there is an error in + // the response batch. For decoupled mode, it is necessary to handle cases + // where the response sender should have already cleaned up, ensuring the + // backend does not delete the response factory again during error handling. + if (IsDecoupled()) { + for (py::handle py_request : py_request_list) { + InferRequest* request = py_request.cast(); + if (request->GetResponseSender()->IsClosed()) { + response_batch_shm_ptr->is_response_factory_deleted = true; + } + } + } + response_batch_shm_ptr->has_error = true; error_string_shm = PbString::Create(shm_pool_, err_message); response_batch_shm_ptr->error = error_string_shm->ShmHandle(); response_batch_shm_ptr->is_error_set = true; + response_batch_shm_ptr->batch_size = 0; // Once the error is sent to the backend, the backend is supposed to close // all response factories if not already closed, so closing all response // senders if not already closed to prevent the model from sending more @@ -731,12 +745,47 @@ Stub::ProcessRequests(RequestBatch* request_batch_shm_ptr) InferRequest* request = py_request.cast(); request->GetResponseSender()->Close(); } + } else { + if (!response_batch) { + response_batch = shm_pool_->Construct( + sizeof(ResponseBatch) + sizeof(IPCMessageShm)); + ResponseBatch* response_batch_shm_ptr = reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + response_batch_shm_ptr->batch_size = 0; + } + ResponseBatch* response_batch_shm_ptr = reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + response_batch_shm_ptr->has_error = false; + response_batch_shm_ptr->is_error_set = false; + } + + execute_response = IPCMessage::Create( + reinterpret_cast(response_batch.value().data_.get()), + response_batch.value().handle_); + execute_response->Args() = + response_batch.value().handle_ + sizeof(IPCMessageShm); + execute_response->InlineResponse() = false; + execute_response->Command() = PYTHONSTUB_ExecuteResponse; + _.Complete(); + execute_finalize.Complete(); +} + +void +Stub::ProcessResponse(InferResponse* response) +{ + response->SaveToSharedMemory(shm_pool_, false /* copy_gpu */); + + for (auto& output_tensor : response->OutputTensors()) { + if (!output_tensor->IsCPU()) { + gpu_tensors_.push_back(output_tensor); + } } } void Stub::ProcessReturnedResponses( - py::list py_requests, py::object 
py_responses_obj) + py::list py_requests, py::object py_responses_obj, + std::optional>& response_batch) { // Return if there is nothing to process. if (py::isinstance(py_responses_obj)) { @@ -784,12 +833,55 @@ Stub::ProcessReturnedResponses( "return list, found type '" + std::string(py::str(py_responses[i].get_type())) + "'."); } - std::shared_ptr response = - py_responses[i].cast>(); - request->GetResponseSender()->Send( - response, TRITONSERVER_RESPONSE_COMPLETE_FINAL); + + InferResponse* response = py_responses[i].cast(); + try { + request->GetResponseSender()->UpdateStateAndCounters( + response, TRITONSERVER_RESPONSE_COMPLETE_FINAL); + } + catch (const PythonBackendException& pb_exception) { + // Handle the exception here to catch the error when there's a response + // returned from `execute()`. + if (request->GetResponseSender()->IsClosed()) { + response_batch = std::move(shm_pool_->Construct( + sizeof(ResponseBatch) + sizeof(IPCMessageShm))); + ResponseBatch* response_batch_shm_ptr = + reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + response_batch_shm_ptr->batch_size = 0; + response_batch_shm_ptr->is_response_factory_deleted = true; + } + throw pb_exception; + } + } + } + // Return all the created responses using response_batch. The reason + // that both of the paths are available is that sending the responses + // using response_batch is faster than using `response_sender`. + response_batch = std::move(shm_pool_->Construct( + sizeof(IPCMessageShm) + + requests_size * sizeof(bi::managed_external_buffer::handle_t) + + sizeof(ResponseBatch))); + ResponseBatch* response_batch_shm_ptr = reinterpret_cast( + response_batch.value().data_.get() + sizeof(IPCMessageShm)); + + bi::managed_external_buffer::handle_t* responses_shm_handle = + reinterpret_cast( + response_batch.value().data_.get() + sizeof(ResponseBatch) + + sizeof(IPCMessageShm)); + for (size_t i = 0; i < responses_size; i++) { + // Check the return type of execute function. + InferRequest* infer_request = py_requests[i].cast(); + InferResponse* infer_response = py_responses[i].cast(); + if (!py::isinstance(py_responses[i])) { + infer_response->PruneOutputTensors(infer_request->RequestedOutputNames()); + ProcessResponse(infer_response); + responses_shm_handle[i] = infer_response->ShmHandle(); + } else { + responses_shm_handle[i] = 0; } } + response_batch_shm_ptr->batch_size = requests_size; } py::object diff --git a/src/pb_stub.h b/src/pb_stub.h index 9ed74d9a..7d76ec9a 100644 --- a/src/pb_stub.h +++ b/src/pb_stub.h @@ -254,7 +254,10 @@ class Stub { void ProcessRequests(RequestBatch* request_batch_shm_ptr); void ProcessReturnedResponses( - py::list py_requests, py::object py_responses_obj); + py::list py_requests, py::object py_responses_obj, + std::optional>& response_batch); + + void ProcessResponse(InferResponse* response); py::object GetAsyncEventLoop(); diff --git a/src/pb_utils.h b/src/pb_utils.h index e68cfb0f..aacf6b49 100644 --- a/src/pb_utils.h +++ b/src/pb_utils.h @@ -167,6 +167,9 @@ struct ResponseBatch : SendMessageBase { bool is_error_set; uint32_t response_size; + + // Indicates whether the response factory has been deleted or not. 
+ bool is_response_factory_deleted = false; }; enum LogLevel { kInfo = 0, kWarning, kError, kVerbose }; diff --git a/src/python_be.cc b/src/python_be.cc index 761abdbf..bdf7b95f 100644 --- a/src/python_be.cc +++ b/src/python_be.cc @@ -153,6 +153,23 @@ ModelInstanceState::SetErrorForResponseSendMessage( } } +bool +ModelInstanceState::IsStubProcessAlive() +{ + boost::posix_time::ptime timeout = + boost::get_system_time() + boost::posix_time::seconds(1); + bi::scoped_lock lock(*Stub()->HealthMutex(), timeout); + + // Check if lock has been acquired. + if (lock) { + return Stub()->IpcControl()->stub_health; + } else { + // If It failed to obtain the lock, it means that the stub has been + // stuck or exited while holding the health mutex lock. + return false; + } +} + TRITONSERVER_Error* ModelInstanceState::SaveRequestsToSharedMemory( TRITONBACKEND_Request** requests, const uint32_t request_count, @@ -290,7 +307,7 @@ ModelInstanceState::SaveRequestsToSharedMemory( request, &request_timeout)); std::unique_ptr infer_request; - TRITONBACKEND_ResponseFactory* factory_ptr; + TRITONBACKEND_ResponseFactory* factory_ptr = nullptr; RETURN_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request)); infer_request = std::make_unique( @@ -322,8 +339,6 @@ ModelInstanceState::LaunchStubProcess() thread_pool_ = std::make_unique( model_state->StateForBackend()->thread_pool_size); - queue_monitor_thread_ = true; - queue_monitor_ = std::thread(&ModelInstanceState::MessageQueueMonitor, this); request_executor_ = std::make_unique( Stub()->ShmPool(), model_state->TritonServer()); @@ -685,44 +700,6 @@ ModelInstanceState::ExecuteBLSRequest( } } -void -ModelInstanceState::MessageQueueMonitor() -{ - while (queue_monitor_thread_) { - bi::managed_external_buffer::handle_t handle = - Stub()->ParentMessageQueue()->Pop(); - if (handle == DUMMY_MESSAGE) { - break; - } - std::unique_ptr message = - IPCMessage::LoadFromSharedMemory(Stub()->ShmPool(), handle); - - // Need to notify the model instance thread that the execute response has - // been received. 
- if (message->Command() == PYTHONSTUB_ExecuteResponse) { - std::lock_guard guard{mu_}; - received_message_ = std::move(message); - cv_.notify_one(); - } else if (message->Command() == PYTHONSTUB_ResponseSend) { - std::shared_ptr response_send_message = std::move(message); - std::packaged_task task([this, response_send_message] { - ResponseSendDecoupled(response_send_message); - }); - boost::asio::post(*thread_pool_, std::move(task)); - } else if ( - message->Command() == PYTHONSTUB_InferExecRequest || - message->Command() == PYTHONSTUB_InferStreamExecRequest) { - std::shared_ptr bls_execute = std::move(message); - std::packaged_task task([this, bls_execute] { - ExecuteBLSRequest( - bls_execute, - (bls_execute->Command() == PYTHONSTUB_InferStreamExecRequest)); - }); - boost::asio::post(*thread_pool_, std::move(task)); - } - } -} - void ModelInstanceState::StubToParentMQMonitor() { @@ -769,6 +746,25 @@ ModelInstanceState::StubToParentMQMonitor() ProcessModelControlRequest(message); break; } + case PYTHONSTUB_ResponseSend: { + std::shared_ptr response_send_message = std::move(message); + std::packaged_task task([this, response_send_message] { + ResponseSendDecoupled(response_send_message); + }); + boost::asio::post(*thread_pool_, std::move(task)); + break; + } + case PYTHONSTUB_InferExecRequest: + case PYTHONSTUB_InferStreamExecRequest: { + std::shared_ptr bls_execute = std::move(message); + std::packaged_task task([this, bls_execute] { + ExecuteBLSRequest( + bls_execute, + (bls_execute->Command() == PYTHONSTUB_InferStreamExecRequest)); + }); + boost::asio::post(*thread_pool_, std::move(task)); + break; + } default: { LOG_MESSAGE( TRITONSERVER_LOG_ERROR, "Unexpected message type received."); @@ -1030,6 +1026,100 @@ ModelInstanceState::ProcessModelControlRequest( }); } +TRITONSERVER_Error* +ModelInstanceState::SendMessageToStub( + bi::managed_external_buffer::handle_t message) +{ + bool success = false; + while (!success) { + uint64_t timeout_miliseconds = 1000; + { + boost::posix_time::ptime timeout = + boost::get_system_time() + + boost::posix_time::milliseconds(timeout_miliseconds); + + bi::scoped_lock lock( + *(Stub()->HealthMutex()), timeout); + + // Check if lock has been acquired. + if (lock) { + Stub()->IpcControl()->stub_health = false; + } else { + // If it failed to obtain the lock, it means that the stub has been + // stuck or exited while holding the health mutex lock. 
+ return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, "Failed to obtain the health mutex."); + } + } + + Stub()->StubMessageQueue()->Push( + message, timeout_miliseconds /* duration ms */, success); + + if (!success && !IsStubProcessAlive()) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, "Stub process is not healthy."); + } + } + + return nullptr; // success +} + +void +ModelInstanceState::SendMessageAndReceiveResponse( + bi::managed_external_buffer::handle_t message, + bi::managed_external_buffer::handle_t& response, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count) +{ + auto error = SendMessageToStub(message); + if (error != nullptr) { + RespondErrorToAllRequests( + TRITONSERVER_ErrorMessage(error), responses, requests, request_count); + + return; + } + + bi::managed_external_buffer::handle_t response_message; + error = Stub()->ReceiveMessageFromStub(response_message); + if (error != nullptr) { + RespondErrorToAllRequests( + TRITONSERVER_ErrorMessage(error), responses, requests, request_count); + + return; + } + + response = response_message; +} + +void +ModelInstanceState::RespondErrorToAllRequests( + const char* message, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count) +{ + for (uint32_t r = 0; r < request_count; ++r) { + if ((*responses)[r] == nullptr) + continue; + + std::string err_message = + std::string( + "Failed to process the request(s) for model instance '" + Name() + + "', message: ") + + message; + + TRITONSERVER_Error* err = + TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, err_message.c_str()); + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), + "failed sending response"); + + (*responses)[r] = nullptr; + TRITONSERVER_ErrorDelete(err); + } +} + + void ModelInstanceState::StartMonitor() { @@ -1060,6 +1150,17 @@ ModelInstanceState::ResponseSendDecoupled( ResponseSendMessage* send_message_payload = reinterpret_cast(send_message.data_.get()); std::unique_ptr error_message; + ScopedDefer response_factory_deleter([send_message_payload] { + if (send_message_payload->flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { + TRITONBACKEND_ResponseFactory* response_factory = + reinterpret_cast( + send_message_payload->response_factory_address); + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + lresponse_factory(reinterpret_cast( + response_factory)); + } + }); ScopedDefer _([send_message_payload] { { bi::scoped_lock guard{send_message_payload->mu}; @@ -1228,31 +1329,48 @@ ModelInstanceState::ProcessRequests( IPCMessage::Create(Stub()->ShmPool(), false /*inline_response*/)); ipc_message->Command() = PYTHONSTUB_CommandType::PYTHONSTUB_ExecuteRequest; ipc_message->Args() = request_batch.handle_; - received_message_ = nullptr; - ScopedDefer _([this] { + + ScopedDefer execute_finalize([this] { // Push a dummy message to signal the thread to terminate. 
Stub()->StubMessageQueue()->Push(DUMMY_MESSAGE); }); + std::unique_ptr response; { - std::unique_lock guard{mu_}; Stub()->StubMessageQueue()->Push(ipc_message->ShmHandle()); - cv_.wait(guard, [this] { return received_message_ != nullptr; }); + bi::managed_external_buffer::handle_t response_message; + RETURN_IF_ERROR(Stub()->ReceiveMessageFromStub(response_message)); + response = + IPCMessage::LoadFromSharedMemory(Stub()->ShmPool(), response_message); } - - AllocatedSharedMemory response_batch = - Stub()->ShmPool()->Load(received_message_->Args()); - received_message_.reset(); + char* ipc_message_shm = + reinterpret_cast(response->GetAllocatedSharedMemory().data_.get()); + ResponseBatch* response_batch_shm_ptr = + reinterpret_cast(ipc_message_shm + sizeof(IPCMessageShm)); uint64_t compute_end_ns = 0; SET_TIMESTAMP(compute_end_ns); reporter.SetComputeEndNs(compute_end_ns); reporter.SetBatchStatistics(total_batch_size); - if (response_batch.data_->has_error) { - if (response_batch.data_->is_error_set) { + if (response_batch_shm_ptr->has_error) { + // Clean up the response factory if an error occurred. The + // `is_response_factory_deleted` flag indicates whether the response factory + // has been deleted for some corner cases. + if (!response_batch_shm_ptr->is_response_factory_deleted) { + for (uint32_t r = 0; r < request_count; r++) { + TRITONBACKEND_ResponseFactory* response_factory = + reinterpret_cast( + pb_infer_requests[r]->GetResponseFactoryAddress()); + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + lresponse_factory(reinterpret_cast( + response_factory)); + } + } + if (response_batch_shm_ptr->is_error_set) { auto error = PbString::LoadFromSharedMemory( - Stub()->ShmPool(), response_batch.data_->error); + Stub()->ShmPool(), response_batch_shm_ptr->error); return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, error->String().c_str()); } @@ -1261,6 +1379,218 @@ ModelInstanceState::ProcessRequests( TRITONSERVER_ERROR_INTERNAL, "Failed to process the requests."); } + if (response_batch_shm_ptr->batch_size > 0) { + bi::managed_external_buffer::handle_t* response_shm_handle = + reinterpret_cast( + ipc_message_shm + sizeof(ResponseBatch) + sizeof(IPCMessageShm)); + + std::shared_ptr> responses( + new std::vector()); + responses->reserve(request_count); + for (size_t i = 0; i < request_count; i++) { + // It is possible to have multiple responses batched together in a single + // response batch shm, where some of the responses are None due to the + // usage of response sender, so only create a TRITONBACKEND_Response + // object for the valid responses. 
+ if (response_shm_handle[i] == 0) { + responses->emplace_back(nullptr); + } else { + TRITONBACKEND_Response* response; + auto err = TRITONBACKEND_ResponseNew(&response, requests[i]); + if (err == nullptr) { + responses->emplace_back(response); + } else { + responses->emplace_back(nullptr); + LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response"); + TRITONSERVER_ErrorDelete(err); + } + } + } + + std::vector requires_deferred_callback; + + bool has_gpu_output = false; + std::vector> shm_responses; + std::vector, void*>>> + gpu_output_buffers(request_count); + GPUBuffersHelper gpu_buffer_helper; + + for (uint32_t r = 0; r < request_count; ++r) { + NVTX_RANGE(nvtx_, "LoadingResponse " + Name()); + requires_deferred_callback.push_back(false); + if (response_shm_handle[r] == 0) { + continue; + } + TRITONBACKEND_Response* response = (*responses)[r]; + TRITONBACKEND_Request* request = requests[r]; + uint32_t requested_output_count = 0; + + shm_responses.emplace_back(nullptr); + std::unique_ptr& infer_response = shm_responses.back(); + try { + if (pb_infer_requests[r]->ReleaseFlags() == + TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) { + // For rescheduled requests, we do not need to send a response. + LOG_IF_ERROR( + TRITONBACKEND_ResponseDelete((*responses)[r]), + "failed to delete response"); + (*responses)[r] = nullptr; + continue; + } + { + TRITONBACKEND_ResponseFactory* response_factory = + reinterpret_cast( + pb_infer_requests[r]->GetResponseFactoryAddress()); + std::unique_ptr< + TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter> + lresponse_factory( + reinterpret_cast( + response_factory)); + } + infer_response = InferResponse::LoadFromSharedMemory( + Stub()->ShmPool(), response_shm_handle[r], + false /* open_cuda_handle */); + if (infer_response->HasError()) { + TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( + infer_response->Error()->Code(), + infer_response->Error()->Message().c_str()); + + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), + "failed sending response"); + TRITONSERVER_ErrorDelete(err); + (*responses)[r] = nullptr; + + // Reset the release flags for the request. + pb_infer_requests[r]->SetReleaseFlags( + TRITONSERVER_REQUEST_RELEASE_ALL); + + // If has_error is true, we do not look at the response tensors. + continue; + } + } + catch (const PythonBackendException& pb_exception) { + TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); + LOG_IF_ERROR( + TRITONBACKEND_ResponseSend( + (*responses)[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, err), + "failed sending response"); + TRITONSERVER_ErrorDelete(err); + (*responses)[r] = nullptr; + + // Reset the release flags for the request. + pb_infer_requests[r]->SetReleaseFlags(TRITONSERVER_REQUEST_RELEASE_ALL); + + continue; + } + + GUARDED_RESPOND_IF_ERROR( + responses, r, + TRITONBACKEND_RequestOutputCount(request, &requested_output_count)); + std::set requested_output_names; + for (size_t j = 0; j < requested_output_count; ++j) { + const char* output_name; + GUARDED_RESPOND_IF_ERROR( + responses, r, + TRITONBACKEND_RequestOutputName(request, j, &output_name)); + requested_output_names.insert(output_name); + } + + bool require_deferred_callback = false; + +#ifdef TRITON_ENABLE_GPU + for (auto& output_tensor : infer_response->OutputTensors()) { + if (output_tensor->MemoryType() == TRITONSERVER_MEMORY_GPU) { + // Attempt to use the cuda shared memory pool for GPU tensor. 
+ ShareCUDAMemoryPool(output_tensor->MemoryTypeId()); + } + } +#endif // TRITON_ENABLE_GPU + + gpu_output_buffers[r] = + std::vector, void*>>{}; + infer_response->Send( + response, CudaStream(), require_deferred_callback, + TRITONSERVER_RESPONSE_COMPLETE_FINAL, Stub()->ShmPool(), + gpu_buffer_helper, gpu_output_buffers[r], requested_output_names); + + requires_deferred_callback[r] = require_deferred_callback; + + if (requires_deferred_callback[r]) { + has_gpu_output = true; + } + } + + execute_finalize.Complete(); + + // If the output tensor is in GPU, there will be a second round trip + // required for filling the GPU buffers provided by the main process. + if (has_gpu_output) { + ipc_message->Command() = + PYTHONSTUB_CommandType::PYTHONSTUB_LoadGPUBuffers; + gpu_buffer_helper.Complete(Stub()->ShmPool()); + ipc_message->Args() = gpu_buffer_helper.ShmHandle(); + bi::managed_external_buffer::handle_t response_message; + SendMessageAndReceiveResponse( + ipc_message->ShmHandle(), response_message, responses, requests, 0); + + bool cuda_copy = false; + + uint32_t response_index = 0; + for (auto& gpu_output_buffer : gpu_output_buffers) { + for (auto& buffer_memory_pair : gpu_output_buffer) { + auto& pb_memory = buffer_memory_pair.first; + void* pointer = buffer_memory_pair.second; + bool cuda_used = false; + + if (pb_memory->MemoryType() == TRITONSERVER_MEMORY_CPU) { + GUARDED_RESPOND_IF_ERROR( + responses, response_index, + CopyBuffer( + "Failed to copy the output tensor to buffer.", + TRITONSERVER_MEMORY_CPU, 0, TRITONSERVER_MEMORY_CPU, 0, + pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, + CudaStream(), &cuda_used)); + cuda_copy |= cuda_used; + } else if ( + (pb_memory->MemoryType() == TRITONSERVER_MEMORY_GPU) && + pb_memory->UseCUDASharedPool() && + (pb_memory->DataPtr() != pointer)) { + // If the data pointer from pb_memory is not the same as the + // pointer, it means that the Triton-provided buffer is not used + // during tensor transfer. Instead, an intermediate buffer that uses + // CUDA shared memory pool is used. In this case, we need to copy + // the data from the intermediate buffer back to the Triton-provided + // buffer. + GUARDED_RESPOND_IF_ERROR( + responses, response_index, + CopyBuffer( + "Failed to copy the output tensor to buffer.", + TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), + TRITONSERVER_MEMORY_GPU, pb_memory->MemoryTypeId(), + pb_memory->ByteSize(), pb_memory->DataPtr(), pointer, + CudaStream(), &cuda_used)); + cuda_copy |= cuda_used; + } + } + response_index++; +#ifdef TRITON_ENABLE_GPU + if (cuda_copy) { + cudaStreamSynchronize(stream_); + } +#endif // TRITON_ENABLE_GPU + } + } + + for (uint32_t r = 0; r < request_count; ++r) { + if (requires_deferred_callback[r]) { + shm_responses[r]->DeferredSendCallback(); + } + } + } + return nullptr; // success } @@ -1401,16 +1731,12 @@ ModelInstanceState::~ModelInstanceState() if (Stub()->IsHealthy()) { // Wait for all the pending tasks to finish. thread_pool_->wait(); - // Push a dummy message to signal the thread to terminate. 
- Stub()->ParentMessageQueue()->Push(DUMMY_MESSAGE); - queue_monitor_.join(); } // Terminate stub first to allow any last messages to be received by the back // end before deallocating the queue memory Stub()->TerminateStub(); TerminateMonitor(); Stub()->ClearQueues(); - received_message_.reset(); Stub().reset(); } diff --git a/src/python_be.h b/src/python_be.h index 59660fc4..c98e1284 100644 --- a/src/python_be.h +++ b/src/python_be.h @@ -287,9 +287,6 @@ class ModelInstanceState : public BackendModelInstance { std::thread stub_to_parent_queue_monitor_; bool stub_to_parent_thread_; - // Queue monitor thread - std::thread queue_monitor_; - bool queue_monitor_thread_; std::mutex mu_; std::condition_variable cv_; std::unique_ptr received_message_; @@ -361,6 +358,24 @@ class ModelInstanceState : public BackendModelInstance { AllocatedSharedMemory& request_batch, std::shared_ptr>& responses); + void SendMessageAndReceiveResponse( + bi::managed_external_buffer::handle_t message, + bi::managed_external_buffer::handle_t& response, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count); + + void RespondErrorToAllRequests( + const char* message, + std::shared_ptr>& responses, + TRITONBACKEND_Request** requests, const uint32_t request_count); + + // void SendMessageToStub(bi::managed_external_buffer::handle_t message); + TRITONSERVER_Error* SendMessageToStub( + bi::managed_external_buffer::handle_t message); + + // Checks whether the stub process is live + bool IsStubProcessAlive(); + // Model instance stub std::unique_ptr& Stub() { return model_instance_stub_; } diff --git a/src/response_sender.cc b/src/response_sender.cc index 0a88fb6b..ef3b09dd 100644 --- a/src/response_sender.cc +++ b/src/response_sender.cc @@ -74,7 +74,7 @@ ResponseSender::~ResponseSender() void ResponseSender::UpdateStateAndCounters( - const std::shared_ptr& response, const uint32_t flags) + InferResponse* response, const uint32_t flags) { if (is_decoupled_ == nullptr) { // TODO: Can a model access the response sender on a BLS infer request? @@ -106,6 +106,7 @@ ResponseSender::UpdateStateAndCounters( } if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { + response_factory_deleted_.exchange(true); closed_ = true; } number_of_response_sent_++; @@ -123,7 +124,7 @@ ResponseSender::Send( py::gil_scoped_release release; CheckResponseSenderArguments(infer_response, flags); - UpdateStateAndCounters(infer_response, flags); + UpdateStateAndCounters(infer_response.get(), flags); if (infer_response) { infer_response->PruneOutputTensors(requested_output_names_); } @@ -172,7 +173,11 @@ ResponseSender::Send( { bi::scoped_lock guard{send_message_payload->mu}; - stub->SendIPCMessage(ipc_message); + // The server will destruct the response factory if the final flag is set. 
+ if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { + response_factory_deleted_.exchange(true); + } + stub->SendIPCUtilsMessage(ipc_message); while (!send_message_payload->is_stub_turn) { send_message_payload->cv.wait(guard); } @@ -246,10 +251,6 @@ ResponseSender::Send( "An error occurred while sending a response."); } } - - if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL) { - DeleteResponseFactory(); - } } bool @@ -258,11 +259,19 @@ ResponseSender::IsCancelled() return pb_cancel_->IsCancelled(); } +bool +ResponseSender::IsClosed() +{ + std::lock_guard lk(mu_); + return closed_; +} + void ResponseSender::Close() { std::lock_guard lk(mu_); closed_ = true; + response_factory_deleted_.exchange(true); } void diff --git a/src/response_sender.h b/src/response_sender.h index 69f416c2..a696f9eb 100644 --- a/src/response_sender.h +++ b/src/response_sender.h @@ -43,16 +43,17 @@ class ResponseSender { const std::set& requested_output_names, std::unique_ptr& shm_pool, const std::shared_ptr& pb_cancel); + intptr_t ResponseFactory() { return response_factory_address_; } ~ResponseSender(); void Send(std::shared_ptr response, const uint32_t flags); bool IsCancelled(); + void UpdateStateAndCounters(InferResponse* response, const uint32_t flags); // Can be useful at stopping the model from sending any more responses. void Close(); + bool IsClosed(); private: - void UpdateStateAndCounters( - const std::shared_ptr& response, const uint32_t flags); void DeleteResponseFactory(); intptr_t request_address_; From 682db01bbec9d4b4ed18da80c37a55d9331b83ac Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Thu, 31 Oct 2024 10:30:25 -0700 Subject: [PATCH 199/216] build: RHEL8 Python Backend (#385) * PYBE RHEL --- CMakeLists.txt | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ee209b5b..d27f10a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,6 +92,18 @@ FetchContent_Declare( GIT_TAG "aa304c9c7d725ffb9d10af08a3b34cb372307020" GIT_SHALLOW ON ) + +# RHEL base container has multiple version of Python installed. By default +# it seems like pybind will pickup v3.6, so we specifically assign it to +# search for 3.12 here. +set(RHEL_BUILD OFF) +if(LINUX) + file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") + if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") + set(RHEL_BUILD ON) + set(PYBIND11_PYTHON_VERSION 3.12) + endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") +endif(LINUX) FetchContent_MakeAvailable(pybind11) # @@ -268,6 +280,23 @@ target_compile_options( ) target_compile_definitions(triton-python-backend-stub PRIVATE TRITON_PB_STUB) +# RHEL assets are not released in a container environment nor do the current +# Python lib versions in the manylinux base container match those currently +# available for RHEL8 package managers. Therefore, we package the correct +# python libs in the backend folder and adjust the stub executable to look +# in its own folder at runtime. 
+if(RHEL_BUILD) + set_target_properties( + triton-python-backend-stub + PROPERTIES + SKIP_BUILD_RPATH TRUE + BUILD_WITH_INSTALL_RPATH TRUE + INSTALL_RPATH_USE_LINK_PATH FALSE + INSTALL_RPATH "$\{ORIGIN\}" + ) +endif(RHEL_BUILD) + + # For WIN32 do not link Threads and DL_LIBS if(WIN32) target_link_libraries( From 09c35373d66141ad052bccf4b1591d1f1ad8034c Mon Sep 17 00:00:00 2001 From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Date: Fri, 1 Nov 2024 15:47:19 -0500 Subject: [PATCH 200/216] updating pybind11 version (#384) --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d27f10a5..cc94b3aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,8 +88,8 @@ FetchContent_MakeAvailable(repo-common repo-core repo-backend) FetchContent_Declare( pybind11 GIT_REPOSITORY "https://github.com/pybind/pybind11" - # COMMIT ID for v2.10.0 - GIT_TAG "aa304c9c7d725ffb9d10af08a3b34cb372307020" + # COMMIT ID for v2.12.0 + GIT_TAG "3e9dfa2866941655c56877882565e7577de6fc7b" GIT_SHALLOW ON ) From f3068c03ed82e099cef5e2b40e9d1d79b1eab7ac Mon Sep 17 00:00:00 2001 From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com> Date: Tue, 26 Nov 2024 15:32:33 -0800 Subject: [PATCH 201/216] Build Updates for Ubuntu24.04 (#386) (#387) (#388) * skip warning errors * Revert "skip warning errors" This reverts commit 071c052dd876820776c27a792271ba4100a4ce8a. * Reapply "skip warning errors" This reverts commit a088c296484ee4a4dae60cde70111b4225524258. Co-authored-by: Anant Sharma --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cc94b3aa..0dc70f0d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -267,7 +267,7 @@ target_compile_features(triton-python-backend PRIVATE cxx_std_${TRITON_MIN_CXX_S target_compile_options( triton-python-backend PRIVATE $<$,$,$>: - -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror> + -Wall -Wextra -Wno-unused-parameter -Wno-type-limits> $<$:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor> ) From 8e123478ecdf33ae781e6419cb2b84942a7365ff Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Tue, 10 Dec 2024 09:57:09 -0800 Subject: [PATCH 202/216] Remove Strict Requirement (#389) --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0dc70f0d..69c7c698 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -101,7 +101,6 @@ if(LINUX) file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") set(RHEL_BUILD ON) - set(PYBIND11_PYTHON_VERSION 3.12) endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") endif(LINUX) FetchContent_MakeAvailable(pybind11) From b771f4f2f3fae3eb97c8f3624d268fd3947f96ea Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Wed, 11 Dec 2024 14:05:59 -0800 Subject: [PATCH 203/216] fix: Fix requested output deleting extra outputs (#390) * fix: Hold GIL when deleting numpy array * chore: setting py obj to None may not destruct the object --- src/pb_stub.cc | 6 +++--- src/pb_tensor.cc | 8 ++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/pb_stub.cc b/src/pb_stub.cc index a26719d2..51df5aa2 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -1032,9 +1032,9 @@ Stub::~Stub() { py::gil_scoped_acquire acquire; - async_event_loop_ = py::none(); - background_futures_ = py::none(); - model_instance_ = py::none(); + py::object 
async_event_loop_local(std::move(async_event_loop_)); + py::object background_futures_local(std::move(background_futures_)); + py::object model_instance_local(std::move(model_instance_)); } stub_instance_.reset(); stub_message_queue_.reset(); diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc index 1ab95144..9fde62fe 100644 --- a/src/pb_tensor.cc +++ b/src/pb_tensor.cc @@ -503,6 +503,14 @@ PbTensor::~PbTensor() noexcept(false) { pb_memory_.reset(); DeleteDLPack(); + +#ifdef TRITON_PB_STUB + { + py::gil_scoped_acquire acquire; + py::array numpy_array_local(std::move(numpy_array_)); + py::array numpy_array_serialized_local(std::move(numpy_array_serialized_)); + } +#endif } const std::string& From 1ea48a6f7c3d4c27ceacc0ad1acdbe2002a0476c Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Fri, 24 Jan 2025 16:20:45 -0800 Subject: [PATCH 204/216] feat: Add parameters support to InferResponse (#394) * Add parameters support to InferResponse * Infer response to track parameters * Add parameters to binding infer response * Rank parameters argument up among InferResponse constructor arguments * Add setting parameters to Triton response * Send response parameters only on non-error * Fix double declaration * Unify py dictionary parameters to json str * Add documentation * Mark response parameters accessor const and JSON serializable * [Docs] Note BLS response parameters are not populated currently * [comment] Clarify why PbTensor::LoadFromSharedMemory() requires holding GIL --- README.md | 21 ++++++- src/infer_response.cc | 78 ++++++++++++++++++++---- src/infer_response.h | 13 ++-- src/pb_stub.cc | 131 ++++++++++++++++++++++------------------ src/request_executor.cc | 20 +++--- 5 files changed, 180 insertions(+), 83 deletions(-) diff --git a/README.md b/README.md index ffcef26c..a6242a44 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@
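For the final commit in this series, "feat: Add parameters support to InferResponse", a hedged model-side sketch of the feature: the `parameters` keyword and the dict-to-JSON handling are inferred from the commit summary rather than copied from the diff, so treat them as assumptions.

```
# Sketch: returning per-response parameters from a Python model, assuming
# the InferenceResponse binding gained a `parameters` argument as described
# in the commit summary. A Python dict is reportedly serialized to a JSON
# string internally.
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        responses = []
        for _ in requests:
            responses.append(
                pb_utils.InferenceResponse(
                    output_tensors=[],
                    parameters={"compiled": True, "backend": "python"},
                )
            )
        return responses
```

Per the commit notes, BLS response parameters are not populated yet, so parameters set this way are only expected on responses returned to the client.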