
Commit 73e8fc7

the-david-o and hemantj authored
Share Model Weights for Instances on the Same Device (triton-inference-server#54)
* Share model weights for instances on the same device
* Added destructor.
* Shortened comment
* Clarified comment.
* Addressing comments.
* Make weight sharing optional, off by default.
* Addressing comments.
* Formatting.
* Reworded comment for optionality of weight sharing.
* Let destructor remove last shared pointer
* Weight sharing documentation
* Fixing typo
* Updated copyright years

Co-authored-by: hemantj <[email protected]>
1 parent 1c4db70 commit 73e8fc7

File tree

* README.md
* src/libtorch.cc

2 files changed: 82 additions & 8 deletions


README.md

Lines changed: 19 additions & 4 deletions
@@ -1,5 +1,5 @@
 <!--
-# Copyright 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -111,7 +111,7 @@ execution of models without these optimizations. In some models, optimized execu
 does not benefit performance as seen [here](https://github.com/pytorch/pytorch/issues/19978)
 and in other cases impacts performance negatively, as seen [here](https://github.com/pytorch/pytorch/issues/53824).
 
-The section of model config file specifying this parameters will look like:
+The section of model config file specifying this parameter will look like:
 
 ```
 parameters: {
@@ -133,7 +133,7 @@ this mode gets better performance by disabling autograd.
 Please note that in some models, InferenceMode might not benefit performance
 and in fewer cases might impact performance negatively.
 
-The section of model config file specifying this parameters will look like:
+The section of model config file specifying this parameter will look like:
 
 ```
 parameters: {
@@ -153,7 +153,7 @@ Please note that in some models generated using trace in old PyTorch versions mi
 correctly with NvFuser. We recommend using scripting and a recent version of PyTorch
 to generate these models.
 
-The section of model config file specifying this parameters will look like:
+The section of model config file specifying this parameter will look like:
 
 ```
 parameters: {
@@ -164,6 +164,21 @@ key: "ENABLE_NVFUSER"
 }
 ```
 
+* `ENABLE_WEIGHT_SHARING`: Boolean flag to enable model instances on the same device to
+share weights. This optimization should not be used with stateful models. If not specified,
+weight sharing is disabled.
+
+The section of model config file specifying this parameter will look like:
+
+```
+parameters: {
+key: "ENABLE_WEIGHT_SHARING"
+value: {
+string_value:"true"
+}
+}
+```
+
 * Additional Optimizations: Three additional boolean parameters are available to disable
 certain Torch optimizations that can sometimes cause latency regressions in models with
 complex execution modes and dynamic shapes. If not specified, all are enabled by default.
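
For concreteness, a minimal model configuration combining the new parameter with multiple instances placed on one GPU might look like the sketch below. The `instance_group` block is illustrative and not part of this diff; only the `ENABLE_WEIGHT_SHARING` parameter documented above is introduced by this change.

```
instance_group [
  {
    count: 2
    kind: KIND_GPU
    gpus: [ 0 ]
  }
]
parameters: {
  key: "ENABLE_WEIGHT_SHARING"
  value: {
    string_value: "true"
  }
}
```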

src/libtorch.cc

Lines changed: 63 additions & 4 deletions
@@ -77,7 +77,7 @@ class ModelState : public BackendModel {
   TRITONSERVER_Error* LoadModel(
       const std::string& artifact_name, const torch::Device device,
       std::string* model_path,
-      std::unique_ptr<torch::jit::script::Module>* torch_model);
+      std::shared_ptr<torch::jit::script::Module>* torch_model);
 
   bool EnabledOptimizedExecution() { return enable_optimized_execution_; }
   const std::pair<bool, bool>& EnabledTensorExprFuser() const
@@ -98,6 +98,8 @@ class ModelState : public BackendModel {
     return enable_nvfuser_pair_;
   }
 
+  bool EnabledWeightSharing() { return enable_weight_sharing_; }
+
  private:
   ModelState(TRITONBACKEND_Model* triton_model);
   TRITONSERVER_Error* AutoCompleteConfig();
@@ -111,6 +113,9 @@ class ModelState : public BackendModel {
   // Flag to indicate whether inference mode is enabled. Defaults to false.
   bool enable_inference_mode_;
 
+  // Flag to indicate whether weight sharing is enabled. Defaults to false.
+  bool enable_weight_sharing_;
+
   // Flag pairs to indicate if various JIT settings are set and
   // enabled respectively. Defaults to (false, true). Default behavior
   // is to do nothing if not explicitly set. Tensor fuser flag is
@@ -122,6 +127,12 @@ class ModelState : public BackendModel {
   // Flag pair to indicate whether nvfuser is set and enabled respectively.
   // Defaults to (false, false).
   std::pair<bool, bool> enable_nvfuser_pair_;
+
+  // Model mapping for shared TorchScript model across all instances on the
+  // same device. The key is a pair of isGPU and device index.
+  std::map<
+      std::pair<bool, int64_t>, std::shared_ptr<torch::jit::script::Module>>
+      torch_models_;
 };
 
 TRITONSERVER_Error*
@@ -161,7 +172,8 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
 
 ModelState::ModelState(TRITONBACKEND_Model* triton_model)
     : BackendModel(triton_model), enable_optimized_execution_(true),
-      enable_inference_mode_(false), enable_tensor_fuser_pair_({false, true}),
+      enable_inference_mode_(false), enable_weight_sharing_(false),
+      enable_tensor_fuser_pair_({false, true}),
       enable_jit_profiling_pair_({false, true}),
       enable_jit_executor_pair_({false, true}),
       enable_nvfuser_pair_({false, false})
@@ -172,7 +184,7 @@ TRITONSERVER_Error*
 ModelState::LoadModel(
     const std::string& artifact_name, const torch::Device device,
     std::string* model_path,
-    std::unique_ptr<torch::jit::script::Module>* torch_model)
+    std::shared_ptr<torch::jit::script::Module>* torch_model)
 {
   // Find the TorchScript file that describes the model. If the model
   // configuration doesn't have an explicit model file specified then
@@ -194,6 +206,23 @@ ModelState::LoadModel(
         "' for model instance '" + Name() + "'");
   }
 
+  // If weight sharing is enabled, skip loading model if
+  // it is already available on the target device
+  std::pair<bool, int> device_pair;
+  if (enable_weight_sharing_) {
+    device_pair = std::make_pair(!device.is_cpu(), device.index());
+    auto mit = torch_models_.find(device_pair);
+    if (mit != torch_models_.end()) {
+      *torch_model = mit->second;
+      LOG_MESSAGE(
+          TRITONSERVER_LOG_INFO,
+          (std::string("Reusing TorchScript model for instance '") + Name() +
+           "'")
+              .c_str());
+      return nullptr; // success
+    }
+  }
+
   // Serialize the torch model to string
   std::string model_data_str;
   RETURN_IF_ERROR(ReadTextFile(*model_path, &model_data_str));
@@ -213,6 +242,17 @@ ModelState::LoadModel(
         ("failed to load model '" + Name() + "': " + ex.what()).c_str());
   }
 
+  if (enable_weight_sharing_) {
+    if (!((torch_models_.emplace(device_pair, *torch_model)).second)) {
+      std::string type = device.is_cpu() ? "CPU" : "GPU";
+      LOG_MESSAGE(
+          TRITONSERVER_LOG_WARN,
+          (std::string("Model already found on target ") + type + " device " +
+           "(id " + std::to_string(device.index()) + ") for '" + Name() + "'")
+              .c_str());
+    }
+  }
+
   return nullptr; // success
 }
 
@@ -295,6 +335,25 @@ ModelState::ParseParameters()
             .c_str());
   }
 
+  // If 'ENABLE_WEIGHT_SHARING' is not present in 'parameters' then no
+  // update is made to 'enable_weight_sharing'.
+  err = ParseParameter(
+      params, "ENABLE_WEIGHT_SHARING", &enable_weight_sharing_);
+  if (err != nullptr) {
+    if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) {
+      return err;
+    } else {
+      TRITONSERVER_ErrorDelete(err);
+    }
+  } else {
+    LOG_MESSAGE(
+        TRITONSERVER_LOG_INFO,
+        (std::string("Weight sharing is ") +
+         (enable_weight_sharing_ ? "enabled" : "disabled") +
+         " for model instance '" + Name() + "'")
+            .c_str());
+  }
+
   // If 'ENABLE_JIT_PROFILING' is not present in 'parameters' then no update
   // is made to 'enable_jit_profiling'.
   bool enable_jit_profiling = false;
@@ -419,7 +478,7 @@ class ModelInstanceState : public BackendModelInstance {
   // The full path to the TorchScript model file.
   std::string model_path_;
 
-  std::unique_ptr<torch::jit::script::Module> torch_model_;
+  std::shared_ptr<torch::jit::script::Module> torch_model_;
   torch::Device device_;
 
   // Map from configuration name for an input to the index of
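
The core of this change is a per-device cache of `std::shared_ptr` modules keyed by `(isGPU, device index)`, consulted in `ModelState::LoadModel`. The standalone sketch below illustrates that lookup-or-load pattern; the names (`WeightSharingCache`, `GetOrLoad`, the stand-in `Module` type) are hypothetical and are not the backend's API.

```
// Illustrative sketch only: a per-device cache so that instances on the same
// device share one loaded module. "Module" stands in for
// torch::jit::script::Module; names are hypothetical, not Triton's API.
#include <cstdint>
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <utility>

struct Module {};  // stand-in for torch::jit::script::Module

class WeightSharingCache {
 public:
  // Return the cached module for (is_gpu, device_index) if present;
  // otherwise call 'loader' once, cache the result, and return it.
  std::shared_ptr<Module> GetOrLoad(
      bool is_gpu, int64_t device_index,
      const std::function<std::shared_ptr<Module>()>& loader)
  {
    const auto key = std::make_pair(is_gpu, device_index);
    auto it = cache_.find(key);
    if (it != cache_.end()) {
      return it->second;  // reuse: later instances share these weights
    }
    auto model = loader();       // load once per device
    cache_.emplace(key, model);  // remember it for the next instance
    return model;
  }

 private:
  // Key mirrors the commit's map: (isGPU, device index) -> shared module.
  std::map<std::pair<bool, int64_t>, std::shared_ptr<Module>> cache_;
};

int main()
{
  WeightSharingCache cache;
  auto load = [] { return std::make_shared<Module>(); };

  auto a = cache.GetOrLoad(true, 0, load);  // first instance on GPU 0: loads
  auto b = cache.GetOrLoad(true, 0, load);  // second instance on GPU 0: reuses
  std::cout << "shared: " << (a == b) << "\n";  // prints "shared: 1"
  return 0;
}
```

Because the cache holds `shared_ptr` values, a loaded module stays alive while any instance still references it; the commit message's note "Let destructor remove last shared pointer" suggests the backend relies on instance destruction to release the final reference.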
