
Commit 71ab29d

jeffra and samyam authored
remove distributed requirement from model building (deepspeedai#31)
* remove distributed requirement from model building; this is needed for MPI/AML support for this model

* Update modelingpreln.py: removed the print, which would otherwise fire many times per run once the rank check was removed

Co-authored-by: Samyam Rajbhandari <[email protected]>
1 parent 9e2c34e commit 71ab29d
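The print removal follows from dropping the rank check: once torch.distributed is no longer guaranteed to be initialized at model-build time, get_rank() cannot be called, and an unguarded print would fire once per process. A minimal sketch of an alternative that prints once without requiring an initialized process group (helper name is illustrative, not part of this commit):

import os

def print_rank_0(msg):
    # Read the launcher-provided RANK env var instead of calling
    # torch.distributed.get_rank(), which requires an initialized
    # process group; defaults to rank 0 for single-process runs.
    if int(os.environ.get("RANK", "0")) == 0:
        print(msg)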


2 files changed: +3 −7 lines


bing_bert/deepspeed_train.py

Lines changed: 0 additions & 3 deletions
@@ -368,9 +368,6 @@ def prepare_optimizer_parameters(args, model):
 
 
 def prepare_model_optimizer(args):
-    # Initialize torch distributed
-    torch.distributed.init_process_group(backend="nccl")
-
     # Loading Model
     model = BertMultiTask(args)
 
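With the hard-coded NCCL initialization gone, model construction no longer assumes a torch.distributed launcher, which is what allows MPI/AML-launched jobs to build the model before (or without) a process group. Callers that still need distributed training can initialize it outside model building; a minimal sketch of such a guard (helper name is hypothetical, not from this commit):

import torch

def init_distributed_if_needed(backend="nccl"):
    # Initialize torch.distributed only when it is available and no
    # process group has been created yet (e.g. by the training engine
    # or launcher); single-process runs skip this entirely.
    if torch.distributed.is_available() and not torch.distributed.is_initialized():
        torch.distributed.init_process_group(backend=backend)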

bing_bert/nvidia/modelingpreln.py

Lines changed: 3 additions & 4 deletions
@@ -739,10 +739,9 @@ def init_bert_weights(self, module):
             num_layers = self.config.num_hidden_layers
             std = self.config.initializer_range
             if hasattr(module, 'bert_output_layer'):
-                if torch.distributed.get_rank() == 0:
-                    print("Accounting for accumulation on the residual path")
-                std = self.config.initializer_range / math.sqrt(
-                    2.0 * num_layers)
+                #print("Accounting for accumulation on the residual path")
+                std = self.config.initializer_range / math.sqrt(
+                    2.0 * num_layers)
             module.weight.data.normal_(mean=0.0, std=std)
         elif isinstance(module, BertLayerNorm):
             module.bias.data.zero_()
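The surviving logic is the depth-aware initialization: modules flagged as bert_output_layer feed the residual stream, so their weights are drawn with a smaller standard deviation to keep the accumulated variance bounded as depth grows. A standalone sketch of the same computation (function name and usage are illustrative, not from the commit):

import math
import torch.nn as nn

def scaled_init_std(initializer_range, num_hidden_layers):
    # Account for accumulation on the residual path: shrink the init
    # std by sqrt(2.0 * num_layers), since each transformer layer adds
    # two sublayer outputs (attention and MLP) into the residual stream.
    return initializer_range / math.sqrt(2.0 * num_hidden_layers)

# Illustrative usage on an output projection:
proj = nn.Linear(768, 768)
proj.weight.data.normal_(mean=0.0, std=scaled_init_std(0.02, 24))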
