From 2c9ce4c52b4468266a9489992afd8ffc2ea282a1 Mon Sep 17 00:00:00 2001
From: Justin Johnson
Date: Wed, 24 Oct 2018 12:51:21 -0700
Subject: [PATCH 1/4] Make backward static in custom function example

---
 README.md                                 | 1 +
 autograd/two_layer_net_custom_function.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 07d406d..0978a09 100644
--- a/README.md
+++ b/README.md
@@ -266,6 +266,7 @@ class MyReLU(torch.autograd.Function):
     ctx.save_for_backward(x)
     return x.clamp(min=0)
 
+  @staticmethod
   def backward(ctx, grad_output):
     """
     In the backward pass we receive the context object and a Tensor containing
diff --git a/autograd/two_layer_net_custom_function.py b/autograd/two_layer_net_custom_function.py
index 79cf08e..6c768d1 100644
--- a/autograd/two_layer_net_custom_function.py
+++ b/autograd/two_layer_net_custom_function.py
@@ -27,6 +27,7 @@ def forward(ctx, x):
     ctx.save_for_backward(x)
     return x.clamp(min=0)
 
+  @staticmethod
   def backward(ctx, grad_output):
     """
     In the backward pass we receive the context object and a Tensor containing

From 3a4ef4fac51df8054c8e986a5030363fe6482aba Mon Sep 17 00:00:00 2001
From: Justin Johnson
Date: Wed, 24 Oct 2018 12:59:11 -0700
Subject: [PATCH 2/4] Fix incorrect "with respect to" comments about gradients

---
 autograd/two_layer_net_autograd.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/autograd/two_layer_net_autograd.py b/autograd/two_layer_net_autograd.py
index 31d6c34..2a5bb7f 100644
--- a/autograd/two_layer_net_autograd.py
+++ b/autograd/two_layer_net_autograd.py
@@ -10,10 +10,10 @@
 When we create a PyTorch Tensor with requires_grad=True, then operations
 involving that Tensor will not just compute values; they will also build up
 a computational graph in the background, allowing us to easily backpropagate
-through the graph to compute gradients of some Tensors with respect to a
-downstream loss. Concretely if x is a Tensor with x.requires_grad == True then
-after backpropagation x.grad will be another Tensor holding the gradient of x
-with respect to some scalar value.
+through the graph to compute gradients of some downstream (scalar) loss with
+respect to a Tensor. Concretely if x is a Tensor with x.requires_grad == True
+then after backpropagation x.grad will be another Tensor holding the gradient
+of x with respect to some scalar value.
 """
 
 device = torch.device('cpu')

From 73a662bbe9fce257ec8c3cdbb44a8112a42d39f3 Mon Sep 17 00:00:00 2001
From: Justin Johnson
Date: Wed, 24 Oct 2018 13:30:16 -0700
Subject: [PATCH 3/4] Change size_average to reduction, and make a note about sums vs means in MSELoss

---
 README.md                  | 14 +++++++++-----
 nn/dynamic_net.py          |  2 +-
 nn/two_layer_net_module.py |  2 +-
 nn/two_layer_net_nn.py     |  8 ++++++--
 nn/two_layer_net_optim.py  |  2 +-
 5 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 0978a09..8394c2f 100644
--- a/README.md
+++ b/README.md
@@ -464,8 +464,12 @@ model = torch.nn.Sequential(
 ).to(device)
 
 # The nn package also contains definitions of popular loss functions; in this
-# case we will use Mean Squared Error (MSE) as our loss function.
-loss_fn = torch.nn.MSELoss(size_average=False)
+# case we will use Mean Squared Error (MSE) as our loss function. Setting
+# reduction='sum' means that we are computing the *sum* of squared errors rather
+# than the mean; this is for consistency with the examples above where we
+# manually compute the loss, but in practice it is more common to use mean
+# squared error as a loss by setting reduction='elementwise_mean'.
+loss_fn = torch.nn.MSELoss(reduction='sum')
 
 learning_rate = 1e-4
 for t in range(500):
@@ -528,7 +532,7 @@ model = torch.nn.Sequential(
     torch.nn.ReLU(),
     torch.nn.Linear(H, D_out),
 )
-loss_fn = torch.nn.MSELoss(size_average=False)
+loss_fn = torch.nn.MSELoss(reduction='sum')
 
 # Use the optim package to define an Optimizer that will update the weights of
 # the model for us. Here we will use Adam; the optim package contains many other
@@ -603,7 +607,7 @@ model = TwoLayerNet(D_in, H, D_out)
 # Construct our loss function and an Optimizer. The call to model.parameters()
 # in the SGD constructor will contain the learnable parameters of the two
 # nn.Linear modules which are members of the model.
-loss_fn = torch.nn.MSELoss(size_average=False)
+loss_fn = torch.nn.MSELoss(reduction='sum')
 optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
 for t in range(500):
   # Forward pass: Compute predicted y by passing x to the model
@@ -683,7 +687,7 @@ model = DynamicNet(D_in, H, D_out)
 
 # Construct our loss function and an Optimizer. Training this strange model with
 # vanilla stochastic gradient descent is tough, so we use momentum
-criterion = torch.nn.MSELoss(size_average=False)
+criterion = torch.nn.MSELoss(reduction='sum')
 optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
 for t in range(500):
   # Forward pass: Compute predicted y by passing x to the model
diff --git a/nn/dynamic_net.py b/nn/dynamic_net.py
index a104c83..ce4b4a0 100644
--- a/nn/dynamic_net.py
+++ b/nn/dynamic_net.py
@@ -53,7 +53,7 @@ def forward(self, x):
 
 # Construct our loss function and an Optimizer. Training this strange model with
 # vanilla stochastic gradient descent is tough, so we use momentum
-criterion = torch.nn.MSELoss(size_average=False)
+criterion = torch.nn.MSELoss(reduction='sum')
 optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
 for t in range(500):
   # Forward pass: Compute predicted y by passing x to the model
diff --git a/nn/two_layer_net_module.py b/nn/two_layer_net_module.py
index bf41b29..e86127e 100644
--- a/nn/two_layer_net_module.py
+++ b/nn/two_layer_net_module.py
@@ -43,7 +43,7 @@ def forward(self, x):
 # Construct our loss function and an Optimizer. The call to model.parameters()
 # in the SGD constructor will contain the learnable parameters of the two
 # nn.Linear modules which are members of the model.
-loss_fn = torch.nn.MSELoss(size_average=False)
+loss_fn = torch.nn.MSELoss(reduction='sum')
 optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
 for t in range(500):
   # Forward pass: Compute predicted y by passing x to the model
diff --git a/nn/two_layer_net_nn.py b/nn/two_layer_net_nn.py
index f151cb9..ec4f897 100644
--- a/nn/two_layer_net_nn.py
+++ b/nn/two_layer_net_nn.py
@@ -36,8 +36,12 @@
 ).to(device)
 
 # The nn package also contains definitions of popular loss functions; in this
-# case we will use Mean Squared Error (MSE) as our loss function.
-loss_fn = torch.nn.MSELoss(size_average=False)
+# case we will use Mean Squared Error (MSE) as our loss function. Setting
+# reduction='sum' means that we are computing the *sum* of squared errors rather
+# than the mean; this is for consistency with the examples above where we
+# manually compute the loss, but in practice it is more common to use mean
+# squared error as a loss by setting reduction='elementwise_mean'.
+loss_fn = torch.nn.MSELoss(reduction='sum')
 
 learning_rate = 1e-4
 for t in range(500):
diff --git a/nn/two_layer_net_optim.py b/nn/two_layer_net_optim.py
index fcff856..84a7f2e 100644
--- a/nn/two_layer_net_optim.py
+++ b/nn/two_layer_net_optim.py
@@ -26,7 +26,7 @@
     torch.nn.ReLU(),
     torch.nn.Linear(H, D_out),
 )
-loss_fn = torch.nn.MSELoss(size_average=False)
+loss_fn = torch.nn.MSELoss(reduction='sum')
 
 # Use the optim package to define an Optimizer that will update the weights of
 # the model for us. Here we will use Adam; the optim package contains many other

From 29e58eb4fda9f4188bf1d3da2c789cc81f0d3e27 Mon Sep 17 00:00:00 2001
From: Aman Goyal
Date: Tue, 2 Jul 2019 03:38:00 +0530
Subject: [PATCH 4/4] Typo

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8394c2f..65ffc4e 100644
--- a/README.md
+++ b/README.md
@@ -536,7 +536,7 @@ loss_fn = torch.nn.MSELoss(reduction='sum')
 
 # Use the optim package to define an Optimizer that will update the weights of
 # the model for us. Here we will use Adam; the optim package contains many other
-# optimization algoriths. The first argument to the Adam constructor tells the
+# optimization algorithms. The first argument to the Adam constructor tells the
 # optimizer which Tensors it should update.
 learning_rate = 1e-4
 optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
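Taken together, the patches above move the examples to a custom autograd Function whose forward and backward are both declared @staticmethod, trained against torch.nn.MSELoss(reduction='sum'). The following is a rough, illustrative sketch of that pattern (not part of the patches; tensor sizes mirror the repository's two-layer-net examples, and in most real training code the default mean reduction is preferable to the sum used here):

import torch

class MyReLU(torch.autograd.Function):
  # Both methods are static, so the Function is stateless and is
  # invoked through MyReLU.apply rather than by instantiating it.
  @staticmethod
  def forward(ctx, x):
    ctx.save_for_backward(x)
    return x.clamp(min=0)

  @staticmethod
  def backward(ctx, grad_output):
    # grad_output holds the gradient of the loss with respect to the output;
    # pass it through only where the input was positive.
    x, = ctx.saved_tensors
    grad_x = grad_output.clone()
    grad_x[x < 0] = 0
    return grad_x

N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)

# reduction='sum' gives the sum of squared errors, matching the manually
# computed losses in the earlier examples; the default reduction would
# average over elements instead.
loss_fn = torch.nn.MSELoss(reduction='sum')

y_pred = MyReLU.apply(x.mm(w1)).mm(w2)
loss = loss_fn(y_pred, y)
loss.backward()  # w1.grad and w2.grad now hold gradients of the loss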