From 2c9ce4c52b4468266a9489992afd8ffc2ea282a1 Mon Sep 17 00:00:00 2001
From: Justin Johnson
Date: Wed, 24 Oct 2018 12:51:21 -0700
Subject: [PATCH 1/4] Make backward static in custom function example

---
 README.md                                 | 1 +
 autograd/two_layer_net_custom_function.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 07d406d..0978a09 100644
--- a/README.md
+++ b/README.md
@@ -266,6 +266,7 @@ class MyReLU(torch.autograd.Function):
     ctx.save_for_backward(x)
     return x.clamp(min=0)
 
+  @staticmethod
   def backward(ctx, grad_output):
     """
     In the backward pass we receive the context object and a Tensor containing
diff --git a/autograd/two_layer_net_custom_function.py b/autograd/two_layer_net_custom_function.py
index 79cf08e..6c768d1 100644
--- a/autograd/two_layer_net_custom_function.py
+++ b/autograd/two_layer_net_custom_function.py
@@ -27,6 +27,7 @@ def forward(ctx, x):
     ctx.save_for_backward(x)
     return x.clamp(min=0)
 
+  @staticmethod
   def backward(ctx, grad_output):
     """
     In the backward pass we receive the context object and a Tensor containing

From 3a4ef4fac51df8054c8e986a5030363fe6482aba Mon Sep 17 00:00:00 2001
From: Justin Johnson
Date: Wed, 24 Oct 2018 12:59:11 -0700
Subject: [PATCH 2/4] Fix incorrect "with respect to" comments about gradients

---
 autograd/two_layer_net_autograd.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/autograd/two_layer_net_autograd.py b/autograd/two_layer_net_autograd.py
index 31d6c34..2a5bb7f 100644
--- a/autograd/two_layer_net_autograd.py
+++ b/autograd/two_layer_net_autograd.py
@@ -10,10 +10,10 @@
 When we create a PyTorch Tensor with requires_grad=True, then operations
 involving that Tensor will not just compute values; they will also build up
 a computational graph in the background, allowing us to easily backpropagate
-through the graph to compute gradients of some Tensors with respect to a
-downstream loss. Concretely if x is a Tensor with x.requires_grad == True then
-after backpropagation x.grad will be another Tensor holding the gradient of x
-with respect to some scalar value.
+through the graph to compute gradients of some downstream (scalar) loss with
+respect to a Tensor. Concretely if x is a Tensor with x.requires_grad == True
+then after backpropagation x.grad will be another Tensor holding the gradient
+of x with respect to some scalar value.
 """
 
 device = torch.device('cpu')

From 73a662bbe9fce257ec8c3cdbb44a8112a42d39f3 Mon Sep 17 00:00:00 2001
From: Justin Johnson
Date: Wed, 24 Oct 2018 13:30:16 -0700
Subject: [PATCH 3/4] Change size_average to reduction, and make a note about sums vs means in MSELoss

---
 README.md                  | 14 +++++++++-----
 nn/dynamic_net.py          |  2 +-
 nn/two_layer_net_module.py |  2 +-
 nn/two_layer_net_nn.py     |  8 ++++++--
 nn/two_layer_net_optim.py  |  2 +-
 5 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 0978a09..8394c2f 100644
--- a/README.md
+++ b/README.md
@@ -464,8 +464,12 @@ model = torch.nn.Sequential(
 ).to(device)
 
 # The nn package also contains definitions of popular loss functions; in this
-# case we will use Mean Squared Error (MSE) as our loss function.
-loss_fn = torch.nn.MSELoss(size_average=False)
+# case we will use Mean Squared Error (MSE) as our loss function. Setting
+# reduction='sum' means that we are computing the *sum* of squared errors rather
+# than the mean; this is for consistency with the examples above where we
+# manually compute the loss, but in practice it is more common to use mean
+# squared error as a loss by setting reduction='elementwise_mean'.
+loss_fn = torch.nn.MSELoss(reduction='sum')
 
 learning_rate = 1e-4
 for t in range(500):
@@ -528,7 +532,7 @@ model = torch.nn.Sequential(
     torch.nn.ReLU(),
     torch.nn.Linear(H, D_out),
 )
-loss_fn = torch.nn.MSELoss(size_average=False)
+loss_fn = torch.nn.MSELoss(reduction='sum')
 
 # Use the optim package to define an Optimizer that will update the weights of
 # the model for us. Here we will use Adam; the optim package contains many other
@@ -603,7 +607,7 @@ model = TwoLayerNet(D_in, H, D_out)
 # Construct our loss function and an Optimizer. The call to model.parameters()
 # in the SGD constructor will contain the learnable parameters of the two
 # nn.Linear modules which are members of the model.
-loss_fn = torch.nn.MSELoss(size_average=False)
+loss_fn = torch.nn.MSELoss(reduction='sum')
 optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
 for t in range(500):
   # Forward pass: Compute predicted y by passing x to the model
@@ -683,7 +687,7 @@ model = DynamicNet(D_in, H, D_out)
 
 # Construct our loss function and an Optimizer. Training this strange model with
 # vanilla stochastic gradient descent is tough, so we use momentum
-criterion = torch.nn.MSELoss(size_average=False)
+criterion = torch.nn.MSELoss(reduction='sum')
 optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
 for t in range(500):
   # Forward pass: Compute predicted y by passing x to the model
diff --git a/nn/dynamic_net.py b/nn/dynamic_net.py
index a104c83..ce4b4a0 100644
--- a/nn/dynamic_net.py
+++ b/nn/dynamic_net.py
@@ -53,7 +53,7 @@ def forward(self, x):
 
 # Construct our loss function and an Optimizer. Training this strange model with
 # vanilla stochastic gradient descent is tough, so we use momentum
-criterion = torch.nn.MSELoss(size_average=False)
+criterion = torch.nn.MSELoss(reduction='sum')
 optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
 for t in range(500):
   # Forward pass: Compute predicted y by passing x to the model
diff --git a/nn/two_layer_net_module.py b/nn/two_layer_net_module.py
index bf41b29..e86127e 100644
--- a/nn/two_layer_net_module.py
+++ b/nn/two_layer_net_module.py
@@ -43,7 +43,7 @@ def forward(self, x):
 # Construct our loss function and an Optimizer. The call to model.parameters()
 # in the SGD constructor will contain the learnable parameters of the two
 # nn.Linear modules which are members of the model.
-loss_fn = torch.nn.MSELoss(size_average=False)
+loss_fn = torch.nn.MSELoss(reduction='sum')
 optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
 for t in range(500):
   # Forward pass: Compute predicted y by passing x to the model
diff --git a/nn/two_layer_net_nn.py b/nn/two_layer_net_nn.py
index f151cb9..ec4f897 100644
--- a/nn/two_layer_net_nn.py
+++ b/nn/two_layer_net_nn.py
@@ -36,8 +36,12 @@
 ).to(device)
 
 # The nn package also contains definitions of popular loss functions; in this
-# case we will use Mean Squared Error (MSE) as our loss function.
-loss_fn = torch.nn.MSELoss(size_average=False)
+# case we will use Mean Squared Error (MSE) as our loss function. Setting
+# reduction='sum' means that we are computing the *sum* of squared errors rather
+# than the mean; this is for consistency with the examples above where we
+# manually compute the loss, but in practice it is more common to use mean
+# squared error as a loss by setting reduction='elementwise_mean'.
+loss_fn = torch.nn.MSELoss(reduction='sum')
 
 learning_rate = 1e-4
 for t in range(500):
diff --git a/nn/two_layer_net_optim.py b/nn/two_layer_net_optim.py
index fcff856..84a7f2e 100644
--- a/nn/two_layer_net_optim.py
+++ b/nn/two_layer_net_optim.py
@@ -26,7 +26,7 @@
     torch.nn.ReLU(),
     torch.nn.Linear(H, D_out),
 )
-loss_fn = torch.nn.MSELoss(size_average=False)
+loss_fn = torch.nn.MSELoss(reduction='sum')
 
 # Use the optim package to define an Optimizer that will update the weights of
 # the model for us. Here we will use Adam; the optim package contains many other

From 29e58eb4fda9f4188bf1d3da2c789cc81f0d3e27 Mon Sep 17 00:00:00 2001
From: Aman Goyal
Date: Tue, 2 Jul 2019 03:38:00 +0530
Subject: [PATCH 4/4] Typo

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8394c2f..65ffc4e 100644
--- a/README.md
+++ b/README.md
@@ -536,7 +536,7 @@ loss_fn = torch.nn.MSELoss(reduction='sum')
 
 # Use the optim package to define an Optimizer that will update the weights of
 # the model for us. Here we will use Adam; the optim package contains many other
-# optimization algoriths. The first argument to the Adam constructor tells the
+# optimization algorithms. The first argument to the Adam constructor tells the
 # optimizer which Tensors it should update.
 learning_rate = 1e-4
 optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
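Taken together, the patches above move the examples to a custom autograd Function whose forward and backward are both declared @staticmethod, trained against torch.nn.MSELoss(reduction='sum'). The following is a rough, illustrative sketch of that pattern (not part of the patches; tensor sizes mirror the repository's two-layer-net examples, and in most real training code the default mean reduction is preferable to the sum used here):

import torch

class MyReLU(torch.autograd.Function):
  # Both methods are static, so the Function is stateless and is
  # invoked through MyReLU.apply rather than by instantiating it.
  @staticmethod
  def forward(ctx, x):
    ctx.save_for_backward(x)
    return x.clamp(min=0)

  @staticmethod
  def backward(ctx, grad_output):
    # grad_output holds the gradient of the loss with respect to the output;
    # pass it through only where the input was positive.
    x, = ctx.saved_tensors
    grad_x = grad_output.clone()
    grad_x[x < 0] = 0
    return grad_x

N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)

# reduction='sum' gives the sum of squared errors, matching the manually
# computed losses in the earlier examples; the default reduction would
# average over elements instead.
loss_fn = torch.nn.MSELoss(reduction='sum')

y_pred = MyReLU.apply(x.mm(w1)).mm(w2)
loss = loss_fn(y_pred, y)
loss.backward()  # w1.grad and w2.grad now hold gradients of the loss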