import torch
-from torch.autograd import Variable

"""
A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch
-Variables, and uses PyTorch autograd to compute gradients.
+Tensors, and uses PyTorch autograd to compute gradients.

-A PyTorch Variable is a wrapper around a PyTorch Tensor, and represents a node
-in a computational graph. If x is a Variable then x.data is a Tensor giving its
-value, and x.grad is another Variable holding the gradient of x with respect to
-some scalar value.
-
-PyTorch Variables have the same API as PyTorch tensors: (almost) any operation
-you can do on a Tensor you can also do on a Variable; the difference is that
-autograd allows you to automatically compute gradients.
+When we create a PyTorch Tensor with requires_grad=True, then operations
+involving that Tensor will not just compute values; they will also build up
+a computational graph in the background, allowing us to easily backpropagate
+through the graph to compute gradients of some Tensors with respect to a
+downstream loss. Concretely if x is a Tensor with x.requires_grad == True then
+after backpropagation x.grad will be another Tensor holding the gradient of x
+with respect to some scalar value.
"""

-dtype = torch.FloatTensor
-# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU
+device = torch.device('cpu')
+# device = torch.device('cuda') # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

-# Create random Tensors to hold input and outputs, and wrap them in Variables.
-# Setting requires_grad=False indicates that we do not need to compute gradients
-# with respect to these Variables during the backward pass.
-x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
-y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)
+# Create random Tensors to hold input and outputs
+x = torch.randn(N, D_in, device=device)
+y = torch.randn(N, D_out, device=device)

-# Create random Tensors for weights, and wrap them in Variables.
-# Setting requires_grad=True indicates that we want to compute gradients with
-# respect to these Variables during the backward pass.
-w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
-w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)
+# Create random Tensors for weights; setting requires_grad=True means that we
+# want to compute gradients for these Tensors during the backward pass.
+w1 = torch.randn(D_in, H, device=device, requires_grad=True)
+w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
-    # Forward pass: compute predicted y using operations on Variables; these
-    # are exactly the same operations we used to compute the forward pass using
-    # Tensors, but we do not need to keep references to intermediate values since
-    # we are not implementing the backward pass by hand.
+    # Forward pass: compute predicted y using operations on Tensors. Since w1 and
+    # w2 have requires_grad=True, operations involving these Tensors will cause
+    # PyTorch to build a computational graph, allowing automatic computation of
+    # gradients. Since we are no longer implementing the backward pass by hand we
+    # don't need to keep references to intermediate values.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

-    # Compute and print loss using operations on Variables.
-    # Now loss is a Variable of shape (1,) and loss.data is a Tensor of shape
-    # (1,); loss.data[0] is a scalar value holding the loss.
+    # Compute and print loss. Loss is a Tensor of shape (), and loss.item()
+    # is a Python number giving its value.
    loss = (y_pred - y).pow(2).sum()
-    print(t, loss.data[0])
+    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
-    # gradient of loss with respect to all Variables with requires_grad=True.
-    # After this call w1.grad and w2.grad will be Variables holding the gradient
+    # gradient of loss with respect to all Tensors with requires_grad=True.
+    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

-    # Update weights using gradient descent; w1.data and w2.data are Tensors,
-    # w1.grad and w2.grad are Variables and w1.grad.data and w2.grad.data are
-    # Tensors.
-    w1.data -= learning_rate * w1.grad.data
-    w2.data -= learning_rate * w2.grad.data
+    # Update weights using gradient descent. For this step we just want to mutate
+    # the values of w1 and w2 in-place; we don't want to build up a computational
+    # graph for the update steps, so we use the torch.no_grad() context manager
+    # to prevent PyTorch from building a computational graph for the updates
+    with torch.no_grad():
+        w1 -= learning_rate * w1.grad
+        w2 -= learning_rate * w2.grad

-    # Manually zero the gradients after running the backward pass
-    w1.grad.data.zero_()
-    w2.grad.data.zero_()
+    # Manually zero the gradients after running the backward pass
+    w1.grad.zero_()
+    w2.grad.zero_()
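As a quick illustration of the Tensor autograd behavior that the updated docstring and comments describe, here is a minimal sketch (assuming PyTorch 0.4 or later; the toy values and variable names are illustrative and not part of this commit):

import torch

# A scalar toy example: z = sum(w * x), so dz/dw = x.
x = torch.tensor([1.0, 2.0, 3.0])
w = torch.tensor([0.5, 0.5, 0.5], requires_grad=True)

z = (w * x).sum()   # builds a graph because w has requires_grad=True
z.backward()        # populates w.grad
print(w.grad)       # tensor([1., 2., 3.]) -- the gradient of z w.r.t. w

# Weight updates should not be tracked by autograd, so wrap them in no_grad()
with torch.no_grad():
    w -= 0.1 * w.grad
w.grad.zero_()      # clear the gradient before the next backward pass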