import torch
-from torch.autograd import Variable

"""
A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch
-Variables, and uses PyTorch autograd to compute gradients.
+Tensors, and uses PyTorch autograd to compute gradients.

-A PyTorch Variable is a wrapper around a PyTorch Tensor, and represents a node
-in a computational graph. If x is a Variable then x.data is a Tensor giving its
-value, and x.grad is another Variable holding the gradient of x with respect to
-some scalar value.
-
-PyTorch Variables have the same API as PyTorch tensors: (almost) any operation
-you can do on a Tensor you can also do on a Variable; the difference is that
-autograd allows you to automatically compute gradients.
+When we create a PyTorch Tensor with requires_grad=True, then operations
+involving that Tensor will not just compute values; they will also build up
+a computational graph in the background, allowing us to easily backpropagate
+through the graph to compute gradients of some Tensors with respect to a
+downstream loss. Concretely if x is a Tensor with x.requires_grad == True then
+after backpropagation x.grad will be another Tensor holding the gradient of x
+with respect to some scalar value.
"""

-dtype = torch.FloatTensor
-# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU
+device = torch.device('cpu')
+# device = torch.device('cuda') # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

-# Create random Tensors to hold input and outputs, and wrap them in Variables.
-# Setting requires_grad=False indicates that we do not need to compute gradients
-# with respect to these Variables during the backward pass.
-x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
-y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)
+# Create random Tensors to hold input and outputs
+x = torch.randn(N, D_in, device=device)
+y = torch.randn(N, D_out, device=device)

-# Create random Tensors for weights, and wrap them in Variables.
-# Setting requires_grad=True indicates that we want to compute gradients with
-# respect to these Variables during the backward pass.
-w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
-w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)
+# Create random Tensors for weights; setting requires_grad=True means that we
+# want to compute gradients for these Tensors during the backward pass.
+w1 = torch.randn(D_in, H, device=device, requires_grad=True)
+w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
-  # Forward pass: compute predicted y using operations on Variables; these
-  # are exactly the same operations we used to compute the forward pass using
-  # Tensors, but we do not need to keep references to intermediate values since
-  # we are not implementing the backward pass by hand.
+  # Forward pass: compute predicted y using operations on Tensors. Since w1 and
+  # w2 have requires_grad=True, operations involving these Tensors will cause
+  # PyTorch to build a computational graph, allowing automatic computation of
+  # gradients. Since we are no longer implementing the backward pass by hand we
+  # don't need to keep references to intermediate values.
  y_pred = x.mm(w1).clamp(min=0).mm(w2)

-  # Compute and print loss using operations on Variables.
-  # Now loss is a Variable of shape (1,) and loss.data is a Tensor of shape
-  # (1,); loss.data[0] is a scalar value holding the loss.
+  # Compute and print loss. Loss is a Tensor of shape (), and loss.item()
+  # is a Python number giving its value.
  loss = (y_pred - y).pow(2).sum()
-  print(t, loss.data[0])
+  print(t, loss.item())

  # Use autograd to compute the backward pass. This call will compute the
-  # gradient of loss with respect to all Variables with requires_grad=True.
-  # After this call w1.grad and w2.grad will be Variables holding the gradient
+  # gradient of loss with respect to all Tensors with requires_grad=True.
+  # After this call w1.grad and w2.grad will be Tensors holding the gradient
  # of the loss with respect to w1 and w2 respectively.
  loss.backward()

-  # Update weights using gradient descent; w1.data and w2.data are Tensors,
-  # w1.grad and w2.grad are Variables and w1.grad.data and w2.grad.data are
-  # Tensors.
-  w1.data -= learning_rate * w1.grad.data
-  w2.data -= learning_rate * w2.grad.data
+  # Update weights using gradient descent. For this step we just want to mutate
+  # the values of w1 and w2 in-place; we don't want to build up a computational
+  # graph for the update steps, so we use the torch.no_grad() context manager
+  # to prevent PyTorch from building a computational graph for the updates
+  with torch.no_grad():
+    w1 -= learning_rate * w1.grad
+    w2 -= learning_rate * w2.grad

-  # Manually zero the gradients after running the backward pass
-  w1.grad.data.zero_()
-  w2.grad.data.zero_()
+    # Manually zero the gradients after running the backward pass
+    w1.grad.zero_()
+    w2.grad.zero_()
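
The new comments state that, because w1 and w2 have requires_grad=True, autograd replaces the backward pass that would otherwise be written by hand. The sketch below is a standalone sanity check of that claim, not part of the commit; the small shapes, variable names, and tolerance are arbitrary choices for illustration. It runs one forward/backward pass of the same two-layer ReLU network and compares w1.grad and w2.grad against gradients derived with explicit formulas (the "backward pass by hand" the comments refer to).

```python
import torch

# Standalone sanity check (illustrative; not part of the commit above):
# compare autograd's gradients for the two-layer ReLU network with
# gradients computed by hand from the same loss.
N, D_in, H, D_out = 4, 6, 5, 3  # small sizes just to keep the check cheap

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)

# Forward pass and loss, written the same way as in the training loop.
h = x.mm(w1)
h_relu = h.clamp(min=0)
y_pred = h_relu.mm(w2)
loss = (y_pred - y).pow(2).sum()

# Backward pass via autograd: fills w1.grad and w2.grad.
loss.backward()

# The same gradients derived by hand for loss = sum((y_pred - y)^2).
with torch.no_grad():
  grad_y_pred = 2.0 * (y_pred - y)
  grad_w2 = h_relu.t().mm(grad_y_pred)
  grad_h = grad_y_pred.mm(w2.t())
  grad_h[h < 0] = 0  # gradient of clamp(min=0) is zero where the input was negative
  grad_w1 = x.t().mm(grad_h)

print(torch.allclose(w1.grad, grad_w1, atol=1e-5))  # expected: True
print(torch.allclose(w2.grad, grad_w2, atol=1e-5))  # expected: True
```

If both checks print True, autograd is producing the same gradients as the manual formulas, up to float32 rounding.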