import torch
-from torch.autograd import Variable

"""
A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch
-Variables, and uses PyTorch autograd to compute gradients.
+Tensors, and uses PyTorch autograd to compute gradients.

-A PyTorch Variable is a wrapper around a PyTorch Tensor, and represents a node
-in a computational graph. If x is a Variable then x.data is a Tensor giving its
-value, and x.grad is another Variable holding the gradient of x with respect to
-some scalar value.
-
-PyTorch Variables have the same API as PyTorch tensors: (almost) any operation
-you can do on a Tensor you can also do on a Variable; the difference is that
-autograd allows you to automatically compute gradients.
+When we create a PyTorch Tensor with requires_grad=True, then operations
+involving that Tensor will not just compute values; they will also build up
+a computational graph in the background, allowing us to easily backpropagate
+through the graph to compute gradients of some Tensors with respect to a
+downstream loss. Concretely if x is a Tensor with x.requires_grad == True then
+after backpropagation x.grad will be another Tensor holding the gradient of x
+with respect to some scalar value.
"""

-dtype = torch.FloatTensor
-# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU
+device = torch.device('cpu')
+# device = torch.device('cuda') # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

-# Create random Tensors to hold input and outputs, and wrap them in Variables.
-# Setting requires_grad=False indicates that we do not need to compute gradients
-# with respect to these Variables during the backward pass.
-x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
-y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)
+# Create random Tensors to hold input and outputs
+x = torch.randn(N, D_in, device=device)
+y = torch.randn(N, D_out, device=device)

-# Create random Tensors for weights, and wrap them in Variables.
-# Setting requires_grad=True indicates that we want to compute gradients with
-# respect to these Variables during the backward pass.
-w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
-w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)
+# Create random Tensors for weights; setting requires_grad=True means that we
+# want to compute gradients for these Tensors during the backward pass.
+w1 = torch.randn(D_in, H, device=device, requires_grad=True)
+w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
-  # Forward pass: compute predicted y using operations on Variables; these
-  # are exactly the same operations we used to compute the forward pass using
-  # Tensors, but we do not need to keep references to intermediate values since
-  # we are not implementing the backward pass by hand.
+  # Forward pass: compute predicted y using operations on Tensors. Since w1 and
+  # w2 have requires_grad=True, operations involving these Tensors will cause
+  # PyTorch to build a computational graph, allowing automatic computation of
+  # gradients. Since we are no longer implementing the backward pass by hand we
+  # don't need to keep references to intermediate values.
  y_pred = x.mm(w1).clamp(min=0).mm(w2)

-  # Compute and print loss using operations on Variables.
-  # Now loss is a Variable of shape (1,) and loss.data is a Tensor of shape
-  # (1,); loss.data[0] is a scalar value holding the loss.
+  # Compute and print loss. Loss is a Tensor of shape (), and loss.item()
+  # is a Python number giving its value.
  loss = (y_pred - y).pow(2).sum()
-  print(t, loss.data[0])
+  print(t, loss.item())

  # Use autograd to compute the backward pass. This call will compute the
-  # gradient of loss with respect to all Variables with requires_grad=True.
-  # After this call w1.grad and w2.grad will be Variables holding the gradient
+  # gradient of loss with respect to all Tensors with requires_grad=True.
+  # After this call w1.grad and w2.grad will be Tensors holding the gradient
  # of the loss with respect to w1 and w2 respectively.
  loss.backward()

-  # Update weights using gradient descent; w1.data and w2.data are Tensors,
-  # w1.grad and w2.grad are Variables and w1.grad.data and w2.grad.data are
-  # Tensors.
-  w1.data -= learning_rate * w1.grad.data
-  w2.data -= learning_rate * w2.grad.data
+  # Update weights using gradient descent. For this step we just want to mutate
+  # the values of w1 and w2 in-place; we don't want to build up a computational
+  # graph for the update steps, so we use the torch.no_grad() context manager
+  # to prevent PyTorch from building a computational graph for the updates
+  with torch.no_grad():
+    w1 -= learning_rate * w1.grad
+    w2 -= learning_rate * w2.grad

-  # Manually zero the gradients after running the backward pass
-  w1.grad.data.zero_()
-  w2.grad.data.zero_()
+    # Manually zero the gradients after running the backward pass
+    w1.grad.zero_()
+    w2.grad.zero_()
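
The new comments state that, because w1 and w2 have requires_grad=True, autograd replaces the backward pass that would otherwise be written by hand. The sketch below is a standalone sanity check of that claim, not part of the commit; the small shapes, variable names, and tolerance are arbitrary choices for illustration. It runs one forward/backward pass of the same two-layer ReLU network and compares w1.grad and w2.grad against gradients derived with explicit formulas (the "backward pass by hand" the comments refer to).

```python
import torch

# Standalone sanity check (illustrative; not part of the commit above):
# compare autograd's gradients for the two-layer ReLU network with
# gradients computed by hand from the same loss.
N, D_in, H, D_out = 4, 6, 5, 3  # small sizes just to keep the check cheap

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)

# Forward pass and loss, written the same way as in the training loop.
h = x.mm(w1)
h_relu = h.clamp(min=0)
y_pred = h_relu.mm(w2)
loss = (y_pred - y).pow(2).sum()

# Backward pass via autograd: fills w1.grad and w2.grad.
loss.backward()

# The same gradients derived by hand for loss = sum((y_pred - y)^2).
with torch.no_grad():
  grad_y_pred = 2.0 * (y_pred - y)
  grad_w2 = h_relu.t().mm(grad_y_pred)
  grad_h = grad_y_pred.mm(w2.t())
  grad_h[h < 0] = 0  # gradient of clamp(min=0) is zero where the input was negative
  grad_w1 = x.t().mm(grad_h)

print(torch.allclose(w1.grad, grad_w1, atol=1e-5))  # expected: True
print(torch.allclose(w2.grad, grad_w2, atol=1e-5))  # expected: True
```

If both checks print True, autograd is producing the same gradients as the manual formulas, up to float32 rounding.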