diff --git a/beginner_source/examples_autograd/polynomial_autograd.py b/beginner_source/examples_autograd/polynomial_autograd.py
new file mode 100755
index 0000000000..65ab5892d9
--- /dev/null
+++ b/beginner_source/examples_autograd/polynomial_autograd.py
@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+"""
+PyTorch: Tensors and autograd
+-------------------------------
+
+A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi`
+to :math:`\pi` by minimizing squared Euclidean distance.
+
+This implementation computes the forward pass using operations on PyTorch
+Tensors, and uses PyTorch autograd to compute gradients.
+
+
+A PyTorch Tensor represents a node in a computational graph. If ``x`` is a
+Tensor that has ``x.requires_grad=True`` then ``x.grad`` is another Tensor
+holding the gradient of ``x`` with respect to some scalar value.
+"""
+import torch
+import math
+
+dtype = torch.float
+device = torch.device("cpu")
+# device = torch.device("cuda:0")  # Uncomment this to run on GPU
+
+# Create Tensors to hold input and outputs.
+# By default, requires_grad=False, which indicates that we do not need to
+# compute gradients with respect to these Tensors during the backward pass.
+x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
+y = torch.sin(x)
+
+# Create random Tensors for weights. For a third order polynomial, we need
+# 4 weights: y = a + b x + c x^2 + d x^3
+# Setting requires_grad=True indicates that we want to compute gradients with
+# respect to these Tensors during the backward pass.
+a = torch.randn((), device=device, dtype=dtype, requires_grad=True)
+b = torch.randn((), device=device, dtype=dtype, requires_grad=True)
+c = torch.randn((), device=device, dtype=dtype, requires_grad=True)
+d = torch.randn((), device=device, dtype=dtype, requires_grad=True)
+
+learning_rate = 1e-6
+for t in range(2000):
+    # Forward pass: compute predicted y using operations on Tensors.
+    y_pred = a + b * x + c * x ** 2 + d * x ** 3
+
+    # Compute and print loss using operations on Tensors.
+    # Now loss is a scalar Tensor of shape ().
+    # loss.item() gets the scalar value held in the loss.
+    loss = (y_pred - y).pow(2).sum()
+    if t % 100 == 99:
+        print(t, loss.item())
+
+    # Use autograd to compute the backward pass. This call will compute the
+    # gradient of loss with respect to all Tensors with requires_grad=True.
+    # After this call a.grad, b.grad, c.grad and d.grad will be Tensors holding
+    # the gradient of the loss with respect to a, b, c, d respectively.
+    loss.backward()
+
+    # Manually update weights using gradient descent. Wrap in torch.no_grad()
+    # because weights have requires_grad=True, but we don't need to track this
+    # in autograd.
+ with torch.no_grad(): + a -= learning_rate * a.grad + b -= learning_rate * b.grad + c -= learning_rate * c.grad + d -= learning_rate * d.grad + + # Manually zero the gradients after updating weights + a.grad = None + b.grad = None + c.grad = None + d.grad = None + +print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3') diff --git a/beginner_source/examples_autograd/polynomial_custom_function.py b/beginner_source/examples_autograd/polynomial_custom_function.py new file mode 100755 index 0000000000..33fc1a2468 --- /dev/null +++ b/beginner_source/examples_autograd/polynomial_custom_function.py @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- +""" +PyTorch: Defining New autograd Functions +---------------------------------------- + +A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi` +to :math:`pi` by minimizing squared Euclidean distance. Instead of writing the +polynomial as :math:`y=a+bx+cx^2+dx^3`, we write the polynomial as +:math:`y=a+b P_3(c+dx)` where :math:`P_3(x)=\frac{1}{2}\left(5x^3-3x\right)` is +the `Legendre polynomial`_ of degree three. + +.. _Legendre polynomial: + https://en.wikipedia.org/wiki/Legendre_polynomials + +This implementation computes the forward pass using operations on PyTorch +Tensors, and uses PyTorch autograd to compute gradients. + +In this implementation we implement our own custom autograd function to perform +:math:`P_3'(x)`. By mathematics, :math:`P_3'(x)=\frac{3}{2}\left(5x^2-1\right)` +""" +import torch +import math + + +class LegendrePolynomial3(torch.autograd.Function): + """ + We can implement our own custom autograd Functions by subclassing + torch.autograd.Function and implementing the forward and backward passes + which operate on Tensors. + """ + + @staticmethod + def forward(ctx, input): + """ + In the forward pass we receive a Tensor containing the input and return + a Tensor containing the output. ctx is a context object that can be used + to stash information for backward computation. You can cache arbitrary + objects for use in the backward pass using the ctx.save_for_backward method. + """ + ctx.save_for_backward(input) + return 0.5 * (5 * input ** 3 - 3 * input) + + @staticmethod + def backward(ctx, grad_output): + """ + In the backward pass we receive a Tensor containing the gradient of the loss + with respect to the output, and we need to compute the gradient of the loss + with respect to the input. + """ + input, = ctx.saved_tensors + return grad_output * 1.5 * (5 * input ** 2 - 1) + + +dtype = torch.float +device = torch.device("cpu") +# device = torch.device("cuda:0") # Uncomment this to run on GPU + +# Create Tensors to hold input and outputs. +# By default, requires_grad=False, which indicates that we do not need to +# compute gradients with respect to these Tensors during the backward pass. +x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype) +y = torch.sin(x) + +# Create random Tensors for weights. For this example, we need +# 4 weights: y = a + b * P3(c + d * x), these weights need to be initialized +# not too far from the correct result to ensure convergence. +# Setting requires_grad=True indicates that we want to compute gradients with +# respect to these Tensors during the backward pass. 
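A hand-written ``backward`` like the one in ``LegendrePolynomial3`` above is easy to get subtly wrong, so it is worth comparing it against finite differences. A minimal, self-contained sketch using ``torch.autograd.gradcheck`` (it repeats the class so it runs on its own, and assumes double-precision inputs with ``requires_grad=True``)::

    import torch

    class LegendrePolynomial3(torch.autograd.Function):
        @staticmethod
        def forward(ctx, input):
            ctx.save_for_backward(input)
            return 0.5 * (5 * input ** 3 - 3 * input)

        @staticmethod
        def backward(ctx, grad_output):
            input, = ctx.saved_tensors
            return grad_output * 1.5 * (5 * input ** 2 - 1)

    # gradcheck perturbs the input numerically and compares the result with the
    # analytical gradient returned by backward(); it prints True on success.
    x = torch.randn(20, dtype=torch.double, requires_grad=True)
    print(torch.autograd.gradcheck(LegendrePolynomial3.apply, (x,)))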
+a = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True) +b = torch.full((), -1.0, device=device, dtype=dtype, requires_grad=True) +c = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True) +d = torch.full((), 0.3, device=device, dtype=dtype, requires_grad=True) + +learning_rate = 5e-6 +for t in range(2000): + # To apply our Function, we use Function.apply method. We alias this as 'P3'. + P3 = LegendrePolynomial3.apply + + # Forward pass: compute predicted y using operations; we compute + # P3 using our custom autograd operation. + y_pred = a + b * P3(c + d * x) + + # Compute and print loss + loss = (y_pred - y).pow(2).sum() + if t % 100 == 99: + print(t, loss.item()) + + # Use autograd to compute the backward pass. + loss.backward() + + # Update weights using gradient descent + with torch.no_grad(): + a -= learning_rate * a.grad + b -= learning_rate * b.grad + c -= learning_rate * c.grad + d -= learning_rate * d.grad + + # Manually zero the gradients after updating weights + a.grad = None + b.grad = None + c.grad = None + d.grad = None + +print(f'Result: y = {a.item()} + {b.item()} * P3({c.item()} + {d.item()} x)') diff --git a/beginner_source/examples_autograd/tf_two_layer_net.py b/beginner_source/examples_autograd/tf_two_layer_net.py deleted file mode 100755 index 1caf36e89f..0000000000 --- a/beginner_source/examples_autograd/tf_two_layer_net.py +++ /dev/null @@ -1,79 +0,0 @@ -# -*- coding: utf-8 -*- -""" -TensorFlow: Static Graphs -------------------------- - -A fully-connected ReLU network with one hidden layer and no biases, trained to -predict y from x by minimizing squared Euclidean distance. - -This implementation uses basic TensorFlow operations to set up a computational -graph, then executes the graph many times to actually train the network. - -One of the main differences between TensorFlow and PyTorch is that TensorFlow -uses static computational graphs while PyTorch uses dynamic computational -graphs. - -In TensorFlow we first set up the computational graph, then execute the same -graph many times. -""" -import tensorflow as tf -import numpy as np - -# First we set up the computational graph: - -# N is batch size; D_in is input dimension; -# H is hidden dimension; D_out is output dimension. -N, D_in, H, D_out = 64, 1000, 100, 10 - -# Create placeholders for the input and target data; these will be filled -# with real data when we execute the graph. -x = tf.placeholder(tf.float32, shape=(None, D_in)) -y = tf.placeholder(tf.float32, shape=(None, D_out)) - -# Create Variables for the weights and initialize them with random data. -# A TensorFlow Variable persists its value across executions of the graph. -w1 = tf.Variable(tf.random_normal((D_in, H))) -w2 = tf.Variable(tf.random_normal((H, D_out))) - -# Forward pass: Compute the predicted y using operations on TensorFlow Tensors. -# Note that this code does not actually perform any numeric operations; it -# merely sets up the computational graph that we will later execute. -h = tf.matmul(x, w1) -h_relu = tf.maximum(h, tf.zeros(1)) -y_pred = tf.matmul(h_relu, w2) - -# Compute loss using operations on TensorFlow Tensors -loss = tf.reduce_sum((y - y_pred) ** 2.0) - -# Compute gradient of the loss with respect to w1 and w2. -grad_w1, grad_w2 = tf.gradients(loss, [w1, w2]) - -# Update the weights using gradient descent. To actually update the weights -# we need to evaluate new_w1 and new_w2 when executing the graph. 
Note that -# in TensorFlow the the act of updating the value of the weights is part of -# the computational graph; in PyTorch this happens outside the computational -# graph. -learning_rate = 1e-6 -new_w1 = w1.assign(w1 - learning_rate * grad_w1) -new_w2 = w2.assign(w2 - learning_rate * grad_w2) - -# Now we have built our computational graph, so we enter a TensorFlow session to -# actually execute the graph. -with tf.Session() as sess: - # Run the graph once to initialize the Variables w1 and w2. - sess.run(tf.global_variables_initializer()) - - # Create numpy arrays holding the actual data for the inputs x and targets - # y - x_value = np.random.randn(N, D_in) - y_value = np.random.randn(N, D_out) - for t in range(500): - # Execute the graph many times. Each time it executes we want to bind - # x_value to x and y_value to y, specified with the feed_dict argument. - # Each time we execute the graph we want to compute the values for loss, - # new_w1, and new_w2; the values of these Tensors are returned as numpy - # arrays. - loss_value, _, _ = sess.run([loss, new_w1, new_w2], - feed_dict={x: x_value, y: y_value}) - if t % 100 == 99: - print(t, loss_value) diff --git a/beginner_source/examples_autograd/two_layer_net_autograd.py b/beginner_source/examples_autograd/two_layer_net_autograd.py deleted file mode 100755 index ebbc98b2bb..0000000000 --- a/beginner_source/examples_autograd/two_layer_net_autograd.py +++ /dev/null @@ -1,81 +0,0 @@ -# -*- coding: utf-8 -*- -""" -PyTorch: Tensors and autograd -------------------------------- - -A fully-connected ReLU network with one hidden layer and no biases, trained to -predict y from x by minimizing squared Euclidean distance. - -This implementation computes the forward pass using operations on PyTorch -Tensors, and uses PyTorch autograd to compute gradients. - - -A PyTorch Tensor represents a node in a computational graph. If ``x`` is a -Tensor that has ``x.requires_grad=True`` then ``x.grad`` is another Tensor -holding the gradient of ``x`` with respect to some scalar value. -""" -import torch - -dtype = torch.float -device = torch.device("cpu") -# device = torch.device("cuda:0") # Uncomment this to run on GPU -# torch.backends.cuda.matmul.allow_tf32 = False # Uncomment this to run on GPU - -# The above line disables TensorFloat32. This a feature that allows -# networks to run at a much faster speed while sacrificing precision. -# Although TensorFloat32 works well on most real models, for our toy model -# in this tutorial, the sacrificed precision causes convergence issue. -# For more information, see: -# https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices - -# N is batch size; D_in is input dimension; -# H is hidden dimension; D_out is output dimension. -N, D_in, H, D_out = 64, 1000, 100, 10 - -# Create random Tensors to hold input and outputs. -# Setting requires_grad=False indicates that we do not need to compute gradients -# with respect to these Tensors during the backward pass. -x = torch.randn(N, D_in, device=device, dtype=dtype) -y = torch.randn(N, D_out, device=device, dtype=dtype) - -# Create random Tensors for weights. -# Setting requires_grad=True indicates that we want to compute gradients with -# respect to these Tensors during the backward pass. 
-w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True) -w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True) - -learning_rate = 1e-6 -for t in range(500): - # Forward pass: compute predicted y using operations on Tensors; these - # are exactly the same operations we used to compute the forward pass using - # Tensors, but we do not need to keep references to intermediate values since - # we are not implementing the backward pass by hand. - y_pred = x.mm(w1).clamp(min=0).mm(w2) - - # Compute and print loss using operations on Tensors. - # Now loss is a Tensor of shape (1,) - # loss.item() gets the scalar value held in the loss. - loss = (y_pred - y).pow(2).sum() - if t % 100 == 99: - print(t, loss.item()) - - # Use autograd to compute the backward pass. This call will compute the - # gradient of loss with respect to all Tensors with requires_grad=True. - # After this call w1.grad and w2.grad will be Tensors holding the gradient - # of the loss with respect to w1 and w2 respectively. - loss.backward() - - # Manually update weights using gradient descent. Wrap in torch.no_grad() - # because weights have requires_grad=True, but we don't need to track this - # in autograd. - # An alternative way is to operate on weight.data and weight.grad.data. - # Recall that tensor.data gives a tensor that shares the storage with - # tensor, but doesn't track history. - # You can also use torch.optim.SGD to achieve this. - with torch.no_grad(): - w1 -= learning_rate * w1.grad - w2 -= learning_rate * w2.grad - - # Manually zero the gradients after updating weights - w1.grad.zero_() - w2.grad.zero_() diff --git a/beginner_source/examples_autograd/two_layer_net_custom_function.py b/beginner_source/examples_autograd/two_layer_net_custom_function.py deleted file mode 100755 index 2d2a087566..0000000000 --- a/beginner_source/examples_autograd/two_layer_net_custom_function.py +++ /dev/null @@ -1,97 +0,0 @@ -# -*- coding: utf-8 -*- -""" -PyTorch: Defining New autograd Functions ----------------------------------------- - -A fully-connected ReLU network with one hidden layer and no biases, trained to -predict y from x by minimizing squared Euclidean distance. - -This implementation computes the forward pass using operations on PyTorch -Variables, and uses PyTorch autograd to compute gradients. - -In this implementation we implement our own custom autograd function to perform -the ReLU function. -""" -import torch - - -class MyReLU(torch.autograd.Function): - """ - We can implement our own custom autograd Functions by subclassing - torch.autograd.Function and implementing the forward and backward passes - which operate on Tensors. - """ - - @staticmethod - def forward(ctx, input): - """ - In the forward pass we receive a Tensor containing the input and return - a Tensor containing the output. ctx is a context object that can be used - to stash information for backward computation. You can cache arbitrary - objects for use in the backward pass using the ctx.save_for_backward method. - """ - ctx.save_for_backward(input) - return input.clamp(min=0) - - @staticmethod - def backward(ctx, grad_output): - """ - In the backward pass we receive a Tensor containing the gradient of the loss - with respect to the output, and we need to compute the gradient of the loss - with respect to the input. 
- """ - input, = ctx.saved_tensors - grad_input = grad_output.clone() - grad_input[input < 0] = 0 - return grad_input - - -dtype = torch.float -device = torch.device("cpu") -# device = torch.device("cuda:0") # Uncomment this to run on GPU -# torch.backends.cuda.matmul.allow_tf32 = False # Uncomment this to run on GPU - -# The above line disables TensorFloat32. This a feature that allows -# networks to run at a much faster speed while sacrificing precision. -# Although TensorFloat32 works well on most real models, for our toy model -# in this tutorial, the sacrificed precision causes convergence issue. -# For more information, see: -# https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices - -# N is batch size; D_in is input dimension; -# H is hidden dimension; D_out is output dimension. -N, D_in, H, D_out = 64, 1000, 100, 10 - -# Create random Tensors to hold input and outputs. -x = torch.randn(N, D_in, device=device, dtype=dtype) -y = torch.randn(N, D_out, device=device, dtype=dtype) - -# Create random Tensors for weights. -w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True) -w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True) - -learning_rate = 1e-6 -for t in range(500): - # To apply our Function, we use Function.apply method. We alias this as 'relu'. - relu = MyReLU.apply - - # Forward pass: compute predicted y using operations; we compute - # ReLU using our custom autograd operation. - y_pred = relu(x.mm(w1)).mm(w2) - - # Compute and print loss - loss = (y_pred - y).pow(2).sum() - if t % 100 == 99: - print(t, loss.item()) - - # Use autograd to compute the backward pass. - loss.backward() - - # Update weights using gradient descent - with torch.no_grad(): - w1 -= learning_rate * w1.grad - w2 -= learning_rate * w2.grad - - # Manually zero the gradients after updating weights - w1.grad.zero_() - w2.grad.zero_() diff --git a/beginner_source/examples_nn/dynamic_net.py b/beginner_source/examples_nn/dynamic_net.py index 0e56e39dfb..31fa40f3e5 100755 --- a/beginner_source/examples_nn/dynamic_net.py +++ b/beginner_source/examples_nn/dynamic_net.py @@ -4,71 +4,74 @@ -------------------------------------- To showcase the power of PyTorch dynamic graphs, we will implement a very strange -model: a fully-connected ReLU network that on each forward pass randomly chooses -a number between 1 and 4 and has that many hidden layers, reusing the same -weights multiple times to compute the innermost hidden layers. +model: a third-fifth order polynomial that on each forward pass +chooses a random number between 3 and 5 and uses that many orders, reusing +the same weights multiple times to compute the fourth and fifth order. """ import random import torch +import math class DynamicNet(torch.nn.Module): - def __init__(self, D_in, H, D_out): + def __init__(self): """ - In the constructor we construct three nn.Linear instances that we will use - in the forward pass. + In the constructor we instantiate five parameters and assign them as members. 
""" - super(DynamicNet, self).__init__() - self.input_linear = torch.nn.Linear(D_in, H) - self.middle_linear = torch.nn.Linear(H, H) - self.output_linear = torch.nn.Linear(H, D_out) + super().__init__() + self.a = torch.nn.Parameter(torch.randn(())) + self.b = torch.nn.Parameter(torch.randn(())) + self.c = torch.nn.Parameter(torch.randn(())) + self.d = torch.nn.Parameter(torch.randn(())) + self.e = torch.nn.Parameter(torch.randn(())) def forward(self, x): """ - For the forward pass of the model, we randomly choose either 0, 1, 2, or 3 - and reuse the middle_linear Module that many times to compute hidden layer - representations. + For the forward pass of the model, we randomly choose either 4, 5 + and reuse the e parameter to compute the contribution of these orders. Since each forward pass builds a dynamic computation graph, we can use normal Python control-flow operators like loops or conditional statements when defining the forward pass of the model. - Here we also see that it is perfectly safe to reuse the same Module many - times when defining a computational graph. This is a big improvement from Lua - Torch, where each Module could be used only once. + Here we also see that it is perfectly safe to reuse the same parameter many + times when defining a computational graph. """ - h_relu = self.input_linear(x).clamp(min=0) - for _ in range(random.randint(0, 3)): - h_relu = self.middle_linear(h_relu).clamp(min=0) - y_pred = self.output_linear(h_relu) - return y_pred + y = self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3 + for exp in range(4, random.randint(4, 6)): + y = y + self.e * x ** exp + return y + def string(self): + """ + Just like any class in Python, you can also define custom method on PyTorch modules + """ + return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3 + {self.e.item()} x^4 ? + {self.e.item()} x^5 ?' -# N is batch size; D_in is input dimension; -# H is hidden dimension; D_out is output dimension. -N, D_in, H, D_out = 64, 1000, 100, 10 -# Create random Tensors to hold inputs and outputs -x = torch.randn(N, D_in) -y = torch.randn(N, D_out) +# Create Tensors to hold input and outputs. +x = torch.linspace(-math.pi, math.pi, 2000) +y = torch.sin(x) # Construct our model by instantiating the class defined above -model = DynamicNet(D_in, H, D_out) +model = DynamicNet() # Construct our loss function and an Optimizer. Training this strange model with # vanilla stochastic gradient descent is tough, so we use momentum criterion = torch.nn.MSELoss(reduction='sum') -optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9) -for t in range(500): +optimizer = torch.optim.SGD(model.parameters(), lr=1e-8, momentum=0.9) +for t in range(30000): # Forward pass: Compute predicted y by passing x to the model y_pred = model(x) # Compute and print loss loss = criterion(y_pred, y) - if t % 100 == 99: + if t % 2000 == 1999: print(t, loss.item()) # Zero gradients, perform a backward pass, and update the weights. 
optimizer.zero_grad() loss.backward() optimizer.step() + +print(f'Result: {model.string()}') diff --git a/beginner_source/examples_nn/two_layer_net_module.py b/beginner_source/examples_nn/polynomial_module.py similarity index 50% rename from beginner_source/examples_nn/two_layer_net_module.py rename to beginner_source/examples_nn/polynomial_module.py index 29d27274d2..7b20a5523b 100755 --- a/beginner_source/examples_nn/two_layer_net_module.py +++ b/beginner_source/examples_nn/polynomial_module.py @@ -3,25 +3,28 @@ PyTorch: Custom nn Modules -------------------------- -A fully-connected ReLU network with one hidden layer, trained to predict y from x -by minimizing squared Euclidean distance. +A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi` +to :math:`pi` by minimizing squared Euclidean distance. This implementation defines the model as a custom Module subclass. Whenever you want a model more complex than a simple sequence of existing Modules you will need to define your model this way. """ import torch +import math -class TwoLayerNet(torch.nn.Module): - def __init__(self, D_in, H, D_out): +class Polynomial3(torch.nn.Module): + def __init__(self): """ - In the constructor we instantiate two nn.Linear modules and assign them as - member variables. + In the constructor we instantiate four parameters and assign them as + member parameters. """ - super(TwoLayerNet, self).__init__() - self.linear1 = torch.nn.Linear(D_in, H) - self.linear2 = torch.nn.Linear(H, D_out) + super().__init__() + self.a = torch.nn.Parameter(torch.randn(())) + self.b = torch.nn.Parameter(torch.randn(())) + self.c = torch.nn.Parameter(torch.randn(())) + self.d = torch.nn.Parameter(torch.randn(())) def forward(self, x): """ @@ -29,28 +32,28 @@ def forward(self, x): a Tensor of output data. We can use Modules defined in the constructor as well as arbitrary operators on Tensors. """ - h_relu = self.linear1(x).clamp(min=0) - y_pred = self.linear2(h_relu) - return y_pred + return self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3 + def string(self): + """ + Just like any class in Python, you can also define custom method on PyTorch modules + """ + return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3' -# N is batch size; D_in is input dimension; -# H is hidden dimension; D_out is output dimension. -N, D_in, H, D_out = 64, 1000, 100, 10 -# Create random Tensors to hold inputs and outputs -x = torch.randn(N, D_in) -y = torch.randn(N, D_out) +# Create Tensors to hold input and outputs. +x = torch.linspace(-math.pi, math.pi, 2000) +y = torch.sin(x) # Construct our model by instantiating the class defined above -model = TwoLayerNet(D_in, H, D_out) +model = Polynomial3() # Construct our loss function and an Optimizer. The call to model.parameters() -# in the SGD constructor will contain the learnable parameters of the two -# nn.Linear modules which are members of the model. +# in the SGD constructor will contain the learnable parameters of the nn.Linear +# module which is members of the model. 
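Because ``a`` through ``d`` are registered as ``nn.Parameter`` members, they are automatically exposed through ``model.parameters()``, which the optimizer constructor below consumes. A small sketch of how to inspect what a ``Polynomial3``-style module exposes (the class is repeated here so the snippet is self-contained)::

    import torch

    class Polynomial3(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # Wrapping tensors in nn.Parameter registers them with the module.
            self.a = torch.nn.Parameter(torch.randn(()))
            self.b = torch.nn.Parameter(torch.randn(()))
            self.c = torch.nn.Parameter(torch.randn(()))
            self.d = torch.nn.Parameter(torch.randn(()))

        def forward(self, x):
            return self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3

    model = Polynomial3()
    for name, p in model.named_parameters():
        print(name, p.shape, p.requires_grad)   # e.g. "a torch.Size([]) True"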
criterion = torch.nn.MSELoss(reduction='sum') -optimizer = torch.optim.SGD(model.parameters(), lr=1e-4) -for t in range(500): +optimizer = torch.optim.SGD(model.parameters(), lr=1e-6) +for t in range(2000): # Forward pass: Compute predicted y by passing x to the model y_pred = model(x) @@ -63,3 +66,5 @@ def forward(self, x): optimizer.zero_grad() loss.backward() optimizer.step() + +print(f'Result: {model.string()}') diff --git a/beginner_source/examples_nn/two_layer_net_nn.py b/beginner_source/examples_nn/polynomial_nn.py similarity index 61% rename from beginner_source/examples_nn/two_layer_net_nn.py rename to beginner_source/examples_nn/polynomial_nn.py index 0c1925878e..9d5aca0534 100755 --- a/beginner_source/examples_nn/two_layer_net_nn.py +++ b/beginner_source/examples_nn/polynomial_nn.py @@ -3,8 +3,8 @@ PyTorch: nn ----------- -A fully-connected ReLU network with one hidden layer, trained to predict y from x -by minimizing squared Euclidean distance. +A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi` +to :math:`pi` by minimizing squared Euclidean distance. This implementation uses the nn package from PyTorch to build the network. PyTorch autograd makes it easy to define computational graphs and take gradients, @@ -14,36 +14,46 @@ input and may have some trainable weights. """ import torch +import math -# N is batch size; D_in is input dimension; -# H is hidden dimension; D_out is output dimension. -N, D_in, H, D_out = 64, 1000, 100, 10 -# Create random Tensors to hold inputs and outputs -x = torch.randn(N, D_in) -y = torch.randn(N, D_out) +# Create Tensors to hold input and outputs. +x = torch.linspace(-math.pi, math.pi, 2000) +y = torch.sin(x) + +# For this example, the output y is a linear function of (x, x^2, x^3), so +# we can consider it as a linear layer neural network. Let's prepare the +# tensor (x, x^2, x^3). +p = torch.tensor([1, 2, 3]) +xx = x.unsqueeze(-1).pow(p) + +# In the above code, x.unsqueeze(-1) has shape (2000, 1), and p has shape +# (3,), for this case, broadcasting semantics will apply to obtain a tensor +# of shape (2000, 3) # Use the nn package to define our model as a sequence of layers. nn.Sequential # is a Module which contains other Modules, and applies them in sequence to -# produce its output. Each Linear Module computes output from input using a +# produce its output. The Linear Module computes output from input using a # linear function, and holds internal Tensors for its weight and bias. +# The Flatten layer flatens the output of the linear layer to a 1D tensor, +# to match the shape of `y`. model = torch.nn.Sequential( - torch.nn.Linear(D_in, H), - torch.nn.ReLU(), - torch.nn.Linear(H, D_out), + torch.nn.Linear(3, 1), + torch.nn.Flatten(0, 1) ) # The nn package also contains definitions of popular loss functions; in this # case we will use Mean Squared Error (MSE) as our loss function. loss_fn = torch.nn.MSELoss(reduction='sum') -learning_rate = 1e-4 -for t in range(500): +learning_rate = 1e-6 +for t in range(2000): + # Forward pass: compute predicted y by passing x to the model. Module objects # override the __call__ operator so you can call them like functions. When # doing so you pass a Tensor of input data to the Module and it produces # a Tensor of output data. - y_pred = model(x) + y_pred = model(xx) # Compute and print loss. 
We pass Tensors containing the predicted and true # values of y, and the loss function returns a Tensor containing the @@ -66,3 +76,9 @@ with torch.no_grad(): for param in model.parameters(): param -= learning_rate * param.grad + +# You can access the first layer of `model` like accessing the first item of a list +linear_layer = model[0] + +# For linear layer, its parameters are stored as `weight` and `bias`. +print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3') diff --git a/beginner_source/examples_nn/two_layer_net_optim.py b/beginner_source/examples_nn/polynomial_optim.py similarity index 62% rename from beginner_source/examples_nn/two_layer_net_optim.py rename to beginner_source/examples_nn/polynomial_optim.py index 82b67dcc1b..434fb6624b 100755 --- a/beginner_source/examples_nn/two_layer_net_optim.py +++ b/beginner_source/examples_nn/polynomial_optim.py @@ -3,8 +3,8 @@ PyTorch: optim -------------- -A fully-connected ReLU network with one hidden layer, trained to predict y from x -by minimizing squared Euclidean distance. +A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi` +to :math:`pi` by minimizing squared Euclidean distance. This implementation uses the nn package from PyTorch to build the network. @@ -14,32 +14,33 @@ used for deep learning, including SGD+momentum, RMSProp, Adam, etc. """ import torch +import math -# N is batch size; D_in is input dimension; -# H is hidden dimension; D_out is output dimension. -N, D_in, H, D_out = 64, 1000, 100, 10 -# Create random Tensors to hold inputs and outputs -x = torch.randn(N, D_in) -y = torch.randn(N, D_out) +# Create Tensors to hold input and outputs. +x = torch.linspace(-math.pi, math.pi, 2000) +y = torch.sin(x) + +# Prepare the input tensor (x, x^2, x^3). +p = torch.tensor([1, 2, 3]) +xx = x.unsqueeze(-1).pow(p) # Use the nn package to define our model and loss function. model = torch.nn.Sequential( - torch.nn.Linear(D_in, H), - torch.nn.ReLU(), - torch.nn.Linear(H, D_out), + torch.nn.Linear(3, 1), + torch.nn.Flatten(0, 1) ) loss_fn = torch.nn.MSELoss(reduction='sum') # Use the optim package to define an Optimizer that will update the weights of -# the model for us. Here we will use Adam; the optim package contains many other -# optimization algorithms. The first argument to the Adam constructor tells the +# the model for us. Here we will use RMSprop; the optim package contains many other +# optimization algorithms. The first argument to the RMSprop constructor tells the # optimizer which Tensors it should update. -learning_rate = 1e-4 -optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) -for t in range(500): +learning_rate = 1e-3 +optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate) +for t in range(2000): # Forward pass: compute predicted y by passing x to the model. - y_pred = model(x) + y_pred = model(xx) # Compute and print loss. 
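Both ``nn`` examples feed the model the tensor ``xx = x.unsqueeze(-1).pow(p)``. A tiny sketch of the broadcasting involved, using a short ``x`` so the shapes are easy to read::

    import torch
    import math

    x = torch.linspace(-math.pi, math.pi, 5)
    p = torch.tensor([1, 2, 3])

    # (5, 1) broadcast against (3,) gives (5, 3): the columns are x, x^2, x^3.
    xx = x.unsqueeze(-1).pow(p)
    print(xx.shape)                            # torch.Size([5, 3])
    print(torch.allclose(xx[:, 2], x ** 3))    # True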
loss = loss_fn(y_pred, y) @@ -60,3 +61,7 @@ # Calling the step function on an Optimizer makes an update to its # parameters optimizer.step() + + +linear_layer = model[0] +print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3') diff --git a/beginner_source/examples_tensor/polynomial_numpy.py b/beginner_source/examples_tensor/polynomial_numpy.py new file mode 100755 index 0000000000..a1a378e50e --- /dev/null +++ b/beginner_source/examples_tensor/polynomial_numpy.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +""" +Warm-up: numpy +-------------- + +A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi` +to :math:`pi` by minimizing squared Euclidean distance. + +This implementation uses numpy to manually compute the forward pass, loss, and +backward pass. + +A numpy array is a generic n-dimensional array; it does not know anything about +deep learning or gradients or computational graphs, and is just a way to perform +generic numeric computations. +""" +import numpy as np +import math + +# Create random input and output data +x = np.linspace(-math.pi, math.pi, 2000) +y = np.sin(x) + +# Randomly initialize weights +a = np.random.randn() +b = np.random.randn() +c = np.random.randn() +d = np.random.randn() + +learning_rate = 1e-6 +for t in range(2000): + # Forward pass: compute predicted y + # y = a + b x + c x^2 + d x^3 + y_pred = a + b * x + c * x ** 2 + d * x ** 3 + + # Compute and print loss + loss = np.square(y_pred - y).sum() + if t % 100 == 99: + print(t, loss) + + # Backprop to compute gradients of a, b, c, d with respect to loss + grad_y_pred = 2.0 * (y_pred - y) + grad_a = grad_y_pred.sum() + grad_b = (grad_y_pred * x).sum() + grad_c = (grad_y_pred * x ** 2).sum() + grad_d = (grad_y_pred * x ** 3).sum() + + # Update weights + a -= learning_rate * grad_a + b -= learning_rate * grad_b + c -= learning_rate * grad_c + d -= learning_rate * grad_d + +print(f'Result: y = {a} + {b} x + {c} x^2 + {d} x^3') diff --git a/beginner_source/examples_tensor/two_layer_net_tensor.py b/beginner_source/examples_tensor/polynomial_tensor.py similarity index 53% rename from beginner_source/examples_tensor/two_layer_net_tensor.py rename to beginner_source/examples_tensor/polynomial_tensor.py index 3eacae4270..1e35b0f24b 100755 --- a/beginner_source/examples_tensor/two_layer_net_tensor.py +++ b/beginner_source/examples_tensor/polynomial_tensor.py @@ -3,8 +3,8 @@ PyTorch: Tensors ---------------- -A fully-connected ReLU network with one hidden layer and no biases, trained to -predict y from x by minimizing squared Euclidean distance. +A third order polynomial, trained to predict :math:`y=\sin(x)` from :math:`-\pi` +to :math:`pi` by minimizing squared Euclidean distance. This implementation uses PyTorch tensors to manually compute the forward pass, loss, and backward pass. @@ -19,44 +19,45 @@ """ import torch +import math dtype = torch.float device = torch.device("cpu") # device = torch.device("cuda:0") # Uncomment this to run on GPU -# N is batch size; D_in is input dimension; -# H is hidden dimension; D_out is output dimension. 
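In the numpy warm-up the gradients are derived by hand from the chain rule, for example ``grad_a = (2.0 * (y_pred - y)).sum()``. A quick way to convince yourself they are correct is a central finite-difference check; a minimal sketch::

    import numpy as np
    import math

    x = np.linspace(-math.pi, math.pi, 50)
    y = np.sin(x)
    a, b, c, d = np.random.randn(4)

    def loss(a, b, c, d):
        return np.square(a + b * x + c * x ** 2 + d * x ** 3 - y).sum()

    # Analytic gradient with respect to a, as used in the training loop
    grad_a = (2.0 * (a + b * x + c * x ** 2 + d * x ** 3 - y)).sum()

    # Central finite-difference approximation of the same derivative
    eps = 1e-6
    numeric = (loss(a + eps, b, c, d) - loss(a - eps, b, c, d)) / (2 * eps)
    print(grad_a, numeric)   # the two numbers should agree closely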
-N, D_in, H, D_out = 64, 1000, 100, 10 - # Create random input and output data -x = torch.randn(N, D_in, device=device, dtype=dtype) -y = torch.randn(N, D_out, device=device, dtype=dtype) +x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype) +y = torch.sin(x) # Randomly initialize weights -w1 = torch.randn(D_in, H, device=device, dtype=dtype) -w2 = torch.randn(H, D_out, device=device, dtype=dtype) +a = torch.randn((), device=device, dtype=dtype) +b = torch.randn((), device=device, dtype=dtype) +c = torch.randn((), device=device, dtype=dtype) +d = torch.randn((), device=device, dtype=dtype) learning_rate = 1e-6 -for t in range(500): +for t in range(2000): # Forward pass: compute predicted y - h = x.mm(w1) - h_relu = h.clamp(min=0) - y_pred = h_relu.mm(w2) + y_pred = a + b * x + c * x ** 2 + d * x ** 3 # Compute and print loss loss = (y_pred - y).pow(2).sum().item() if t % 100 == 99: print(t, loss) - # Backprop to compute gradients of w1 and w2 with respect to loss + # Backprop to compute gradients of a, b, c, d with respect to loss grad_y_pred = 2.0 * (y_pred - y) - grad_w2 = h_relu.t().mm(grad_y_pred) - grad_h_relu = grad_y_pred.mm(w2.t()) - grad_h = grad_h_relu.clone() - grad_h[h < 0] = 0 - grad_w1 = x.t().mm(grad_h) + grad_a = grad_y_pred.sum() + grad_b = (grad_y_pred * x).sum() + grad_c = (grad_y_pred * x ** 2).sum() + grad_d = (grad_y_pred * x ** 3).sum() # Update weights using gradient descent - w1 -= learning_rate * grad_w1 - w2 -= learning_rate * grad_w2 + a -= learning_rate * grad_a + b -= learning_rate * grad_b + c -= learning_rate * grad_c + d -= learning_rate * grad_d + + +print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3') diff --git a/beginner_source/examples_tensor/two_layer_net_numpy.py b/beginner_source/examples_tensor/two_layer_net_numpy.py deleted file mode 100755 index f003d0f002..0000000000 --- a/beginner_source/examples_tensor/two_layer_net_numpy.py +++ /dev/null @@ -1,51 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Warm-up: numpy --------------- - -A fully-connected ReLU network with one hidden layer and no biases, trained to -predict y from x using Euclidean error. - -This implementation uses numpy to manually compute the forward pass, loss, and -backward pass. - -A numpy array is a generic n-dimensional array; it does not know anything about -deep learning or gradients or computational graphs, and is just a way to perform -generic numeric computations. -""" -import numpy as np - -# N is batch size; D_in is input dimension; -# H is hidden dimension; D_out is output dimension. 
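The tensor version computes the same gradients by hand; autograd can serve as a reference if you want to double-check them. A small sketch (restricted to the first two coefficients for brevity) comparing the manual expressions against ``loss.backward()``::

    import torch
    import math

    x = torch.linspace(-math.pi, math.pi, 100)
    y = torch.sin(x)
    a = torch.randn((), requires_grad=True)
    b = torch.randn((), requires_grad=True)

    y_pred = a + b * x
    loss = (y_pred - y).pow(2).sum()
    loss.backward()

    # Manual gradients, written the same way as in the example above
    with torch.no_grad():
        grad_y_pred = 2.0 * (y_pred - y)
        print(torch.allclose(a.grad, grad_y_pred.sum()))        # True
        print(torch.allclose(b.grad, (grad_y_pred * x).sum()))  # True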
-N, D_in, H, D_out = 64, 1000, 100, 10 - -# Create random input and output data -x = np.random.randn(N, D_in) -y = np.random.randn(N, D_out) - -# Randomly initialize weights -w1 = np.random.randn(D_in, H) -w2 = np.random.randn(H, D_out) - -learning_rate = 1e-6 -for t in range(500): - # Forward pass: compute predicted y - h = x.dot(w1) - h_relu = np.maximum(h, 0) - y_pred = h_relu.dot(w2) - - # Compute and print loss - loss = np.square(y_pred - y).sum() - print(t, loss) - - # Backprop to compute gradients of w1 and w2 with respect to loss - grad_y_pred = 2.0 * (y_pred - y) - grad_w2 = h_relu.T.dot(grad_y_pred) - grad_h_relu = grad_y_pred.dot(w2.T) - grad_h = grad_h_relu.copy() - grad_h[h < 0] = 0 - grad_w1 = x.T.dot(grad_h) - - # Update weights - w1 -= learning_rate * grad_w1 - w2 -= learning_rate * grad_w2 diff --git a/beginner_source/pytorch_with_examples.rst b/beginner_source/pytorch_with_examples.rst index a9f56268b2..c0a2b665a5 100644 --- a/beginner_source/pytorch_with_examples.rst +++ b/beginner_source/pytorch_with_examples.rst @@ -11,8 +11,8 @@ At its core, PyTorch provides two main features: - An n-dimensional Tensor, similar to numpy but can run on GPUs - Automatic differentiation for building and training neural networks -We will use a fully-connected ReLU network as our running example. The -network will have a single hidden layer, and will be trained with +We will use a problem of fitting :math:`y=\sin(x)` with a third order polynomial +as our running example. The network will have four parameters, and will be trained with gradient descent to fit random data by minimizing the Euclidean distance between the network output and the true output. @@ -36,10 +36,10 @@ Numpy provides an n-dimensional array object, and many functions for manipulating these arrays. Numpy is a generic framework for scientific computing; it does not know anything about computation graphs, or deep learning, or gradients. However we can easily use numpy to fit a -two-layer network to random data by manually implementing the forward +third order polynomial to sine function by manually implementing the forward and backward passes through the network using numpy operations: -.. includenodoc:: /beginner/examples_tensor/two_layer_net_numpy.py +.. includenodoc:: /beginner/examples_tensor/polynomial_numpy.py PyTorch: Tensors @@ -60,13 +60,13 @@ generic tool for scientific computing. Also unlike numpy, PyTorch Tensors can utilize GPUs to accelerate their numeric computations. To run a PyTorch Tensor on GPU, you simply -need to cast it to a new datatype. +need to specify the correct device. -Here we use PyTorch Tensors to fit a two-layer network to random data. +Here we use PyTorch Tensors to fit a third order polynomial to sine function. Like the numpy example above we need to manually implement the forward and backward passes through the network: -.. includenodoc:: /beginner/examples_tensor/two_layer_net_tensor.py +.. includenodoc:: /beginner/examples_tensor/polynomial_tensor.py Autograd @@ -95,11 +95,11 @@ represents a node in a computational graph. If ``x`` is a Tensor that has ``x.requires_grad=True`` then ``x.grad`` is another Tensor holding the gradient of ``x`` with respect to some scalar value. 
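The smallest possible illustration of that statement (a toy sketch, not one of the tutorial scripts)::

    import torch

    x = torch.tensor(2.0, requires_grad=True)
    y = 3 * x ** 2       # autograd records how y was computed from x
    y.backward()         # fills in x.grad with dy/dx
    print(x.grad)        # tensor(12.) since dy/dx = 6x = 12 at x = 2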
-Here we use PyTorch Tensors and autograd to implement our two-layer -network; now we no longer need to manually implement the backward pass -through the network: +Here we use PyTorch Tensors and autograd to implement our fitting sine wave +with third order polynomial example; now we no longer need to manually +implement the backward pass through the network: -.. includenodoc:: /beginner/examples_autograd/two_layer_net_autograd.py +.. includenodoc:: /beginner/examples_autograd/polynomial_autograd.py PyTorch: Defining new autograd functions ---------------------------------------- @@ -117,11 +117,16 @@ and ``backward`` functions. We can then use our new autograd operator by constructing an instance and calling it like a function, passing Tensors containing input data. -In this example we define our own custom autograd function for -performing the ReLU nonlinearity, and use it to implement our two-layer -network: +In this example we define our model as :math:`y=a+b P_3(c+dx)` instead of +:math:`y=a+bx+cx^2+dx^3`, where :math:`P_3(x)=\frac{1}{2}\left(5x^3-3x\right)` +is the `Legendre polynomial`_ of degree three. We write our own custom autograd +function for computing forward and backward of :math:`P_3`, and use it to implement +our model: + +.. _Legendre polynomial: + https://en.wikipedia.org/wiki/Legendre_polynomials -.. includenodoc:: /beginner/examples_autograd/two_layer_net_custom_function.py +.. includenodoc:: /beginner/examples_autograd/polynomial_custom_function.py `nn` module =========== @@ -152,30 +157,29 @@ containing learnable parameters. The ``nn`` package also defines a set of useful loss functions that are commonly used when training neural networks. -In this example we use the ``nn`` package to implement our two-layer +In this example we use the ``nn`` package to implement our polynomial model network: -.. includenodoc:: /beginner/examples_nn/two_layer_net_nn.py +.. includenodoc:: /beginner/examples_nn/polynomial_nn.py PyTorch: optim -------------- Up to this point we have updated the weights of our models by manually -mutating the Tensors holding learnable parameters (with ``torch.no_grad()`` -or ``.data`` to avoid tracking history in autograd). This is not a huge -burden for simple optimization algorithms like stochastic gradient descent, -but in practice we often train neural networks using more sophisticated -optimizers like AdaGrad, RMSProp, Adam, etc. +mutating the Tensors holding learnable parameters with ``torch.no_grad()``. +This is not a huge burden for simple optimization algorithms like stochastic +gradient descent, but in practice we often train neural networks using more +sophisticated optimizers like AdaGrad, RMSProp, Adam, etc. The ``optim`` package in PyTorch abstracts the idea of an optimization algorithm and provides implementations of commonly used optimization algorithms. In this example we will use the ``nn`` package to define our model as -before, but we will optimize the model using the Adam algorithm provided +before, but we will optimize the model using the RMSprop algorithm provided by the ``optim`` package: -.. includenodoc:: /beginner/examples_nn/two_layer_net_optim.py +.. includenodoc:: /beginner/examples_nn/polynomial_optim.py PyTorch: Custom nn Modules -------------------------- @@ -186,23 +190,22 @@ Modules by subclassing ``nn.Module`` and defining a ``forward`` which receives input Tensors and produces output Tensors using other modules or other autograd operations on Tensors. 
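In other words, a ``forward`` can freely mix calls to submodules with plain tensor arithmetic. A tiny illustrative sketch (the class name is made up for this example)::

    import torch

    class ScaledLinear(torch.nn.Module):
        def __init__(self, in_features, out_features):
            super().__init__()
            self.linear = torch.nn.Linear(in_features, out_features)  # a submodule
            self.scale = torch.nn.Parameter(torch.ones(()))           # a bare parameter

        def forward(self, x):
            # A submodule call combined with ordinary tensor operations
            return self.scale * torch.tanh(self.linear(x))

    m = ScaledLinear(3, 1)
    print(m(torch.randn(5, 3)).shape)   # torch.Size([5, 1])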
-In this example we implement our two-layer network as a custom Module +In this example we implement our third order polynomial as a custom Module subclass: -.. includenodoc:: /beginner/examples_nn/two_layer_net_module.py +.. includenodoc:: /beginner/examples_nn/polynomial_module.py PyTorch: Control Flow + Weight Sharing -------------------------------------- As an example of dynamic graphs and weight sharing, we implement a very -strange model: a fully-connected ReLU network that on each forward pass -chooses a random number between 1 and 4 and uses that many hidden -layers, reusing the same weights multiple times to compute the innermost -hidden layers. +strange model: a third-fifth order polynomial that on each forward pass +chooses a random number between 3 and 5 and uses that many orders, reusing +the same weights multiple times to compute the fourth and fifth order. For this model we can use normal Python flow control to implement the loop, -and we can implement weight sharing among the innermost layers by simply -reusing the same Module multiple times when defining the forward pass. +and we can implement weight sharing by simply reusing the same parameter multiple +times when defining the forward pass. We can easily implement this model as a Module subclass: @@ -223,12 +226,12 @@ Tensors :maxdepth: 2 :hidden: - /beginner/examples_tensor/two_layer_net_numpy - /beginner/examples_tensor/two_layer_net_tensor + /beginner/examples_tensor/polynomial_numpy + /beginner/examples_tensor/polynomial_tensor -.. galleryitem:: /beginner/examples_tensor/two_layer_net_numpy.py +.. galleryitem:: /beginner/examples_tensor/polynomial_numpy.py -.. galleryitem:: /beginner/examples_tensor/two_layer_net_tensor.py +.. galleryitem:: /beginner/examples_tensor/polynomial_tensor.py .. raw:: html @@ -241,16 +244,13 @@ Autograd :maxdepth: 2 :hidden: - /beginner/examples_autograd/two_layer_net_autograd - /beginner/examples_autograd/two_layer_net_custom_function - /beginner/examples_autograd/tf_two_layer_net - + /beginner/examples_autograd/polynomial_autograd + /beginner/examples_autograd/polynomial_custom_function -.. galleryitem:: /beginner/examples_autograd/two_layer_net_autograd.py -.. galleryitem:: /beginner/examples_autograd/two_layer_net_custom_function.py +.. galleryitem:: /beginner/examples_autograd/polynomial_autograd.py -.. galleryitem:: /beginner/examples_autograd/tf_two_layer_net.py +.. galleryitem:: /beginner/examples_autograd/polynomial_custom_function.py .. raw:: html @@ -263,17 +263,17 @@ Autograd :maxdepth: 2 :hidden: - /beginner/examples_nn/two_layer_net_nn - /beginner/examples_nn/two_layer_net_optim - /beginner/examples_nn/two_layer_net_module + /beginner/examples_nn/polynomial_nn + /beginner/examples_nn/polynomial_optim + /beginner/examples_nn/polynomial_module /beginner/examples_nn/dynamic_net -.. galleryitem:: /beginner/examples_nn/two_layer_net_nn.py +.. galleryitem:: /beginner/examples_nn/polynomial_nn.py -.. galleryitem:: /beginner/examples_nn/two_layer_net_optim.py +.. galleryitem:: /beginner/examples_nn/polynomial_optim.py -.. galleryitem:: /beginner/examples_nn/two_layer_net_module.py +.. galleryitem:: /beginner/examples_nn/polynomial_module.py .. galleryitem:: /beginner/examples_nn/dynamic_net.py
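Finally, the weight sharing used by ``DynamicNet`` is not limited to bare parameters: the same submodule can be applied several times in one forward pass, and autograd simply accumulates gradients into the shared weights. A small sketch (hypothetical class name)::

    import torch

    class SharedBlock(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(4, 4)

        def forward(self, x):
            # The same Linear (and therefore the same weight matrix) is used twice.
            return self.layer(self.layer(x))

    m = SharedBlock()
    m(torch.randn(2, 4)).sum().backward()
    print(m.layer.weight.grad.shape)   # torch.Size([4, 4])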