diff --git a/tv-script-generation/dlnd_tv_script_generation.ipynb b/tv-script-generation/dlnd_tv_script_generation.ipynb index 2c92421cbd..491b840e22 100644 --- a/tv-script-generation/dlnd_tv_script_generation.ipynb +++ b/tv-script-generation/dlnd_tv_script_generation.ipynb @@ -13,9 +13,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", @@ -34,21 +32,19 @@ "metadata": {}, "source": [ "## Explore the Data\n", - "Play around with `view_sentence_range` to view different parts of the data." + "Play around with `view_sentence_range` to view different parts of the data. This will give you a sense of the data you'll be working with." ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "view_sentence_range = (0, 10)\n", "\n", "\"\"\"\n", - "DON'T MODIFY ANYTHING IN THIS CELL\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", "\"\"\"\n", "import numpy as np\n", "\n", @@ -89,12 +85,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", "import problem_unittests as tests\n", "\n", "def create_lookup_tables(text):\n", @@ -104,8 +97,8 @@ " :return: A tuple of dicts (vocab_to_int, int_to_vocab)\n", " \"\"\"\n", " # TODO: Implement Function\n", - " return None, None\n", "\n", + " return None, None\n", "\n", "\"\"\"\n", "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", @@ -118,19 +111,19 @@ "metadata": {}, "source": [ "### Tokenize Punctuation\n", - "We'll be splitting the script into a word array using spaces as delimiters. However, punctuations like periods and exclamation marks make it hard for the neural network to distinguish between the word \"bye\" and \"bye!\".\n", + "We'll be splitting the script into a word array using spaces as delimiters. However, punctuations like periods and exclamation marks can create multiple ids for the same word. For example, \"bye\" and \"bye!\" would generate two different work ids.\n", "\n", "Implement the function `token_lookup` to return a dict that will be used to tokenize symbols like \"!\" into \"||Exclamation_Mark||\". Create a dictionary for the following symbols where the symbol is the key and value is the token:\n", - "- Period ( . )\n", - "- Comma ( , )\n", - "- Quotation Mark ( \" )\n", - "- Semicolon ( ; )\n", - "- Exclamation mark ( ! )\n", - "- Question mark ( ? )\n", - "- Left Parentheses ( ( )\n", - "- Right Parentheses ( ) )\n", - "- Dash ( -- )\n", - "- Return ( \\n )\n", + "- Period ( **.** )\n", + "- Comma ( **,** )\n", + "- Quotation Mark ( **\"** )\n", + "- Semicolon ( **;** )\n", + "- Exclamation mark ( **!** )\n", + "- Question mark ( **?** )\n", + "- Left Parentheses ( **(** )\n", + "- Right Parentheses ( **)** )\n", + "- Dash ( **--** )\n", + "- Return ( **\\n** )\n", "\n", "This dictionary will be used to token the symbols and add the delimiter (space) around it. This separates the symbols as it's own word, making it easier for the neural network to predict on the next word. Make sure you don't use a token that could be confused as a word. Instead of using the token \"dash\", try using something like \"||dash||\"." 
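A minimal sketch of how the `create_lookup_tables` and `token_lookup` TODOs above could be filled in; the frequency-based id ordering and the exact token strings are illustrative choices, not the only valid ones:

```python
from collections import Counter

def create_lookup_tables(text):
    """Build the (vocab_to_int, int_to_vocab) pair from a list of words."""
    # Rank words by frequency so the most common words get the smallest ids;
    # any consistent numbering scheme works as long as the two dicts are inverses.
    word_counts = Counter(text)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {idx: word for idx, word in enumerate(sorted_vocab)}
    vocab_to_int = {word: idx for idx, word in int_to_vocab.items()}
    return vocab_to_int, int_to_vocab

def token_lookup():
    """Map each punctuation symbol to a space-free token that can't be mistaken for a word."""
    return {
        '.': '||period||',
        ',': '||comma||',
        '"': '||quotation_mark||',
        ';': '||semicolon||',
        '!': '||exclamation_mark||',
        '?': '||question_mark||',
        '(': '||left_parentheses||',
        ')': '||right_parentheses||',
        '--': '||dash||',
        '\n': '||return||',
    }
```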
] @@ -138,9 +131,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def token_lookup():\n", @@ -149,6 +140,7 @@ " :return: Tokenize dictionary where the key is the punctuation and the value is the token\n", " \"\"\"\n", " # TODO: Implement Function\n", + "\n", " return None\n", "\n", "\"\"\"\n", @@ -168,9 +160,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", @@ -191,16 +181,13 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "DON'T MODIFY ANYTHING IN THIS CELL\n", "\"\"\"\n", "import helper\n", - "import numpy as np\n", "import problem_unittests as tests\n", "\n", "int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()" @@ -211,23 +198,15 @@ "metadata": {}, "source": [ "## Build the Neural Network\n", - "You'll build the components necessary to build a RNN by implementing the following functions below:\n", - "- get_inputs\n", - "- get_init_cell\n", - "- get_embed\n", - "- build_rnn\n", - "- build_nn\n", - "- get_batches\n", + "In this section you'll build the components necessary to build an RNN by implementing the RNN Module and Forward and back propigation function.\n", "\n", - "### Check the Version of TensorFlow and Access to GPU" + "### Check the Version of PyTorch and Access to GPU" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", @@ -235,17 +214,17 @@ "\"\"\"\n", "from distutils.version import LooseVersion\n", "import warnings\n", - "import tensorflow as tf\n", + "import torch\n", "\n", - "# Check TensorFlow Version\n", - "assert LooseVersion(tf.__version__) >= LooseVersion('1.3'), 'Please use TensorFlow version 1.3 or newer'\n", - "print('TensorFlow Version: {}'.format(tf.__version__))\n", + "# Check PyTorch Version\n", + "minimum_torch_version = '0.3'\n", + "assert LooseVersion(torch.__version__) >= LooseVersion(minimum_torch_version), \\\n", + " 'Please use PyTorch version {} or newer'.format(minimum_torch_version)\n", + "print('PyTorch Version: {}'.format(torch.__version__))\n", "\n", "# Check for a GPU\n", - "if not tf.test.gpu_device_name():\n", - " warnings.warn('No GPU found. Please use a GPU to train your neural network.')\n", - "else:\n", - " print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))" + "if not torch.cuda.is_available():\n", + " warnings.warn('No GPU found. Please use a GPU to train your neural network.')" ] }, { @@ -253,419 +232,223 @@ "metadata": {}, "source": [ "### Input\n", - "Implement the `get_inputs()` function to create TF Placeholders for the Neural Network. It should create the following placeholders:\n", - "- Input text placeholder named \"input\" using the [TF Placeholder](https://www.tensorflow.org/api_docs/python/tf/placeholder) `name` parameter.\n", - "- Targets placeholder\n", - "- Learning Rate placeholder\n", + "Let's start with the preprocessed input data. We'll use [TensorDataset](http://pytorch.org/docs/master/data.html#torch.utils.data.TensorDataset) to provide an api to our dataset. 
Combination with [DataLoader](http://pytorch.org/docs/master/data.html#torch.utils.data.DataLoader), it will handle batching, shuffling, and other dataset iteration functions.\n", "\n", - "Return the placeholders in the following tuple `(Input, Targets, LearningRate)`" + "Implement the `batch_data` function to batch `words` data into chunks of size `batch_size` using the `TensorDataset` and `DataLoader` classes." ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "def get_inputs():\n", - " \"\"\"\n", - " Create TF Placeholders for input, targets, and learning rate.\n", - " :return: Tuple (input, targets, learning rate)\n", - " \"\"\"\n", - " # TODO: Implement Function\n", - " return None, None, None\n", + "from torch.utils.data import TensorDataset, DataLoader\n", "\n", "\n", - "\"\"\"\n", - "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", - "\"\"\"\n", - "tests.test_get_inputs(get_inputs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Build RNN Cell and Initialize\n", - "Stack one or more [`BasicLSTMCells`](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/BasicLSTMCell) in a [`MultiRNNCell`](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/MultiRNNCell).\n", - "- The Rnn size should be set using `rnn_size`\n", - "- Initalize Cell State using the MultiRNNCell's [`zero_state()`](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/MultiRNNCell#zero_state) function\n", - " - Apply the name \"initial_state\" to the initial state using [`tf.identity()`](https://www.tensorflow.org/api_docs/python/tf/identity)\n", - "\n", - "Return the cell and initial state in the following tuple `(Cell, InitialState)`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def get_init_cell(batch_size, rnn_size):\n", + "def batch_data(words, sequence_length, batch_size):\n", " \"\"\"\n", - " Create an RNN Cell and initialize it.\n", - " :param batch_size: Size of batches\n", - " :param rnn_size: Size of RNNs\n", - " :return: Tuple (cell, initialize state)\n", + " Batch the neural network data using DataLoader\n", + " :param words: The word ids of the TV scripts\n", + " :param sequence_length: The sequence length of each batch\n", + " :param batch_size: The size of each batch\n", + " :return: DataLoader with batched data\n", " \"\"\"\n", - " # TODO: Implement Function\n", - " return None, None\n", - "\n", + " # TODO: Implement function\n", + " \n", + " return None\n", "\n", "\"\"\"\n", "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", "\"\"\"\n", - "tests.test_get_init_cell(get_init_cell)" + "tests.test_batch_data(batch_data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Word Embedding\n", - "Apply embedding to `input_data` using TensorFlow. Return the embedded sequence." + "### Build the Neural Network\n", + "Implement an RNN using PyTorch's [Module class](http://pytorch.org/docs/master/nn.html#torch.nn.Module). To do this, you'll have to implement the following functions for the class:\n", + " - `__init__` - The initialize function. \n", + " - `forward` - Forward propigation function.\n", + " \n", + "The initialize function should create the layers of the neural network and save them to the class. The forward propigation function will use these layers to run forward propigation." 
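For the `batch_data` cell above, a minimal sketch assuming each feature is a window of `sequence_length` word ids and its label is the id that immediately follows the window (the framing the training loop later in this diff relies on):

```python
import torch
from torch.utils.data import TensorDataset, DataLoader

def batch_data(words, sequence_length, batch_size):
    """Return a DataLoader yielding (sequence, next word) batches built from `words`."""
    # Slide a window of `sequence_length` ids over the text; the label for each
    # window is the single id that comes right after it.
    features = [words[i:i + sequence_length] for i in range(len(words) - sequence_length)]
    targets = [words[i + sequence_length] for i in range(len(words) - sequence_length)]
    dataset = TensorDataset(torch.LongTensor(features), torch.LongTensor(targets))
    return DataLoader(dataset, batch_size=batch_size)
```

And one plausible shape for the Module described here; the embedding size, hidden size, layer count, and the choice of an LSTM are illustrative assumptions rather than required values:

```python
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, output_size, sequence_length,
                 embed_dim=256, hidden_dim=256, n_layers=2):
        super(RNN, self).__init__()
        self.sequence_length = sequence_length             # read later by generate()
        self.embed = nn.Embedding(input_size, embed_dim)   # word ids -> dense vectors
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)       # scores over the vocabulary

    def forward(self, nn_input):
        # nn_input: (batch, sequence_length) word ids
        embedded = self.embed(nn_input)                     # (batch, seq, embed_dim)
        lstm_out, _ = self.lstm(embedded)                   # (batch, seq, hidden_dim)
        last_step = lstm_out[:, self.sequence_length - 1]   # keep only the final time step
        return self.fc(last_step)                           # (batch, output_size)
```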
] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "def get_embed(input_data, vocab_size, embed_dim):\n", - " \"\"\"\n", - " Create embedding for .\n", - " :param input_data: TF placeholder for text input.\n", - " :param vocab_size: Number of words in vocabulary.\n", - " :param embed_dim: Number of embedding dimensions\n", - " :return: Embedded input.\n", - " \"\"\"\n", - " # TODO: Implement Function\n", - " return None\n", + "import torch.nn as nn\n", + "\n", + "class RNN(nn.Module):\n", + " def __init__(self, input_size, output_size, sequence_length):\n", + " \"\"\"\n", + " Initialize the PyTorch Module\n", + " :param input_size: The number of input dimensions of the neural network\n", + " :param output_size: The number of output dimensions of the neural network\n", + " :param sequence_length: The sequence length of each batch\n", + " \"\"\"\n", + " super(RNN, self).__init__()\n", + " # TODO: Implement function\n", + " \n", + " pass\n", + "\n", + " def forward(self, nn_input):\n", + " \"\"\"\n", + " Forwad propigation on the neural network\n", + " :param nn_input: The input to the neural network\n", + " :return: The output of the neural network\n", + " \"\"\"\n", + " # TODO: Implement function\n", + " \n", + " return None\n", "\n", "\n", "\"\"\"\n", "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", "\"\"\"\n", - "tests.test_get_embed(get_embed)" + "tests.test_rnn(RNN)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Build RNN\n", - "You created a RNN Cell in the `get_init_cell()` function. Time to use the cell to create a RNN.\n", - "- Build the RNN using the [`tf.nn.dynamic_rnn()`](https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn)\n", - " - Apply the name \"final_state\" to the final state using [`tf.identity()`](https://www.tensorflow.org/api_docs/python/tf/identity)\n", - "\n", - "Return the outputs and final_state state in the following tuple `(Outputs, FinalState)` " + "### Build back propigation\n", + "Use the RNN class you implemented to apply forward and back propigation. You'll use the `RNN` class in this function like any other PyTorch Module." 
] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "def build_rnn(cell, inputs):\n", + "def forward_back_prop(decoder, decoder_optimizer, criterion, inp, target):\n", " \"\"\"\n", - " Create a RNN using a RNN Cell\n", - " :param cell: RNN Cell\n", - " :param inputs: Input text data\n", - " :return: Tuple (Outputs, Final State)\n", + " Forwad and backward propigation on the neural network\n", + " :param decoder: The PyTorch Module that holds the neural network\n", + " :param decoder_optimizer: The PyTorch optimizer for the neural network\n", + " :param criterion: The PyTorch loss function\n", + " :param inp: A batch of input to the neural network\n", + " :param target: The target output for the batch of input\n", + " :return: The loss\n", " \"\"\"\n", " # TODO: Implement Function\n", - " return None, None\n", + "\n", + " return None\n", "\n", "\n", "\"\"\"\n", "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", "\"\"\"\n", - "tests.test_build_rnn(build_rnn)" + "tests.test_forward_back_prop(forward_back_prop)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Build the Neural Network\n", - "Apply the functions you implemented above to:\n", - "- Apply embedding to `input_data` using your `get_embed(input_data, vocab_size, embed_dim)` function.\n", - "- Build RNN using `cell` and your `build_rnn(cell, inputs)` function.\n", - "- Apply a fully connected layer with a linear activation and `vocab_size` as the number of outputs.\n", - "\n", - "Return the logits and final state in the following tuple (Logits, FinalState) " + "## Neural Network Training\n", + "With the structue of the network complete and data ready to be fed in the neural network, it's time to train it.\n", + "### Train Loop\n", + "The training loop is implemented for you in the `train_decoder` function. This function will train the network over all the batches for the number of epochs given. It's progress will be shown every number of batches. This number is set with the `show_every_n_batches` parameter. You'll set this parameter along with other paramters in the next section." ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim):\n", - " \"\"\"\n", - " Build part of the neural network\n", - " :param cell: RNN cell\n", - " :param rnn_size: Size of rnns\n", - " :param input_data: Input data\n", - " :param vocab_size: Vocabulary size\n", - " :param embed_dim: Number of embedding dimensions\n", - " :return: Tuple (Logits, FinalState)\n", - " \"\"\"\n", - " # TODO: Implement Function\n", - " return None, None\n", - "\n", - "\n", "\"\"\"\n", - "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "DON'T MODIFY ANYTHING IN THIS CELL\n", "\"\"\"\n", - "tests.test_build_nn(build_nn)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Batches\n", - "Implement `get_batches` to create batches of input and targets using `int_text`. The batches should be a Numpy array with the shape `(number of batches, 2, batch size, sequence length)`. 
Each batch contains two elements:\n", - "- The first element is a single batch of **input** with the shape `[batch size, sequence length]`\n", - "- The second element is a single batch of **targets** with the shape `[batch size, sequence length]`\n", + "from torch.autograd import Variable\n", "\n", - "If you can't fill the last batch with enough data, drop the last batch.\n", + "def train_decoder(batched_data, decoder, decoder_optimizer, criterion, n_epochs, show_every_n_batches):\n", + " chunk_losses = []\n", "\n", - "For example, `get_batches([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 3, 2)` would return a Numpy array of the following:\n", - "```\n", - "[\n", - " # First Batch\n", - " [\n", - " # Batch of Input\n", - " [[ 1 2], [ 7 8], [13 14]]\n", - " # Batch of targets\n", - " [[ 2 3], [ 8 9], [14 15]]\n", - " ]\n", + " print(\"Training for %d epoch(s)...\" % n_epochs)\n", + " for epoch_i in range(1, n_epochs + 1):\n", + " for batch_i, (inp, target) in enumerate(batched_data, 1):\n", + " loss = forward_back_prop(decoder, decoder_optimizer, criterion, Variable(inp), Variable(target))\n", + " chunk_losses.append(loss)\n", "\n", - " # Second Batch\n", - " [\n", - " # Batch of Input\n", - " [[ 3 4], [ 9 10], [15 16]]\n", - " # Batch of targets\n", - " [[ 4 5], [10 11], [16 17]]\n", - " ]\n", + " if batch_i % show_every_n_batches == 0:\n", + " print('Epoch: {:>4}/{:<4} Current Batch: {:>6.2f}% Loss: {}\\n'.format(\n", + " epoch_i, n_epochs, batch_i/len(batched_data) * 100, np.average(chunk_losses)))\n", + " chunk_losses = []\n", "\n", - " # Third Batch\n", - " [\n", - " # Batch of Input\n", - " [[ 5 6], [11 12], [17 18]]\n", - " # Batch of targets\n", - " [[ 6 7], [12 13], [18 1]]\n", - " ]\n", - "]\n", - "```\n", - "\n", - "Notice that the last target value in the last batch is the first input value of the first batch. In this case, `1`. This is a common technique used when creating sequence batches, although it is rather unintuitive." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def get_batches(int_text, batch_size, seq_length):\n", - " \"\"\"\n", - " Return batches of input and target\n", - " :param int_text: Text with the words replaced by their ids\n", - " :param batch_size: The size of batch\n", - " :param seq_length: The length of sequence\n", - " :return: Batches as a Numpy array\n", - " \"\"\"\n", - " # TODO: Implement Function\n", - " return None\n", - "\n", - "\n", - "\"\"\"\n", - "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", - "\"\"\"\n", - "tests.test_get_batches(get_batches)" + " return decoder" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Neural Network Training\n", "### Hyperparameters\n", - "Tune the following parameters:\n", - "\n", + "Set and train the neural network with the following parameters:\n", "- Set `num_epochs` to the number of epochs.\n", "- Set `batch_size` to the batch size.\n", "- Set `rnn_size` to the size of the RNNs.\n", "- Set `embed_dim` to the size of the embedding.\n", "- Set `seq_length` to the length of sequence.\n", "- Set `learning_rate` to the learning rate.\n", - "- Set `show_every_n_batches` to the number of batches the neural network should print progress." + "- Set `show_every_n_batches` to the number of batches the neural network should print progress.\n", + "\n", + "If the network isn't getting the desiered results, tweak these parameters and/or the layers in the `RNN` class." 
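Purely as an illustration of the scale these values usually start at on a corpus this small (not tuned results), the hyperparameter cell below might be filled in along these lines:

```python
num_epochs = 10          # full passes over the script data
batch_size = 128         # sequences per gradient step
sequence_length = 10     # words of context the RNN sees at once
learning_rate = 0.001    # a common starting point for Adam
```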
] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Number of Epochs\n", "num_epochs = None\n", "# Batch Size\n", "batch_size = None\n", - "# RNN Size\n", - "rnn_size = None\n", - "# Embedding Dimension Size\n", - "embed_dim = None\n", "# Sequence Length\n", - "seq_length = None\n", + "sequence_length = None\n", "# Learning Rate\n", "learning_rate = None\n", "# Show stats for every n number of batches\n", - "show_every_n_batches = None\n", - "\n", - "\"\"\"\n", - "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", - "\"\"\"\n", - "save_dir = './save'" + "show_every_n_batches = 100" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Build the Graph\n", - "Build the graph using the neural network you implemented." + "### Train\n", + "In the next cell, you'll train the neural network on the preprocessed data. If you have a hard time getting a good loss, check the [forums](https://discussions.udacity.com/) to see if anyone is having the similar problems." ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "\"\"\"\n", - "DON'T MODIFY ANYTHING IN THIS CELL\n", - "\"\"\"\n", - "from tensorflow.contrib import seq2seq\n", - "\n", - "train_graph = tf.Graph()\n", - "with train_graph.as_default():\n", - " vocab_size = len(int_to_vocab)\n", - " input_text, targets, lr = get_inputs()\n", - " input_data_shape = tf.shape(input_text)\n", - " cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)\n", - " logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size, embed_dim)\n", - "\n", - " # Probabilities for generating words\n", - " probs = tf.nn.softmax(logits, name='probs')\n", - "\n", - " # Loss function\n", - " cost = seq2seq.sequence_loss(\n", - " logits,\n", - " targets,\n", - " tf.ones([input_data_shape[0], input_data_shape[1]]))\n", - "\n", - " # Optimizer\n", - " optimizer = tf.train.AdamOptimizer(lr)\n", - "\n", - " # Gradient Clipping\n", - " gradients = optimizer.compute_gradients(cost)\n", - " capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]\n", - " train_op = optimizer.apply_gradients(capped_gradients)" - ] - }, - { - "cell_type": "markdown", "metadata": {}, - "source": [ - "## Train\n", - "Train the neural network on the preprocessed data. If you have a hard time getting a good loss, check the [forums](https://discussions.udacity.com/) to see if anyone is having the same problem." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, "outputs": [], "source": [ "\"\"\"\n", "DON'T MODIFY ANYTHING IN THIS CELL\n", "\"\"\"\n", - "batches = get_batches(int_text, batch_size, seq_length)\n", - "\n", - "with tf.Session(graph=train_graph) as sess:\n", - " sess.run(tf.global_variables_initializer())\n", + "vocab_size = len(vocab_to_int)\n", "\n", - " for epoch_i in range(num_epochs):\n", - " state = sess.run(initial_state, {input_text: batches[0][0]})\n", + "decoder = RNN(vocab_size, vocab_size, sequence_length)\n", + "decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)\n", + "criterion = nn.CrossEntropyLoss()\n", + "batched_data = batch_data(int_text, sequence_length, batch_size)\n", + "trained_decoder = train_decoder(batched_data, decoder, decoder_optimizer, criterion, num_epochs, show_every_n_batches)\n", "\n", - " for batch_i, (x, y) in enumerate(batches):\n", - " feed = {\n", - " input_text: x,\n", - " targets: y,\n", - " initial_state: state,\n", - " lr: learning_rate}\n", - " train_loss, state, _ = sess.run([cost, final_state, train_op], feed)\n", - "\n", - " # Show every batches\n", - " if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:\n", - " print('Epoch {:>3} Batch {:>4}/{} train_loss = {:.3f}'.format(\n", - " epoch_i,\n", - " batch_i,\n", - " len(batches),\n", - " train_loss))\n", - "\n", - " # Save Model\n", - " saver = tf.train.Saver()\n", - " saver.save(sess, save_dir)\n", - " print('Model Trained and Saved')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save Parameters\n", - "Save `seq_length` and `save_dir` for generating a new TV script." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "\"\"\"\n", - "DON'T MODIFY ANYTHING IN THIS CELL\n", - "\"\"\"\n", - "# Save parameters for checkpoint\n", - "helper.save_params((seq_length, save_dir))" + "helper.save_model('./save/trained_decoder', trained_decoder)\n", + "print('Model Trained and Saved')" ] }, { @@ -678,155 +461,100 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "DON'T MODIFY ANYTHING IN THIS CELL\n", "\"\"\"\n", - "import tensorflow as tf\n", - "import numpy as np\n", + "import torch\n", "import helper\n", "import problem_unittests as tests\n", "\n", "_, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()\n", - "seq_length, load_dir = helper.load_params()" + "trained_decoder = helper.load_model('./save/trained_decoder')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Implement Generate Functions\n", - "### Get Tensors\n", - "Get tensors from `loaded_graph` using the function [`get_tensor_by_name()`](https://www.tensorflow.org/api_docs/python/tf/Graph#get_tensor_by_name). Get the tensors using the following names:\n", - "- \"input:0\"\n", - "- \"initial_state:0\"\n", - "- \"final_state:0\"\n", - "- \"probs:0\"\n", - "\n", - "Return the tensors in the following tuple `(InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor)` " + "## Generate TV Script\n", + "With the network trained and saved, you'll use it to generate a Simpsons TV script in this section.\n", + "### Generate Text\n", + "To generate the text, the network needs to start with a single word and repeat it's predictions untill it reaches a set length. 
You'll be using the `generate` function to do this. It takes a word id to start with, `prime_id`, and generates a set length of text, `predict_len`." ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "def get_tensors(loaded_graph):\n", - " \"\"\"\n", - " Get input, initial state, final state, and probabilities tensor from \n", - " :param loaded_graph: TensorFlow graph loaded from file\n", - " :return: Tuple (InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor)\n", - " \"\"\"\n", - " # TODO: Implement Function\n", - " return None, None, None, None\n", - "\n", - "\n", "\"\"\"\n", "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", "\"\"\"\n", - "tests.test_get_tensors(get_tensors)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Choose Word\n", - "Implement the `pick_word()` function to select the next word using `probabilities`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def pick_word(probabilities, int_to_vocab):\n", + "def generate(decoder, prime_id, int_to_vocab, token_dict, pad_value, predict_len=100):\n", " \"\"\"\n", - " Pick the next word in the generated text\n", - " :param probabilities: Probabilites of the next word\n", - " :param int_to_vocab: Dictionary of word ids as the keys and words as the values\n", - " :return: String of the predicted word\n", + " Generate text using the neural network\n", + " :param decoder: The PyTorch Module that holds the trained neural network\n", + " :param prime_id: The word id to start the first prediction\n", + " :param int_to_vocab: Dict of word id keys to word values\n", + " :param token_dict: Dict of punctuation token keys to punctuation values\n", + " :param pad_value: The value used to pad a sequence\n", + " :param predict_len: The length of text to generate\n", + " :return: The generated text\n", " \"\"\"\n", - " # TODO: Implement Function\n", - " return None\n", - "\n", - "\n", - "\"\"\"\n", - "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", - "\"\"\"\n", - "tests.test_pick_word(pick_word)" + " current_seq = np.full((1, decoder.sequence_length), pad_value)\n", + " current_seq[-1][-1] = prime_id\n", + " predicted = [int_to_vocab[prime_id]]\n", + " \n", + " for _ in range(predict_len):\n", + " output = decoder(Variable(torch.LongTensor(current_seq)))\n", + " top_i = torch.multinomial(output.view(-1).exp().data, 1)[0]\n", + " word = int_to_vocab[top_i]\n", + " \n", + " predicted.append(word)\n", + " current_seq = np.roll(current_seq, -1, 1)\n", + " current_seq[-1][-1] = top_i\n", + " \n", + " gen_sentences = ' '.join(predicted)\n", + " \n", + " # Replace tokens\n", + " for key, token in token_dict.items():\n", + " ending = ' ' if key in ['\\n', '(', '\"'] else ''\n", + " gen_sentences = gen_sentences.replace(' ' + token.lower(), key)\n", + " gen_sentences = gen_sentences.replace('\\n ', '\\n')\n", + " gen_sentences = gen_sentences.replace('( ', '(')\n", + " \n", + " return gen_sentences" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Generate TV Script\n", - "This will generate the TV script for you. Set `gen_length` to the length of TV script you want to generate." + "### Generate a New Script\n", + "It's time to generate the text. 
Set `gen_length` to the length of TV script you want to generate and set `prime_word` to one of the following to start the prediction:\n", + "- \"homer_simpson\"\n", + "- \"moe_szyslak\"\n", + "- \"Barney_Gumble\"" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "gen_length = 200\n", - "# homer_simpson, moe_szyslak, or Barney_Gumble\n", - "prime_word = 'moe_szyslak'\n", + "prime_word = 'moe_szyslak' # homer_simpson, moe_szyslak, or Barney_Gumble\n", "\n", "\"\"\"\n", "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", "\"\"\"\n", - "loaded_graph = tf.Graph()\n", - "with tf.Session(graph=loaded_graph) as sess:\n", - " # Load saved model\n", - " loader = tf.train.import_meta_graph(load_dir + '.meta')\n", - " loader.restore(sess, load_dir)\n", - "\n", - " # Get Tensors from loaded model\n", - " input_text, initial_state, final_state, probs = get_tensors(loaded_graph)\n", - "\n", - " # Sentences generation setup\n", - " gen_sentences = [prime_word + ':']\n", - " prev_state = sess.run(initial_state, {input_text: np.array([[1]])})\n", - "\n", - " # Generate sentences\n", - " for n in range(gen_length):\n", - " # Dynamic Input\n", - " dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]\n", - " dyn_seq_length = len(dyn_input[0])\n", - "\n", - " # Get Prediction\n", - " probabilities, prev_state = sess.run(\n", - " [probs, final_state],\n", - " {input_text: dyn_input, initial_state: prev_state})\n", - " \n", - " pred_word = pick_word(probabilities[0][dyn_seq_length-1], int_to_vocab)\n", - "\n", - " gen_sentences.append(pred_word)\n", - " \n", - " # Remove tokens\n", - " tv_script = ' '.join(gen_sentences)\n", - " for key, token in token_dict.items():\n", - " ending = ' ' if key in ['\\n', '(', '\"'] else ''\n", - " tv_script = tv_script.replace(' ' + token.lower(), key)\n", - " tv_script = tv_script.replace('\\n ', '\\n')\n", - " tv_script = tv_script.replace('( ', '(')\n", - " \n", - " print(tv_script)" + "pad_word = helper.SPECIAL_WORDS['PADDING']\n", + "print(generate(trained_decoder, vocab_to_int[prime_word + ':'], int_to_vocab, token_dict, vocab_to_int[pad_word], gen_length))" ] }, { @@ -834,7 +562,7 @@ "metadata": {}, "source": [ "# The TV Script is Nonsensical\n", - "It's ok if the TV script doesn't make any sense. We trained on less than a megabyte of text. In order to get good results, you'll have to use a smaller vocabulary or get more data. Luckily there's more data! As we mentioned in the beggining of this project, this is a subset of [another dataset](https://www.kaggle.com/wcukierski/the-simpsons-by-the-data). We didn't have you train on all the data, because that would take too long. However, you are free to train your neural network on all the data. After you complete the project, of course.\n", + "It's ok if the TV script doesn't make any sense. We trained on less than a megabyte of text. In order to get good results, you'll have to use a smaller vocabulary or get more data. Luckly there's more data! As we mentioned in the begging of this project, this is a subset of [another dataset](https://www.kaggle.com/wcukierski/the-simpsons-by-the-data). We didn't have you train on all the data, because that would take too long. However, you are free to train your neural network on all the data. After you complete the project, of course.\n", "# Submitting This Project\n", "When submitting this project, make sure to run all the cells before saving the notebook. 
Save the notebook file as \"dlnd_tv_script_generation.ipynb\" and save it as a HTML file under \"File\" -> \"Download as\". Include the \"helper.py\" and \"problem_unittests.py\" files in your submission." ] diff --git a/tv-script-generation/helper.py b/tv-script-generation/helper.py index eec8857629..fc76bd52d3 100644 --- a/tv-script-generation/helper.py +++ b/tv-script-generation/helper.py @@ -1,5 +1,9 @@ import os import pickle +import torch + + +SPECIAL_WORDS = {'PADDING': ''} def load_data(path): @@ -29,7 +33,7 @@ def preprocess_and_save_data(dataset_path, token_lookup, create_lookup_tables): text = text.lower() text = text.split() - vocab_to_int, int_to_vocab = create_lookup_tables(text) + vocab_to_int, int_to_vocab = create_lookup_tables(text + list(SPECIAL_WORDS.values())) int_text = [vocab_to_int[word] for word in text] pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), open('preprocess.p', 'wb')) @@ -41,15 +45,11 @@ def load_preprocess(): return pickle.load(open('preprocess.p', mode='rb')) -def save_params(params): - """ - Save parameters to file - """ - pickle.dump(params, open('params.p', 'wb')) +def save_model(filename, decoder): + save_filename = os.path.splitext(os.path.basename(filename))[0] + '.pt' + torch.save(decoder, save_filename) -def load_params(): - """ - Load parameters from file - """ - return pickle.load(open('params.p', mode='rb')) +def load_model(filename): + save_filename = os.path.splitext(os.path.basename(filename))[0] + '.pt' + return torch.load(save_filename) diff --git a/tv-script-generation/problem_unittests.py b/tv-script-generation/problem_unittests.py index aa3d7095d5..3dc3f0a6b5 100644 --- a/tv-script-generation/problem_unittests.py +++ b/tv-script-generation/problem_unittests.py @@ -1,312 +1,272 @@ +from unittest.mock import MagicMock, patch import numpy as np -import tensorflow as tf -from tensorflow.contrib import rnn +import torch -def _print_success_message(): - print('Tests Passed') - - -def test_create_lookup_tables(create_lookup_tables): - with tf.Graph().as_default(): - test_text = ''' - Moe_Szyslak Moe's Tavern Where the elite meet to drink - Bart_Simpson Eh yeah hello is Mike there Last name Rotch - Moe_Szyslak Hold on I'll check Mike Rotch Mike Rotch Hey has anybody seen Mike Rotch lately - Moe_Szyslak Listen you little puke One of these days I'm gonna catch you and I'm gonna carve my name on your back with an ice pick - Moe_Szyslak Whats the matter Homer You're not your normal effervescent self - Homer_Simpson I got my problems Moe Give me another one - Moe_Szyslak Homer hey you should not drink to forget your problems - Barney_Gumble Yeah you should only drink to enhance your social skills''' - - test_text = test_text.lower() - test_text = test_text.split() - - vocab_to_int, int_to_vocab = create_lookup_tables(test_text) - - # Check types - assert isinstance(vocab_to_int, dict),\ - 'vocab_to_int is not a dictionary.' - assert isinstance(int_to_vocab, dict),\ - 'int_to_vocab is not a dictionary.' - - # Compare lengths of dicts - assert len(vocab_to_int) == len(int_to_vocab),\ - 'Length of vocab_to_int and int_to_vocab don\'t match. ' \ - 'vocab_to_int is length {}. int_to_vocab is length {}'.format(len(vocab_to_int), len(int_to_vocab)) - - # Make sure the dicts have the same words - vocab_to_int_word_set = set(vocab_to_int.keys()) - int_to_vocab_word_set = set(int_to_vocab.values()) - - assert not (vocab_to_int_word_set - int_to_vocab_word_set),\ - 'vocab_to_int and int_to_vocab don\'t have the same words.' 
\ - '{} found in vocab_to_int, but not in int_to_vocab'.format(vocab_to_int_word_set - int_to_vocab_word_set) - assert not (int_to_vocab_word_set - vocab_to_int_word_set),\ - 'vocab_to_int and int_to_vocab don\'t have the same words.' \ - '{} found in int_to_vocab, but not in vocab_to_int'.format(int_to_vocab_word_set - vocab_to_int_word_set) - - # Make sure the dicts have the same word ids - vocab_to_int_word_id_set = set(vocab_to_int.values()) - int_to_vocab_word_id_set = set(int_to_vocab.keys()) - - assert not (vocab_to_int_word_id_set - int_to_vocab_word_id_set),\ - 'vocab_to_int and int_to_vocab don\'t contain the same word ids.' \ - '{} found in vocab_to_int, but not in int_to_vocab'.format(vocab_to_int_word_id_set - int_to_vocab_word_id_set) - assert not (int_to_vocab_word_id_set - vocab_to_int_word_id_set),\ - 'vocab_to_int and int_to_vocab don\'t contain the same word ids.' \ - '{} found in int_to_vocab, but not in vocab_to_int'.format(int_to_vocab_word_id_set - vocab_to_int_word_id_set) - - # Make sure the dicts make the same lookup - missmatches = [(word, id, id, int_to_vocab[id]) for word, id in vocab_to_int.items() if int_to_vocab[id] != word] - - assert not missmatches,\ - 'Found {} missmatche(s). First missmatch: vocab_to_int[{}] = {} and int_to_vocab[{}] = {}'.format( - len(missmatches), - *missmatches[0]) - - assert len(vocab_to_int) > len(set(test_text))/2,\ - 'The length of vocab seems too small. Found a length of {}'.format(len(vocab_to_int)) +class _TestNN(torch.nn.Module): + def __init__(self, input_size, output_size): + super(_TestNN, self).__init__() + self.decoder = torch.nn.Linear(input_size, output_size) + self.forward_called = False - _print_success_message() + def forward(self, nn_input): + self.forward_called = True + output = self.decoder(nn_input) + return output -def test_get_batches(get_batches): - with tf.Graph().as_default(): - test_batch_size = 128 - test_seq_length = 5 - test_int_text = list(range(1000*test_seq_length)) - batches = get_batches(test_int_text, test_batch_size, test_seq_length) - # Check type - assert isinstance(batches, np.ndarray),\ - 'Batches is not a Numpy array' +def _print_success_message(): + print('Tests Passed') - # Check shape - assert batches.shape == (7, 2, 128, 5),\ - 'Batches returned wrong shape. Found {}'.format(batches.shape) - for x in range(batches.shape[2]): - assert np.array_equal(batches[0,0,x], np.array(range(x * 35, x * 35 + batches.shape[3]))),\ - 'Batches returned wrong contents. For example, input sequence {} in the first batch was {}'.format(x, batches[0,0,x]) - assert np.array_equal(batches[0,1,x], np.array(range(x * 35 + 1, x * 35 + 1 + batches.shape[3]))),\ - 'Batches returned wrong contents. For example, target sequence {} in the first batch was {}'.format(x, batches[0,1,x]) +class AssertTest(object): + def __init__(self, params): + self.assert_param_message = '\n'.join([str(k) + ': ' + str(v) + '' for k, v in params.items()]) + def test(self, assert_condition, assert_message): + assert assert_condition, assert_message + '\n\nUnit Test Function Parameters\n' + self.assert_param_message - last_seq_target = (test_batch_size-1) * 35 + 31 - last_seq = np.array(range(last_seq_target, last_seq_target+ batches.shape[3])) - last_seq[-1] = batches[0,0,0,0] - assert np.array_equal(batches[-1,1,-1], last_seq),\ - 'The last target of the last batch should be the first input of the first batch. 
Found {} but expected {}'.format(batches[-1,1,-1], last_seq) +def test_create_lookup_tables(create_lookup_tables): + test_text = ''' + Moe_Szyslak Moe's Tavern Where the elite meet to drink + Bart_Simpson Eh yeah hello is Mike there Last name Rotch + Moe_Szyslak Hold on I'll check Mike Rotch Mike Rotch Hey has anybody seen Mike Rotch lately + Moe_Szyslak Listen you little puke One of these days I'm gonna catch you and I'm gonna carve my name on your back with an ice pick + Moe_Szyslak Whats the matter Homer You're not your normal effervescent self + Homer_Simpson I got my problems Moe Give me another one + Moe_Szyslak Homer hey you should not drink to forget your problems + Barney_Gumble Yeah you should only drink to enhance your social skills''' + + test_text = test_text.lower() + test_text = test_text.split() + + vocab_to_int, int_to_vocab = create_lookup_tables(test_text) + + # Check types + assert isinstance(vocab_to_int, dict),\ + 'vocab_to_int is not a dictionary.' + assert isinstance(int_to_vocab, dict),\ + 'int_to_vocab is not a dictionary.' + + # Compare lengths of dicts + assert len(vocab_to_int) == len(int_to_vocab),\ + 'Length of vocab_to_int and int_to_vocab don\'t match. ' \ + 'vocab_to_int is length {}. int_to_vocab is length {}'.format(len(vocab_to_int), len(int_to_vocab)) + + # Make sure the dicts have the same words + vocab_to_int_word_set = set(vocab_to_int.keys()) + int_to_vocab_word_set = set(int_to_vocab.values()) + + assert not (vocab_to_int_word_set - int_to_vocab_word_set),\ + 'vocab_to_int and int_to_vocab don\'t have the same words.' \ + '{} found in vocab_to_int, but not in int_to_vocab'.format(vocab_to_int_word_set - int_to_vocab_word_set) + assert not (int_to_vocab_word_set - vocab_to_int_word_set),\ + 'vocab_to_int and int_to_vocab don\'t have the same words.' \ + '{} found in int_to_vocab, but not in vocab_to_int'.format(int_to_vocab_word_set - vocab_to_int_word_set) + + # Make sure the dicts have the same word ids + vocab_to_int_word_id_set = set(vocab_to_int.values()) + int_to_vocab_word_id_set = set(int_to_vocab.keys()) + + assert not (vocab_to_int_word_id_set - int_to_vocab_word_id_set),\ + 'vocab_to_int and int_to_vocab don\'t contain the same word ids.' \ + '{} found in vocab_to_int, but not in int_to_vocab'.format(vocab_to_int_word_id_set - int_to_vocab_word_id_set) + assert not (int_to_vocab_word_id_set - vocab_to_int_word_id_set),\ + 'vocab_to_int and int_to_vocab don\'t contain the same word ids.' \ + '{} found in int_to_vocab, but not in vocab_to_int'.format(int_to_vocab_word_id_set - vocab_to_int_word_id_set) + + # Make sure the dicts make the same lookup + missmatches = [(word, id, id, int_to_vocab[id]) for word, id in vocab_to_int.items() if int_to_vocab[id] != word] + + assert not missmatches,\ + 'Found {} missmatche(s). First missmatch: vocab_to_int[{}] = {} and int_to_vocab[{}] = {}'.format( + len(missmatches), + *missmatches[0]) + + assert len(vocab_to_int) > len(set(test_text))/2,\ + 'The length of vocab seems too small. 
Found a length of {}'.format(len(vocab_to_int)) _print_success_message() def test_tokenize(token_lookup): - with tf.Graph().as_default(): - symbols = set(['.', ',', '"', ';', '!', '?', '(', ')', '--', '\n']) - token_dict = token_lookup() + symbols = set(['.', ',', '"', ';', '!', '?', '(', ')', '--', '\n']) + token_dict = token_lookup() - # Check type - assert isinstance(token_dict, dict), \ - 'Returned type is {}.'.format(type(token_dict)) + # Check type + assert isinstance(token_dict, dict), \ + 'Returned type is {}.'.format(type(token_dict)) - # Check symbols - missing_symbols = symbols - set(token_dict.keys()) - unknown_symbols = set(token_dict.keys()) - symbols + # Check symbols + missing_symbols = symbols - set(token_dict.keys()) + unknown_symbols = set(token_dict.keys()) - symbols - assert not missing_symbols, \ - 'Missing symbols: {}'.format(missing_symbols) - assert not unknown_symbols, \ - 'Unknown symbols: {}'.format(unknown_symbols) + assert not missing_symbols, \ + 'Missing symbols: {}'.format(missing_symbols) + assert not unknown_symbols, \ + 'Unknown symbols: {}'.format(unknown_symbols) - # Check values type - bad_value_type = [type(val) for val in token_dict.values() if not isinstance(val, str)] + # Check values type + bad_value_type = [type(val) for val in token_dict.values() if not isinstance(val, str)] - assert not bad_value_type,\ - 'Found token as {} type.'.format(bad_value_type[0]) + assert not bad_value_type,\ + 'Found token as {} type.'.format(bad_value_type[0]) - # Check for spaces - key_has_spaces = [k for k in token_dict.keys() if ' ' in k] - val_has_spaces = [val for val in token_dict.values() if ' ' in val] + # Check for spaces + key_has_spaces = [k for k in token_dict.keys() if ' ' in k] + val_has_spaces = [val for val in token_dict.values() if ' ' in val] - assert not key_has_spaces,\ - 'The key "{}" includes spaces. Remove spaces from keys and values'.format(key_has_spaces[0]) - assert not val_has_spaces,\ - 'The value "{}" includes spaces. Remove spaces from keys and values'.format(val_has_spaces[0]) + assert not key_has_spaces,\ + 'The key "{}" includes spaces. Remove spaces from keys and values'.format(key_has_spaces[0]) + assert not val_has_spaces,\ + 'The value "{}" includes spaces. Remove spaces from keys and values'.format(val_has_spaces[0]) - # Check for symbols in values - symbol_val = () - for symbol in symbols: - for val in token_dict.values(): - if symbol in val: - symbol_val = (symbol, val) + # Check for symbols in values + symbol_val = () + for symbol in symbols: + for val in token_dict.values(): + if symbol in val: + symbol_val = (symbol, val) - assert not symbol_val,\ - 'Don\'t use a symbol that will be replaced in your tokens. Found the symbol {} in value {}'.format(*symbol_val) + assert not symbol_val,\ + 'Don\'t use a symbol that will be replaced in your tokens. Found the symbol {} in value {}'.format(*symbol_val) _print_success_message() -def test_get_inputs(get_inputs): - with tf.Graph().as_default(): - input_data, targets, lr = get_inputs() - - # Check type - assert input_data.op.type == 'Placeholder',\ - 'Input not a Placeholder.' - assert targets.op.type == 'Placeholder',\ - 'Targets not a Placeholder.' - assert lr.op.type == 'Placeholder',\ - 'Learning Rate not a Placeholder.' - - # Check name - assert input_data.name == 'input:0',\ - 'Input has bad name. 
Found name {}'.format(input_data.name) - - # Check rank - input_rank = 0 if input_data.get_shape() == None else len(input_data.get_shape()) - targets_rank = 0 if targets.get_shape() == None else len(targets.get_shape()) - lr_rank = 0 if lr.get_shape() == None else len(lr.get_shape()) - - assert input_rank == 2,\ - 'Input has wrong rank. Rank {} found.'.format(input_rank) - assert targets_rank == 2,\ - 'Targets has wrong rank. Rank {} found.'.format(targets_rank) - assert lr_rank == 0,\ - 'Learning Rate has wrong rank. Rank {} found'.format(lr_rank) +def test_batch_data(batch_data): + text_size = 22 + sequence_length = 3 + batch_size = 4 + int_text = np.arange(text_size).tolist() + feature_batches_flatten = np.array( + [int_text[i:i + sequence_length] for i in range(text_size - sequence_length)]).flatten() + label_batches_flatten = np.array(int_text[sequence_length:]) + + assert_test = AssertTest({'Input Text': int_text, 'Sequence Length': sequence_length, 'Batch Size': batch_size}) + + data_loader = batch_data(int_text, sequence_length, batch_size) + assert_condition = type(data_loader) == torch.utils.data.DataLoader + assert_message = 'Wront type returned. Expected type {}, got type {}'.format(torch.utils.data.DataLoader, type(data_loader)) + assert_test.test(assert_condition, assert_message) + + data_batches = list(data_loader) + correct_n_batches = int(text_size / batch_size) + assert_condition = len(data_batches) == correct_n_batches + assert_message = 'Number of batches is incorrect. It should be {}, found {}'.format(correct_n_batches, + len(data_batches)) + assert_test.test(assert_condition, assert_message) + + batch_shapes = [len(batch) for batch in data_batches] + assert_condition = set(batch_shapes) == {2} + assert_message = 'Each batch should have features and a label (2). Found the following lengths in batches: {}'.format( + set(batch_shapes)) + assert_test.test(assert_condition, assert_message) + + feature_tensor_shapes = [(tuple(batch[0].size())) for batch in data_batches] + assert_condition = set(feature_tensor_shapes[:-1]) == {(4, 3)} + assert_message = 'The first {} batches for these parameters should have features of shape (4,3). Found the following shapes: {}'.format( + correct_n_batches - 1, set(feature_tensor_shapes[:-1])) + assert_test.test(assert_condition, assert_message) + + assert_condition = feature_tensor_shapes[-1] == (3, 3) + assert_message = 'The last batch for these parameters should have a feature with shape of (3,3). Found a shape of {}'.format( + feature_tensor_shapes[-1]) + assert_test.test(assert_condition, assert_message) + + label_tensor_shapes = [(tuple(batch[1].size())) for batch in data_batches] + assert_condition = set(label_tensor_shapes[:-1]) == {(4,)} + assert_message = 'The first {} batches for these parameters should have a label of shape (4,3)'.format( + correct_n_batches - 1) + assert_test.test(assert_condition, assert_message) + + assert_condition = label_tensor_shapes[-1] == (3,) + assert_message = 'The last batch for these parameters should have a label with shape (3,). Found a shape of {}'.format( + label_tensor_shapes[-1]) + assert_test.test(assert_condition, assert_message) + + feature_tensor_types = [type(batch[0]) for batch in data_batches] + assert_condition = set(feature_tensor_types) == {torch.LongTensor} + assert_message = 'Each feature Tensor should be a type LongTensor. 
Found the following type(s): {}'.format( + set(feature_tensor_types)) + assert_test.test(assert_condition, assert_message) + + label_tensor_types = [type(batch[1]) for batch in data_batches] + assert_condition = set(label_tensor_types) == {torch.LongTensor} + assert_message = 'Each label Tensor should be a type LongTensor. Found the following type(s): {}'.format( + set(feature_tensor_types)) + assert_test.test(assert_condition, assert_message) + + feature_tensors = np.concatenate([batch[0].view(-1) for batch in data_batches]) + assert_condition = (feature_tensors == feature_batches_flatten).all() + assert_message = 'Wrong values for features. Output:\n{}'.format(data_batches) + assert_test.test(assert_condition, assert_message) + + label_tensors = np.concatenate([batch[1].view(-1) for batch in data_batches]) + assert_condition = (label_tensors == label_batches_flatten).all() + assert_message = 'Wrong values for labels. Output:\n{}'.format(data_batches) + assert_test.test(assert_condition, assert_message) _print_success_message() -def test_get_init_cell(get_init_cell): - with tf.Graph().as_default(): - test_batch_size_ph = tf.placeholder(tf.int32, []) - test_rnn_size = 256 - - cell, init_state = get_init_cell(test_batch_size_ph, test_rnn_size) - - # Check type - assert isinstance(cell, tf.contrib.rnn.MultiRNNCell),\ - 'Cell is wrong type. Found {} type'.format(type(cell)) +def test_rnn(RNN): + batch_size = 50 + sequence_length = 3 + input_size = 20 + output_size = 10 + decoder = RNN(input_size, output_size, sequence_length) - # Check for name attribute - assert hasattr(init_state, 'name'),\ - 'Initial state doesn\'t have the "name" attribute. Try using `tf.identity` to set the name.' + a = np.random.randint(input_size, size=(batch_size, sequence_length)) + b = torch.LongTensor(a) + nn_input = torch.autograd.Variable(b) - # Check name - assert init_state.name == 'initial_state:0',\ - 'Initial state doesn\'t have the correct name. Found the name {}'.format(init_state.name) + output = decoder(nn_input) + assert_test = AssertTest({ + 'Input Size': input_size, + 'Output Size': output_size, + 'Sequence Length': sequence_length, + 'Input': nn_input}) - _print_success_message() - - -def test_get_embed(get_embed): - with tf.Graph().as_default(): - embed_shape = [50, 5, 256] - test_input_data = tf.placeholder(tf.int32, embed_shape[:2]) - test_vocab_size = 27 - test_embed_dim = embed_shape[2] + assert_condition = type(output) == torch.autograd.Variable + assert_message = 'Wrong output type. Expected type {}. Got type {}'.format(torch.autograd.Variable, type(output)) + assert_test.test(assert_condition, assert_message) - embed = get_embed(test_input_data, test_vocab_size, test_embed_dim) + correct_output_size = (batch_size, output_size) + assert_condition = output.size() == correct_output_size + assert_message = 'Wrong output size. Expected type {}. Got type {}'.format(correct_output_size, output.size()) + assert_test.test(assert_condition, assert_message) - # Check shape - assert embed.shape == embed_shape,\ - 'Wrong shape. Found shape {}'.format(embed.shape) + assert_condition = type(output.data) == torch.FloatTensor + assert_message = 'Wrong output data type. Expected a Variable with data of type {}. 
Got data of type {}'\ + .format(torch.FloatTensor, type(output.data)) + assert_test.test(assert_condition, assert_message) _print_success_message() -def test_build_rnn(build_rnn): - with tf.Graph().as_default(): - test_rnn_size = 256 - test_rnn_layer_size = 2 - test_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(test_rnn_size) for _ in range(test_rnn_layer_size)]) +def test_forward_back_prop(forward_back_prop): + batch_size = 200 + input_size = 20 + output_size = 10 + learning_rate = 0.01 - test_inputs = tf.placeholder(tf.float32, [None, None, test_rnn_size]) - outputs, final_state = build_rnn(test_cell, test_inputs) + mock_decoder = MagicMock(wraps=_TestNN(input_size, output_size)) + mock_decoder_optimizer = MagicMock(wraps=torch.optim.Adam(mock_decoder.parameters(), lr=learning_rate)) + mock_criterion = MagicMock(wraps=torch.nn.CrossEntropyLoss()) - # Check name - assert hasattr(final_state, 'name'),\ - 'Final state doesn\'t have the "name" attribute. Try using `tf.identity` to set the name.' - assert final_state.name == 'final_state:0',\ - 'Final state doesn\'t have the correct name. Found the name {}'.format(final_state.name) + with patch.object(torch.autograd, 'backward', wraps=torch.autograd.backward) as mock_autograd_backward: + inp = torch.autograd.Variable(torch.FloatTensor(np.random.rand(batch_size, input_size))) + target = torch.autograd.Variable(torch.LongTensor(np.random.randint(output_size, size=batch_size))) - # Check shape - assert outputs.get_shape().as_list() == [None, None, test_rnn_size],\ - 'Outputs has wrong shape. Found shape {}'.format(outputs.get_shape()) - assert final_state.get_shape().as_list() == [test_rnn_layer_size, 2, None, test_rnn_size],\ - 'Final state wrong shape. Found shape {}'.format(final_state.get_shape()) - - _print_success_message() + loss = forward_back_prop(mock_decoder, mock_decoder_optimizer, mock_criterion, inp, target) - -def test_build_nn(build_nn): - with tf.Graph().as_default(): - test_input_data_shape = [128, 5] - test_input_data = tf.placeholder(tf.int32, test_input_data_shape) - test_rnn_size = 256 - test_embed_dim = 300 - test_rnn_layer_size = 2 - test_vocab_size = 27 - test_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(test_rnn_size) for _ in range(test_rnn_layer_size)]) - - logits, final_state = build_nn(test_cell, test_rnn_size, test_input_data, test_vocab_size, test_embed_dim) - - # Check name - assert hasattr(final_state, 'name'), \ - 'Final state doesn\'t have the "name" attribute. Are you using build_rnn?' - assert final_state.name == 'final_state:0', \ - 'Final state doesn\'t have the correct name. Found the name {}. Are you using build_rnn?'.format(final_state.name) - - # Check Shape - assert logits.get_shape().as_list() == test_input_data_shape + [test_vocab_size], \ - 'Outputs has wrong shape. Found shape {}'.format(logits.get_shape()) - assert final_state.get_shape().as_list() == [test_rnn_layer_size, 2, 128, test_rnn_size], \ - 'Final state wrong shape. 
Found shape {}'.format(final_state.get_shape()) - - _print_success_message() - - -def test_get_tensors(get_tensors): - test_graph = tf.Graph() - with test_graph.as_default(): - test_input = tf.placeholder(tf.int32, name='input') - test_initial_state = tf.placeholder(tf.int32, name='initial_state') - test_final_state = tf.placeholder(tf.int32, name='final_state') - test_probs = tf.placeholder(tf.float32, name='probs') - - input_text, initial_state, final_state, probs = get_tensors(test_graph) - - # Check correct tensor - assert input_text == test_input,\ - 'Test input is wrong tensor' - assert initial_state == test_initial_state, \ - 'Initial state is wrong tensor' - assert final_state == test_final_state, \ - 'Final state is wrong tensor' - assert probs == test_probs, \ - 'Probabilities is wrong tensor' + assert mock_decoder.zero_grad.called, 'Didn\'t set the gradients to 0.' + assert mock_decoder.forward_called, 'Forward propagation not called.' + assert mock_autograd_backward.called, 'Backward propagation not called' + assert mock_decoder_optimizer.step.called, 'Optimization step not performed' + assert type(loss) == float, 'Wrong return type. Exptected {}, got {}'.format(float, type(loss)) _print_success_message() - - -def test_pick_word(pick_word): - with tf.Graph().as_default(): - test_probabilities = np.array([0.1, 0.8, 0.05, 0.05]) - test_int_to_vocab = {word_i: word for word_i, word in enumerate(['this', 'is', 'a', 'test'])} - - pred_word = pick_word(test_probabilities, test_int_to_vocab) - - # Check type - assert isinstance(pred_word, str),\ - 'Predicted word is wrong type. Found {} type.'.format(type(pred_word)) - - # Check word is from vocab - assert pred_word in test_int_to_vocab.values(),\ - 'Predicted word not found in int_to_vocab.' - - - _print_success_message() - diff --git a/tv-script-generation/requirements.txt b/tv-script-generation/requirements.txt index 4cd90f9a86..94dad0adac 100644 --- a/tv-script-generation/requirements.txt +++ b/tv-script-generation/requirements.txt @@ -1,44 +1,24 @@ -appdirs==1.4.3 appnope==0.1.0 -bleach==2.0.0 -decorator==4.0.11 -entrypoints==0.2.2 -html5lib==0.999999999 -ipykernel==4.5.2 -ipython==5.3.0 +decorator==4.2.1 +ipykernel==4.8.2 +ipython==6.2.1 ipython-genutils==0.2.0 -ipywidgets==6.0.0 -Jinja2==2.9.5 -jsonschema==2.6.0 -jupyter==1.0.0 -jupyter-client==5.0.0 -jupyter-console==5.1.0 -jupyter-core==4.3.0 -MarkupSafe==1.0 -mistune==0.7.4 -nbconvert==5.1.1 -nbformat==4.3.0 -notebook==4.4.1 -numpy==1.12.1 -packaging==16.8 -pandocfilters==1.4.1 -pexpect==4.2.1 +jedi==0.11.1 +jupyter-client==5.2.3 +jupyter-core==4.4.0 +numpy==1.14.2 +parso==0.1.1 +pexpect==4.4.0 pickleshare==0.7.4 -prompt-toolkit==1.0.14 -protobuf==3.2.0 -ptyprocess==0.5.1 +prompt-toolkit==1.0.15 +ptyprocess==0.5.2 Pygments==2.2.0 -pyparsing==2.2.0 -python-dateutil==2.6.0 -pyzmq==16.0.2 -qtconsole==4.3.0 +python-dateutil==2.7.2 +PyYAML==3.12 +pyzmq==17.0.0 simplegeneric==0.8.1 -six==1.10.0 -tensorflow==1.0.0 -terminado==0.6 -testpath==0.3 -tornado==4.4.3 +six==1.11.0 +torch==0.3.0.post4 +tornado==5.0.1 traitlets==4.3.2 wcwidth==0.1.7 -webencodings==0.5 -widgetsnbextension==2.0.0