|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "# 1. Let's make it more idiomatic" |
| 8 | + ] |
| 9 | + }, |
| 10 | + { |
| 11 | + "cell_type": "markdown", |
| 12 | + "metadata": {}, |
| 13 | + "source": [ |
| 14 | + "Your task is to refactor the following report generation code to be more idiomatic. The existing implementation was written by an unknown developer who did not know anything about the idioms of Python. Luckily the unknown developer documented the implementation decently and wrote some tests for it. \n", |
| 15 | + "\n", |
| 16 | + "### The specification of the report generation\n", |
| 17 | + "\n", |
| 18 | + "This file content:\n", |
| 19 | + "```\n", |
| 20 | + "something\n", |
| 21 | + "1\n", |
| 22 | + "7\n", |
| 23 | + "somEThing\n", |
| 24 | + "\n", |
| 25 | + "2\n", |
| 26 | + "wassup\n", |
| 27 | + "woop\n", |
| 28 | + "woop\n", |
| 29 | + "something\n", |
| 30 | + "WoOP\n", |
| 31 | + "```\n", |
| 32 | + "\n", |
| 33 | + "Should yield this report:\n", |
| 34 | + "```\n", |
| 35 | + "missing values: 1\n", |
| 36 | + "highest number: 7.0\n", |
| 37 | + "most common words: something, woop\n", |
| 38 | + "occurrences of most common: 3\n", |
| 39 | + "#####\n", |
| 40 | + "numbers: [1.0, 7.0, 2.0]\n", |
| 41 | + "words: ['something', 'something', 'wassup', 'woop', 'woop', 'something', 'woop']\n", |
| 42 | + "```\n", |
| 43 | + "\n", |
| 44 | + "Note:\n", |
| 45 | + "* all numbers of the input file should be presented as floats in the report\n", |
| 46 | + "* all words are presented in lowercase in the report\n", |
| 47 | + "* when determining the most common words, counting is case-insensitive (in other words, `'WoOp'` is considered the same word as `'woop'`)\n", |
| 48 | + "* if several different words share the highest count, all of them are listed in alphabetical order, separated by a comma and a space (as in the example above)\n", |
| 49 | + "* there are more examples in the tests\n", |
| 50 | + "\n", |
| 51 | + "Run the cell containing the existing implementation and then run the tests to verify that it works correctly. Then make sure that you understand how the legacy implementation works. After that, start refactoring, function by function. Good luck!" |
| 52 | + ] |
| 53 | + }, |
| 54 | + { |
| 55 | + "cell_type": "code", |
| 56 | + "execution_count": null, |
| 57 | + "metadata": { |
| 58 | + "editable": false |
| 59 | + }, |
| 60 | + "outputs": [], |
| 61 | + "source": [ |
| 62 | + "def get_report(path):\n", |
| 63 | + " \"\"\"\n", |
| 64 | + " Creates a report of the file specified as argument.\n", |
| 65 | + "\n", |
| 66 | + " :param path: path to file from which the report should be created (string)\n", |
| 67 | + " :return: the report (string)\n", |
| 68 | + " \"\"\"\n", |
| 69 | + " data = _read_file(path)\n", |
| 70 | + " missing_count = data[0]\n", |
| 71 | + " numbers = data[1]\n", |
| 72 | + " words = data[2]\n", |
| 73 | + " report = _make_report(missing_count, numbers, words)\n", |
| 74 | + " return report\n", |
| 75 | + "\n", |
| 76 | + "\n", |
| 77 | + "def _read_file(path):\n", |
| 78 | + " \"\"\"\n", |
| 79 | + " Reads and returns the data from the file specified as argument.\n", |
| 80 | + "\n", |
| 81 | + " :param path: path to the file to be read.\n", |
| 82 | + " :return: a tuple containing\n", |
| 83 | + " 1. the number of empty lines (int)\n", |
| 84 | + " 2. numeric values (list of floats)\n", |
| 85 | + " 3. non-numeric values (list of strings)\n", |
| 86 | + " \"\"\"\n", |
| 87 | + " data_file = open(path, 'r')\n", |
| 88 | + " lines = data_file.readlines()\n", |
| 89 | + " line_count = len(lines)\n", |
| 90 | + " idx = 0\n", |
| 91 | + " empty_lines = 0\n", |
| 92 | + " words = []\n", |
| 93 | + " numbers = []\n", |
| 94 | + " while idx < line_count:\n", |
| 95 | + " line = lines[idx]\n", |
| 96 | + " line = line.strip()\n", |
| 97 | + " if line == '':\n", |
| 98 | + " empty_lines = empty_lines + 1\n", |
| 99 | + " else:\n", |
| 100 | + " is_number = False\n", |
| 101 | + " try:\n", |
| 102 | + " number = float(line)\n", |
| 103 | + " is_number = True\n", |
| 104 | + " except Exception:\n", |
| 105 | + " pass\n", |
| 106 | + "\n", |
| 107 | + " if is_number:\n", |
| 108 | + " numbers.append(number)\n", |
| 109 | + " else:\n", |
| 110 | + " words.append(line)\n", |
| 111 | + " idx = idx + 1\n", |
| 112 | + " data_file.close()\n", |
| 113 | + "\n", |
| 114 | + " return empty_lines, numbers, words\n", |
| 115 | + "\n", |
| 116 | + "\n", |
| 117 | + "def _make_report(missing_values, numbers, words):\n", |
| 118 | + " \"\"\"\n", |
| 119 | + " Creates and a report based on data given as arguments.\n", |
| 120 | + "\n", |
| 121 | + " :param missing_values: number of empty lines (int)\n", |
| 122 | + " :param numbers: numeric values (list of floats)\n", |
| 123 | + " :param words: non numeric values (list of strings)\n", |
| 124 | + " :return: the generated report (string)\n", |
| 125 | + " \"\"\"\n", |
| 126 | + " max_value = _get_max_value(numbers)\n", |
| 127 | + " lower_case_words = _words_to_lowercase(words)\n", |
| 128 | + " most_common_info = _get_most_common_word(lower_case_words)\n", |
| 129 | + " most_common_words = most_common_info[0]\n", |
| 130 | + " most_common_count = most_common_info[1]\n", |
| 131 | + "\n", |
| 132 | + " most_common_str = ''\n", |
| 133 | + " for idx in range(len(most_common_words)):\n", |
| 134 | + " most_common_str += most_common_words[idx] + ', '\n", |
| 135 | + " # remove the last comma and space\n", |
| 136 | + " most_common_str = most_common_str[0:len(most_common_str) - 2]\n", |
| 137 | + "\n", |
| 138 | + " report = ('missing values: {}\\n'\n", |
| 139 | + " 'highest number: {}\\n'\n", |
| 140 | + " 'most common words: {}\\n'\n", |
| 141 | + " 'occurrences of most common: {}\\n'\n", |
| 142 | + " '#####\\n'\n", |
| 143 | + " 'numbers: {}\\n'\n", |
| 144 | + " 'words: {}').format(missing_values, max_value, most_common_str,\n", |
| 145 | + " most_common_count, numbers, lower_case_words)\n", |
| 146 | + "\n", |
| 147 | + " return report\n", |
| 148 | + "\n", |
| 149 | + "\n", |
| 150 | + "def _get_max_value(numbers):\n", |
| 151 | + " \"\"\"\n", |
| 152 | + " Returns the greatest value of the list given as argument.\n", |
| 153 | + "\n", |
| 154 | + " :param numbers: numbers (list of numeric values)\n", |
| 155 | + " :return: greatest value of numbers, None if numbers is an empty list\n", |
| 156 | + " \"\"\"\n", |
| 157 | + " max_value = None\n", |
| 158 | + " if len(numbers) > 0:\n", |
| 159 | + " max_value = numbers[0]\n", |
| 160 | + " for idx in range(len(numbers)):\n", |
| 161 | + " if numbers[idx] > max_value:\n", |
| 162 | + " max_value = numbers[idx]\n", |
| 163 | + " return max_value\n", |
| 164 | + "\n", |
| 165 | + "\n", |
| 166 | + "def _words_to_lowercase(words):\n", |
| 167 | + " \"\"\"\n", |
| 168 | + " :param words: words to be converted (list of strings)\n", |
| 169 | + " :return: lowercased words (list of strings)\n", |
| 170 | + " \"\"\"\n", |
| 171 | + " lowercased = []\n", |
| 172 | + " for idx in range(len(words)):\n", |
| 173 | + " value = words[idx].lower()\n", |
| 174 | + " lowercased.append(value)\n", |
| 175 | + " return lowercased\n", |
| 176 | + "\n", |
| 177 | + "\n", |
| 178 | + "def _get_most_common_word(words):\n", |
| 179 | + " \"\"\"\n", |
| 180 | + " Finds the most common words in a list of words.\n", |
| 181 | + " If there are multiple different words with the same amount of occurrences,\n", |
| 182 | + " they are all included in the return value sorted alphabetically.\n", |
| 183 | + " In addition to returning the most common words, the return value\n", |
| 184 | + " includes also the count of occurrences of the most common words.\n", |
| 185 | + "\n", |
| 186 | + " :param words: list of words (list of strings)\n", |
| 187 | + " :return: a tuple containing:\n", |
| 188 | + " 1. most common words (list of strings)\n", |
| 189 | + " 2. the count of occurrences of the most common words (int)\n", |
| 190 | + " \"\"\"\n", |
| 191 | + " word_counts = {}\n", |
| 192 | + " idx = 0\n", |
| 193 | + " while idx < len(words):\n", |
| 194 | + " value = words[idx]\n", |
| 195 | + " if value not in word_counts.keys():\n", |
| 196 | + " word_counts[value] = 1\n", |
| 197 | + " else:\n", |
| 198 | + " word_counts[value] += 1\n", |
| 199 | + " idx = idx + 1\n", |
| 200 | + "\n", |
| 201 | + " max_count = 0\n", |
| 202 | + " for value in word_counts.values():\n", |
| 203 | + " if value > max_count:\n", |
| 204 | + " max_count = value\n", |
| 205 | + "\n", |
| 206 | + " most_common_words = []\n", |
| 207 | + " for word in word_counts.keys():\n", |
| 208 | + " count = word_counts[word]\n", |
| 209 | + " if count == max_count:\n", |
| 210 | + " most_common_words.append(word)\n", |
| 211 | + "\n", |
| 212 | + " most_common_words = sorted(most_common_words)\n", |
| 213 | + "\n", |
| 214 | + " return most_common_words, max_count" |
| 215 | + ] |
| 216 | + }, |
| 217 | + { |
| 218 | + "cell_type": "markdown", |
| 219 | + "metadata": {}, |
| 220 | + "source": [ |
| 221 | + "Now it's time to refactor the existing code to make it more idiomatic.\n", |
| 222 | + "\n", |
| 223 | + "It's desirable that you do the refactoring in small chunks. Consider using the following workflow:\n", |
| 224 | + "1. Copy-paste a single function from the above ugly implementation to the cell below\n", |
| 225 | + "2. Refactor the function\n", |
| 226 | + "3. Run the tests to verify that you did not break anything\n", |
| 227 | + "\n", |
| 228 | + "This way you can treat each function as a separate subtask." |
| 229 | + ] |
| 230 | + }, |
| 231 | + { |
| 232 | + "cell_type": "code", |
| 233 | + "execution_count": null, |
| 234 | + "metadata": {}, |
| 235 | + "outputs": [], |
| 236 | + "source": [ |
| 237 | + "# Your beautiful refactored, idiomatic, pythonic solution here\n", |
| 238 | + "\n", |
| 239 | + "\n" |
| 240 | + ] |
| 241 | + }, |
| 242 | + { |
| 243 | + "cell_type": "markdown", |
| 244 | + "metadata": {}, |
| 245 | + "source": [ |
| 246 | + "The tests are here. Run these often while refactoring!" |
| 247 | + ] |
| 248 | + }, |
| 249 | + { |
| 250 | + "cell_type": "code", |
| 251 | + "execution_count": null, |
| 252 | + "metadata": { |
| 253 | + "editable": false |
| 254 | + }, |
| 255 | + "outputs": [], |
| 256 | + "source": [ |
| 257 | + "import os\n", |
| 258 | + "\n", |
| 259 | + "CURRENT_DIR = os.getcwd()\n", |
| 260 | + "DATA_DIR = os.path.join(os.path.dirname(CURRENT_DIR), 'data')\n", |
| 261 | + "\n", |
| 262 | + "DATA_FILE1 = os.path.join(DATA_DIR, 'misc_data1.txt')\n", |
| 263 | + "DATA_FILE2 = os.path.join(DATA_DIR, 'misc_data2.txt')\n", |
| 264 | + "DATA_FILE3 = os.path.join(DATA_DIR, 'empty.txt')\n", |
| 265 | + "\n", |
| 266 | + "expected1 = '''missing values: 2\n", |
| 267 | + "highest number: 99.0\n", |
| 268 | + "most common words: john\n", |
| 269 | + "occurrences of most common: 4\n", |
| 270 | + "#####\n", |
| 271 | + "numbers: [1.0, 2.0, 99.0, 6.72, 2.0, 2.0, 2.0]\n", |
| 272 | + "words: ['john', 'doe', 'john', 'john', 'was', 'here', 'this', 'is', 'totally', 'random', 'john']'''\n", |
| 273 | + "\n", |
| 274 | + "expected2 = '''missing values: 3\n", |
| 275 | + "highest number: 101.0\n", |
| 276 | + "most common words: doe, john\n", |
| 277 | + "occurrences of most common: 4\n", |
| 278 | + "#####\n", |
| 279 | + "numbers: [1.0, 2.0, 101.0, 6.72, 2.0, 2.0, 67.0, 2.0]\n", |
| 280 | + "words: ['john', 'doe', 'john', 'john', 'doe', 'was', 'doe', 'here', 'this', 'is', 'totally', 'random', 'john', 'doe']'''\n", |
| 281 | + "\n", |
| 282 | + "expected3 = '''missing values: 0\n", |
| 283 | + "highest number: None\n", |
| 284 | + "most common words: \n", |
| 285 | + "occurrences of most common: 0\n", |
| 286 | + "#####\n", |
| 287 | + "numbers: []\n", |
| 288 | + "words: []'''\n", |
| 289 | + "\n", |
| 290 | + "assert get_report(DATA_FILE1) == expected1\n", |
| 291 | + "print('First one OK!')\n", |
| 292 | + "\n", |
| 293 | + "assert get_report(DATA_FILE2) == expected2\n", |
| 294 | + "print('Second one OK!')\n", |
| 295 | + "\n", |
| 296 | + "assert get_report(DATA_FILE3) == expected3\n", |
| 297 | + "print('All OK, woop woop!')" |
| 298 | + ] |
| 299 | + }, |
| 300 | + { |
| 301 | + "cell_type": "code", |
| 302 | + "execution_count": null, |
| 303 | + "metadata": {}, |
| 304 | + "outputs": [], |
| 305 | + "source": [ |
| 306 | + "# If the tests are failing, you can debug here.\n", |
| 307 | + "\n", |
| 308 | + "report = get_report(DATA_FILE1)\n", |
| 309 | + "print(report)" |
| 310 | + ] |
| 311 | + } |
| 312 | + ], |
| 313 | + "metadata": { |
| 314 | + "kernelspec": { |
| 315 | + "display_name": "Python 3", |
| 316 | + "language": "python", |
| 317 | + "name": "python3" |
| 318 | + }, |
| 319 | + "language_info": { |
| 320 | + "codemirror_mode": { |
| 321 | + "name": "ipython", |
| 322 | + "version": 3 |
| 323 | + }, |
| 324 | + "file_extension": ".py", |
| 325 | + "mimetype": "text/x-python", |
| 326 | + "name": "python", |
| 327 | + "nbconvert_exporter": "python", |
| 328 | + "pygments_lexer": "ipython3", |
| 329 | + "version": "3.5.4" |
| 330 | + } |
| 331 | + }, |
| 332 | + "nbformat": 4, |
| 333 | + "nbformat_minor": 2 |
| 334 | +} |
0 commit comments