diff --git a/.ipynb_checkpoints/BuildingMLSystemsWithPython_Ch1-checkpoint.ipynb b/.ipynb_checkpoints/BuildingMLSystemsWithPython_Ch1-checkpoint.ipynb new file mode 100644 index 00000000..cd71446c --- /dev/null +++ b/.ipynb_checkpoints/BuildingMLSystemsWithPython_Ch1-checkpoint.ipynb @@ -0,0 +1,345 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import scipy as sp" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "data = sp.genfromtxt(\"./ch01/data/web_traffic.tsv\", delimiter=\"\\t\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 1.00000000e+00 2.27200000e+03]\n", + " [ 2.00000000e+00 nan]\n", + " [ 3.00000000e+00 1.38600000e+03]\n", + " ..., \n", + " [ 7.41000000e+02 5.39200000e+03]\n", + " [ 7.42000000e+02 5.90600000e+03]\n", + " [ 7.43000000e+02 4.88100000e+03]]\n" + ] + } + ], + "source": [ + "print(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(743, 2)\n" + ] + } + ], + "source": [ + "print(data.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1.0, 2272.0)\n", + "(2.0, nan)\n", + "[ 1.00000000e+00 2.27200000e+03]\n" + ] + } + ], + "source": [ + "x = data[:,0]\n", + "y = data[:,1]\n", + "print(x[0], y[0]) #Get first column\n", + "print(x[1], y[1]) #Get second column\n", + "print(data[0,:]) #Get first row" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "8" + 
] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sp.sum(sp.isnan(y)) #Find invalid" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Clean vectors by constructing vector of which numbers are NaN and \n", + "#using that to index the vector\n", + "x = x[~sp.isnan(y)]\n", + "y = y[~sp.isnan(y)]\n", + "sp.sum(sp.isnan(y)) #No more invalid!" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done\n" + ] + } + ], + "source": [ + "import matplotlib.pyplot as plt #Time for some plotting\n", + "# plot the (x,y) points with dots of size 10\n", + "plt.scatter(x, y, s=10)\n", + "plt.title(\"Web traffic over the last month\")\n", + "plt.xlabel(\"Time\")\n", + "plt.ylabel(\"Hits/hour\")\n", + "plt.xticks([w*7*24 for w in range(10)], ['week %i' % w for w in range(10)])\n", + "plt.autoscale(tight=True)\n", + "\n", + "# draw a slightly opaque, dashed grid\n", + "plt.grid(True, linestyle='-', color='0.75')\n", + "plt.show()\n", + "print(\"Done\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Parameters: [ 2.59619213 989.02487106]\n", + "[ 3.17389767e+08]\n", + "317389767.34\n" + ] + } + ], + "source": [ + "#Model Error: squared distance of model prediction to real data\n", + "def error(f, x, y):\n", + " return sp.sum((f(x)-y)**2)\n", + "\n", + "# Do the maths to create our model params\n", + "fp1, residuals, rank, sv, rcond = sp.polyfit(x, y, 1, full=True)\n", + "\n", + "# Meaning our model is f(x) = 2.59619213 * x + 989.02487106\n", + 
"print(\"Model Parameters: %s\" % fp1)\n", + "print(residuals)\n", + "\n", + "#Create model from model params:\n", + "f1 = sp.poly1d(fp1)\n", + "print(error(f1, x, y))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Plot the new model against the data\n", + "fx = sp.linspace(0,x[-1], 1000) # generate X-values for plotting\n", + "plt.plot(fx, f1(fx), linewidth=4)\n", + "plt.legend([\"d=%i\" % f1.order], loc=\"upper left\")\n", + "# plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 1.05322215e-02 -5.26545650e+00 1.97476082e+03]\n", + "179983507.878\n" + ] + } + ], + "source": [ + "# Time for some polynomial moddeling\n", + "f2p = sp.polyfit(x, y, 2)\n", + "print(f2p)\n", + "\n", + "f2 = sp.poly1d(f2p)\n", + "print(error(f2, x, y))" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Plot the poly model\n", + "fx = sp.linspace(0,x[-1], 1000) # generate X-values for plotting\n", + "plt.plot(fx, f2(fx), linewidth=4)\n", + "plt.legend([\"d=%i\" % f1.order], loc=\"upper left\")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error inflection=132950348.197616\n" + ] + } + ], + "source": [ + "inflection = 3.5*7*24 # inflection point in hours\n", + "xa = x[:inflection]\n", + "ya = y[:inflection]\n", + "xb = x[inflection:]\n", + "yb = y[inflection:]\n", + "\n", 
+ "fa = sp.poly1d(sp.polyfit(xa, ya, 1))\n", + "fb = sp.poly1d(sp.polyfit(xb, yb, 1))\n", + "\n", + "fa_error = error(fa, xa, ya)\n", + "fb_error = error(fb, xb, yb)\n", + "\n", + "print(\"Error inflection=%f\" % (fa_error + fb_error))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/.ipynb_checkpoints/BuildingMLSystemsWithPython_Ch2-checkpoint.ipynb b/.ipynb_checkpoints/BuildingMLSystemsWithPython_Ch2-checkpoint.ipynb new file mode 100644 index 00000000..286dcb3d --- /dev/null +++ b/.ipynb_checkpoints/BuildingMLSystemsWithPython_Ch2-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/BuildingMLSystemsWithPython_Ch1.ipynb b/BuildingMLSystemsWithPython_Ch1.ipynb new file mode 100644 index 00000000..cd71446c --- /dev/null +++ b/BuildingMLSystemsWithPython_Ch1.ipynb @@ -0,0 +1,345 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import scipy as sp" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "data = sp.genfromtxt(\"./ch01/data/web_traffic.tsv\", delimiter=\"\\t\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 1.00000000e+00 2.27200000e+03]\n", + " [ 
2.00000000e+00 nan]\n", + " [ 3.00000000e+00 1.38600000e+03]\n", + " ..., \n", + " [ 7.41000000e+02 5.39200000e+03]\n", + " [ 7.42000000e+02 5.90600000e+03]\n", + " [ 7.43000000e+02 4.88100000e+03]]\n" + ] + } + ], + "source": [ + "print(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(743, 2)\n" + ] + } + ], + "source": [ + "print(data.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1.0, 2272.0)\n", + "(2.0, nan)\n", + "[ 1.00000000e+00 2.27200000e+03]\n" + ] + } + ], + "source": [ + "x = data[:,0]\n", + "y = data[:,1]\n", + "print(x[0], y[0]) #First element of x and y (row 0)\n", + "print(x[1], y[1]) #Second element of x and y (row 1)\n", + "print(data[0,:]) #Get first row" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "8" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sp.sum(sp.isnan(y)) #Find invalid" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Clean vectors by building a boolean mask of the valid (non-NaN) entries of y and\n", + "#using that mask to index both vectors\n", + "x = x[~sp.isnan(y)]\n", + "y = y[~sp.isnan(y)]\n", + "sp.sum(sp.isnan(y)) #No more invalid!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done\n" + ] + } + ], + "source": [ + "import matplotlib.pyplot as plt #Time for some plotting\n", + "# plot the (x,y) points with dots of size 10\n", + "plt.scatter(x, y, s=10)\n", + "plt.title(\"Web traffic over the last month\")\n", + "plt.xlabel(\"Time\")\n", + "plt.ylabel(\"Hits/hour\")\n", + "plt.xticks([w*7*24 for w in range(10)], ['week %i' % w for w in range(10)])\n", + "plt.autoscale(tight=True)\n", + "\n", + "# draw a light gray, solid grid\n", + "plt.grid(True, linestyle='-', color='0.75')\n", + "plt.show()\n", + "print(\"Done\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Parameters: [ 2.59619213 989.02487106]\n", + "[ 3.17389767e+08]\n", + "317389767.34\n" + ] + } + ], + "source": [ + "#Model Error: squared distance of model prediction to real data\n", + "def error(f, x, y):\n", + " return sp.sum((f(x)-y)**2)\n", + "\n", + "# Do the maths to create our model params\n", + "fp1, residuals, rank, sv, rcond = sp.polyfit(x, y, 1, full=True)\n", + "\n", + "# Meaning our model is f(x) = 2.59619213 * x + 989.02487106\n", + "print(\"Model Parameters: %s\" % fp1)\n", + "print(residuals)\n", + "\n", + "#Create model from model params:\n", + "f1 = sp.poly1d(fp1)\n", + "print(error(f1, x, y))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Plot the new model against the data\n", + "fx = sp.linspace(0,x[-1], 1000) # generate X-values for plotting\n", + "plt.plot(fx, f1(fx), linewidth=4)\n", + "plt.legend([\"d=%i\" % 
f1.order], loc=\"upper left\")\n", + "# plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 1.05322215e-02 -5.26545650e+00 1.97476082e+03]\n", + "179983507.878\n" + ] + } + ], + "source": [ + "# Time for some polynomial modeling\n", + "f2p = sp.polyfit(x, y, 2)\n", + "print(f2p)\n", + "\n", + "f2 = sp.poly1d(f2p)\n", + "print(error(f2, x, y))" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Plot the poly model\n", + "fx = sp.linspace(0,x[-1], 1000) # generate X-values for plotting\n", + "plt.plot(fx, f2(fx), linewidth=4)\n", + "plt.legend([\"d=%i\" % f1.order], loc=\"upper left\")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error inflection=132950348.197616\n" + ] + } + ], + "source": [ + "inflection = 3.5*7*24 # inflection point in hours\n", + "xa = x[:inflection]\n", + "ya = y[:inflection]\n", + "xb = x[inflection:]\n", + "yb = y[inflection:]\n", + "\n", + "fa = sp.poly1d(sp.polyfit(xa, ya, 1))\n", + "fb = sp.poly1d(sp.polyfit(xb, yb, 1))\n", + "\n", + "fa_error = error(fa, xa, ya)\n", + "fb_error = error(fb, xb, yb)\n", + "\n", + "print(\"Error inflection=%f\" % (fa_error + fb_error))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/BuildingMLSystemsWithPython_Ch2.ipynb b/BuildingMLSystemsWithPython_Ch2.ipynb new file mode 100644 index 00000000..b1a84c35 --- /dev/null +++ b/BuildingMLSystemsWithPython_Ch2.ipynb @@ -0,0 +1,209 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from matplotlib import pyplot as plt\n", + "import numpy as np\n", + "\n", + "# We load the data with load_iris from sklearn\n", + "from sklearn.datasets import load_iris\n", + "\n", + "data = load_iris()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# load_iris returns an object with several fields\n", + "features = data.data\n", + "feature_names = data.feature_names\n", + "target = data.target\n", + "target_names = data.target_names\n", + "for t in range(3):\n", + " if t == 0:\n", + " c = 'r'\n", + " marker = '>'\n", + " elif t == 1:\n", + " c = 'g'\n", + " marker = 'o'\n", + " elif t == 2:\n", + " c = 'b'\n", + " marker = 'x'\n", + " plt.scatter(features[target == t,0],\n", + " features[target == t,1],\n", + " marker=marker,\n", + " c=c)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maximum of setosa: 1.9.\n", + "Minimum of others: 3.0.\n" + ] + } + ], + "source": [ + "# We use NumPy fancy indexing to get an array of strings:\n", + "labels = target_names[target]\n", + "\n", + "# The petal length is the feature at position 2\n", + "plength = features[:, 2]\n", + "\n", + "# Build an array of booleans:\n", + "is_setosa = (labels == 'setosa')\n", + "\n", + "# This is the important 
step:\n", + "max_setosa = plength[is_setosa].max()\n", + "min_non_setosa = plength[~is_setosa].min()\n", + "print('Maximum of setosa: {0}.'.format(max_setosa))\n", + "print('Minimum of others: {0}.'.format(min_non_setosa))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# ~ is the boolean negation operator\n", + "features = features[~is_setosa]\n", + "labels = labels[~is_setosa]\n", + "# Build a new target variable, is_virginica\n", + "is_virginica = (labels == 'virginica')\n", + "# print(is_virginica)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best acc: 0.94, fi: 3, t: 1.6, reverse: False\n" + ] + } + ], + "source": [ + "# Check which feature + threshold results in best classification \n", + "# accuracy\n", + "\n", + "# Initialize best_acc to impossibly low value\n", + "best_acc = -1.0\n", + "\n", + "for fi in range(features.shape[1]):\n", + " # We are going to test all possible thresholds\n", + " # print(\"Testing %s\" % fi)\n", + " thresh = features[:,fi]\n", + " for t in thresh:\n", + " # Get the vector for feature `fi`\n", + " feature_i = features[:, fi]\n", + "\n", + " # apply threshold `t`\n", + " pred = (feature_i > t)\n", + " acc = (pred == is_virginica).mean()\n", + " rev_acc = (pred == ~is_virginica).mean()\n", + "\n", + " if rev_acc > acc:\n", + " reverse = True\n", + " acc = rev_acc\n", + " else:\n", + " reverse = False\n", + " \n", + " if acc > best_acc:\n", + " best_acc = acc\n", + " best_fi = fi\n", + " best_t = t\n", + " best_reverse = reverse\n", + " \n", + "print(\"Best acc: %s, fi: %s, t: %s, reverse: %s\" \n", + " % (best_acc, best_fi, best_t, best_reverse)) " + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "True\n", + "True\n", + "True\n" + ] + } + ], + "source": [ + "#print(features[1, :])\n", + "#And now we have our classification method\n", + "def is_virginica_test(fi, t, reverse, example):\n", + " \"Apply threshold model to a new example\"\n", + " test = example[fi] > t\n", + " if reverse:\n", + " test = not test\n", + " return test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}