|
| 1 | +import os |
| 2 | +import scipy as sp |
| 3 | +import matplotlib.pyplot as plt |
| 4 | + |
| 5 | +data_dir = os.path.join( |
| 6 | + os.path.dirname(os.path.realpath(__file__)), "..", "data") |
| 7 | +data = sp.genfromtxt(os.path.join(data_dir, "web_traffic.tsv"), delimiter="\t") |
| 8 | +print(data[:10]) |
| 9 | + |
| 10 | +# all examples will have three classes in this file |
| 11 | +colors = ['g', 'k', 'b', 'm', 'r'] |
| 12 | +linestyles = ['-', '-.', '--', ':', '-'] |
| 13 | + |
| 14 | +x = data[:, 0] |
| 15 | +y = data[:, 1] |
| 16 | +print("Number of invalid entries:", sp.sum(sp.isnan(y))) |
| 17 | +x = x[~sp.isnan(y)] |
| 18 | +y = y[~sp.isnan(y)] |
| 19 | + |
| 20 | +# plot input data |
| 21 | + |
| 22 | + |
| 23 | +def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None): |
| 24 | + plt.clf() |
| 25 | + plt.scatter(x, y, s=10) |
| 26 | + plt.title("Web traffic over the last month") |
| 27 | + plt.xlabel("Time") |
| 28 | + plt.ylabel("Hits/hour") |
| 29 | + plt.xticks( |
| 30 | + [w * 7 * 24 for w in range(10)], ['week %i' % w for w in range(10)]) |
| 31 | + |
| 32 | + if models: |
| 33 | + if mx is None: |
| 34 | + mx = sp.linspace(0, x[-1], 1000) |
| 35 | + for model, style, color in zip(models, linestyles, colors): |
| 36 | + # print "Model:",model |
| 37 | + # print "Coeffs:",model.coeffs |
| 38 | + plt.plot(mx, model(mx), linestyle=style, linewidth=2, c=color) |
| 39 | + |
| 40 | + plt.legend(["d=%i" % m.order for m in models], loc="upper left") |
| 41 | + |
| 42 | + plt.autoscale(tight=True) |
| 43 | + plt.ylim(ymin=0) |
| 44 | + if ymax: |
| 45 | + plt.ylim(ymax=ymax) |
| 46 | + if xmin: |
| 47 | + plt.xlim(xmin=xmin) |
| 48 | + plt.grid(True, linestyle='-', color='0.75') |
| 49 | + plt.savefig(fname) |
| 50 | + |
| 51 | +# first look at the data |
| 52 | +plot_models(x, y, None, os.path.join("..", "1400_01_01.png")) |
| 53 | + |
| 54 | +# create and plot models |
| 55 | +fp1, res, rank, sv, rcond = sp.polyfit(x, y, 1, full=True) |
| 56 | +print("Model parameters: %s" % fp1) |
| 57 | +print("Error of the model:", res) |
| 58 | +f1 = sp.poly1d(fp1) |
| 59 | +f2 = sp.poly1d(sp.polyfit(x, y, 2)) |
| 60 | +f3 = sp.poly1d(sp.polyfit(x, y, 3)) |
| 61 | +f10 = sp.poly1d(sp.polyfit(x, y, 10)) |
| 62 | +f100 = sp.poly1d(sp.polyfit(x, y, 100)) |
| 63 | + |
| 64 | +plot_models(x, y, [f1], os.path.join("..", "1400_01_02.png")) |
| 65 | +plot_models(x, y, [f1, f2], os.path.join("..", "1400_01_03.png")) |
| 66 | +plot_models( |
| 67 | + x, y, [f1, f2, f3, f10, f100], os.path.join("..", "1400_01_04.png")) |
| 68 | + |
| 69 | +# fit and plot a model using the knowledge about inflection point |
| 70 | +inflection = 3.5 * 7 * 24 |
| 71 | +xa = x[:inflection] |
| 72 | +ya = y[:inflection] |
| 73 | +xb = x[inflection:] |
| 74 | +yb = y[inflection:] |
| 75 | + |
| 76 | +fa = sp.poly1d(sp.polyfit(xa, ya, 1)) |
| 77 | +fb = sp.poly1d(sp.polyfit(xb, yb, 1)) |
| 78 | + |
| 79 | +plot_models(x, y, [fa, fb], os.path.join("..", "1400_01_05.png")) |
| 80 | + |
| 81 | + |
| 82 | +def error(f, x, y): |
| 83 | + return sp.sum((f(x) - y) ** 2) |
| 84 | + |
| 85 | +print("Errors for the complete data set:") |
| 86 | +for f in [f1, f2, f3, f10, f100]: |
| 87 | + print("Error d=%i: %f" % (f.order, error(f, x, y))) |
| 88 | + |
| 89 | +print("Errors for only the time after inflection point") |
| 90 | +for f in [f1, f2, f3, f10, f100]: |
| 91 | + print("Error d=%i: %f" % (f.order, error(f, xb, yb))) |
| 92 | + |
| 93 | +print("Error inflection=%f" % (error(fa, xa, ya) + error(fb, xb, yb))) |
| 94 | + |
| 95 | + |
| 96 | +# extrapolating into the future |
| 97 | +plot_models( |
| 98 | + x, y, [f1, f2, f3, f10, f100], os.path.join("..", "1400_01_06.png"), |
| 99 | + mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100), |
| 100 | + ymax=10000, xmin=0 * 7 * 24) |
| 101 | + |
| 102 | +print("Trained only on data after inflection point") |
| 103 | +fb1 = fb |
| 104 | +fb2 = sp.poly1d(sp.polyfit(xb, yb, 2)) |
| 105 | +fb3 = sp.poly1d(sp.polyfit(xb, yb, 3)) |
| 106 | +fb10 = sp.poly1d(sp.polyfit(xb, yb, 10)) |
| 107 | +fb100 = sp.poly1d(sp.polyfit(xb, yb, 100)) |
| 108 | + |
| 109 | +print("Errors for only the time after inflection point") |
| 110 | +for f in [fb1, fb2, fb3, fb10, fb100]: |
| 111 | + print("Error d=%i: %f" % (f.order, error(f, xb, yb))) |
| 112 | + |
| 113 | +plot_models( |
| 114 | + x, y, [fb1, fb2, fb3, fb10, fb100], os.path.join("..", "1400_01_07.png"), |
| 115 | + mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100), |
| 116 | + ymax=10000, xmin=0 * 7 * 24) |
| 117 | + |
| 118 | +# separating training from testing data |
| 119 | +frac = 0.3 |
| 120 | +split_idx = int(frac * len(xb)) |
| 121 | +shuffled = sp.random.permutation(list(range(len(xb)))) |
| 122 | +test = sorted(shuffled[:split_idx]) |
| 123 | +train = sorted(shuffled[split_idx:]) |
| 124 | +fbt1 = sp.poly1d(sp.polyfit(xb[train], yb[train], 1)) |
| 125 | +fbt2 = sp.poly1d(sp.polyfit(xb[train], yb[train], 2)) |
| 126 | +fbt3 = sp.poly1d(sp.polyfit(xb[train], yb[train], 3)) |
| 127 | +fbt10 = sp.poly1d(sp.polyfit(xb[train], yb[train], 10)) |
| 128 | +fbt100 = sp.poly1d(sp.polyfit(xb[train], yb[train], 100)) |
| 129 | + |
| 130 | +print("Test errors for only the time after inflection point") |
| 131 | +for f in [fbt1, fbt2, fbt3, fbt10, fbt100]: |
| 132 | + print("Error d=%i: %f" % (f.order, error(f, xb[test], yb[test]))) |
| 133 | + |
| 134 | +plot_models( |
| 135 | + x, y, [fbt1, fbt2, fbt3, fbt10, fbt100], os.path.join("..", |
| 136 | + "1400_01_08.png"), |
| 137 | + mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100), |
| 138 | + ymax=10000, xmin=0 * 7 * 24) |
| 139 | + |
| 140 | +from scipy.optimize import fsolve |
| 141 | +print(fbt2) |
| 142 | +print(fbt2 - 100000) |
| 143 | +reached_max = fsolve(fbt2 - 100000, 800) / (7 * 24) |
| 144 | +print("100,000 hits/hour expected at week %f" % reached_max[0]) |
0 commit comments