diff --git a/Chapter2_MorePyMC/daft_plot.py b/Chapter2_MorePyMC/daft_plot.py index 5f68e901..30e38831 100644 --- a/Chapter2_MorePyMC/daft_plot.py +++ b/Chapter2_MorePyMC/daft_plot.py @@ -1,15 +1,14 @@ -#daft drawing for SMS example +# daft drawing for SMS example import matplotlib.pyplot as plt - try: import daft except ImportError: print "python library Daft required." - -pgm = daft.PGM([9, 4], origin=[.5,.5]) + +pgm = daft.PGM([9, 4], origin=[.5, .5]) pgm.add_node(daft.Node("tau", r"$\tau$", 4.0, 3.5)) pgm.add_node(daft.Node("alpha", r"$\alpha$", 6, 4.0)) pgm.add_node(daft.Node("lambda1", r"$\lambda_1$", 5.5, 3.2,)) @@ -18,7 +17,6 @@ pgm.add_node(daft.Node("obs", "obs", 5.0, 1.0, 1.2, observed=True)) - pgm.add_edge("tau", "lambda") pgm.add_edge("alpha", "lambda1") pgm.add_edge("alpha", "lambda2") @@ -27,5 +25,5 @@ pgm.add_edge("lambda", "obs") pgm.render() -plt.figure( figsize=(12,5) ) -plt.show() \ No newline at end of file +plt.figure(figsize=(12, 5)) +plt.show() diff --git a/Chapter2_MorePyMC/separation_plot.py b/Chapter2_MorePyMC/separation_plot.py index 981f968b..bf298a8e 100644 --- a/Chapter2_MorePyMC/separation_plot.py +++ b/Chapter2_MorePyMC/separation_plot.py @@ -7,49 +7,44 @@ import numpy as np - -def separation_plot( p, y, **kwargs ): +def separation_plot(p, y, **kwargs): """ This function creates a separation plot for logistic and probit classification. See http://mdwardlab.com/sites/default/files/GreenhillWardSacks.pdf - + p: The proportions/probabilities, can be a nxM matrix which represents M models. y: the 0-1 response variables. - - """ + + """ assert p.shape[0] == y.shape[0], "p.shape[0] != y.shape[0]" n = p.shape[0] try: M = p.shape[1] except: - p = p.reshape( n, 1 ) + p = p.reshape(n, 1) M = p.shape[1] - #colors = np.array( ["#fdf2db", "#e44a32"] ) - colors_bmh = np.array( ["#eeeeee", "#348ABD"] ) + # colors = np.array( ["#fdf2db", "#e44a32"] ) + colors_bmh = np.array(["#eeeeee", "#348ABD"]) + fig = plt.figure() # figsize = (8, 1.3*M) ) - fig = plt.figure( )#figsize = (8, 1.3*M) ) - for i in range(M): - ax = fig.add_subplot(M, 1, i+1) - ix = np.argsort( p[:,i] ) - #plot the different bars - bars = ax.bar( np.arange(n), np.ones(n), width=1., - color = colors_bmh[ y[ix].astype(int) ], - edgecolor = 'none') - ax.plot( np.arange(n), p[ix,i], "k", - linewidth = 1.,drawstyle="steps-post" ) - #create expected value bar. - ax.vlines( [(1-p[ix,i]).sum()], [0], [1] ) - #ax.grid(False) - #ax.axis('off') - plt.xlim( 0, n-1) - + ax = fig.add_subplot(M, 1, i + 1) + ix = np.argsort(p[:, i]) + # plot the different bars + bars = ax.bar(np.arange(n), np.ones(n), width=1., + color=colors_bmh[y[ix].astype(int)], + edgecolor='none') + ax.plot(np.arange(n), p[ix, i], "k", + linewidth=1., drawstyle="steps-post") + # create expected value bar. + ax.vlines([(1 - p[ix, i]).sum()], [0], [1]) + # ax.grid(False) + # ax.axis('off') + plt.xlim(0, n - 1) + plt.tight_layout() - - return - - + return diff --git a/Chapter3_MCMC/github_pull.py b/Chapter3_MCMC/github_pull.py index 60a1fa7e..77276570 100644 --- a/Chapter3_MCMC/github_pull.py +++ b/Chapter3_MCMC/github_pull.py @@ -1,4 +1,4 @@ -#github data scrapper +# github data scrapper """ variables of interest: @@ -21,69 +21,66 @@ from requests import get - MAX = 8000000 -today = datetime.datetime.today() +today = datetime.datetime.today() randint = np.random.randint -N = 120 #sample size. -auth = ("username", "password" ) +N = 120 # sample size. +auth = ("username", "password") -language_mappings = {"Python": 0, "JavaScript": 1, "Ruby": 2, "Java":3, "Shell":4, "PHP":5} +language_mappings = { + "Python": 0, "JavaScript": 1, "Ruby": 2, "Java": 3, "Shell": 4, "PHP": 5} -#define data matrix: -X = np.zeros( (N , 12), dtype = int ) +# define data matrix: +X = np.zeros((N, 12), dtype=int) for i in xrange(N): is_fork = True is_valid_language = False - + while is_fork == True or is_valid_language == False: is_fork = True is_valid_language = False - - params = {"since":randint(0, MAX ) } - r = get("https://api.github.com/repositories", params = params, auth=auth ) - results = loads( r.text )[0] - #im only interested in the first one, and if it is not a fork. + + params = {"since": randint(0, MAX)} + r = get( + "https://api.github.com/repositories", params=params, auth=auth) + results = loads(r.text)[0] + # im only interested in the first one, and if it is not a fork. is_fork = results["fork"] - - r = get( results["url"], auth = auth) - - #check the language - repo_results = loads( r.text ) - try: - language_mappings[ repo_results["language" ] ] + + r = get(results["url"], auth=auth) + + # check the language + repo_results = loads(r.text) + try: + language_mappings[repo_results["language"]] is_valid_language = True except: pass - - - #languages - X[ i, language_mappings[ repo_results["language" ] ] ] = 1 - - #delta time - X[ i, 6] = ( today - datetime.datetime.strptime( repo_results["created_at"][:10], "%Y-%m-%d" ) ).days - - #haswiki + # languages + X[i, language_mappings[repo_results["language"]]] = 1 + + # delta time + X[i, 6] = ( + today - datetime.datetime.strptime(repo_results["created_at"][:10], "%Y-%m-%d")).days + + # haswiki X[i, 7] = repo_results["has_wiki"] - - #get user information - r = get( results["owner"]["url"] , auth = auth) - user_results = loads( r.text ) + + # get user information + r = get(results["owner"]["url"], auth=auth) + user_results = loads(r.text) X[i, 8] = user_results["following"] X[i, 9] = user_results["followers"] - - #get dep. data + + # get dep. data X[i, 10] = repo_results["watchers_count"] X[i, 11] = repo_results["forks_count"] - print + print print " -------------- " - print i, ": ", results["full_name"], repo_results["language" ], repo_results["watchers_count"], repo_results["forks_count"] + print i, ": ", results["full_name"], repo_results["language"], repo_results["watchers_count"], repo_results["forks_count"] print " -------------- " - print - -np.savetxt("data/github_data.csv", X, delimiter=",", fmt="%d" ) - - + print +np.savetxt("data/github_data.csv", X, delimiter=",", fmt="%d") diff --git a/Chapter4_TheGreatestTheoremNeverTold/top_pic_comments.py b/Chapter4_TheGreatestTheoremNeverTold/top_pic_comments.py index 9f0eeffe..37e4eda3 100644 --- a/Chapter4_TheGreatestTheoremNeverTold/top_pic_comments.py +++ b/Chapter4_TheGreatestTheoremNeverTold/top_pic_comments.py @@ -7,20 +7,20 @@ reddit = praw.Reddit("BayesianMethodsForHackers") -subreddit = reddit.get_subreddit( "pics" ) +subreddit = reddit.get_subreddit("pics") top_submissions = subreddit.get_top() -n_pic = int( sys.argv[1] ) if sys.argv[1] else 1 +n_pic = int(sys.argv[1]) if sys.argv[1] else 1 i = 0 while i < n_pic: top_submission = top_submissions.next() while "i.imgur.com" not in top_submission.url: - #make sure it is linking to an image, not a webpage. + # make sure it is linking to an image, not a webpage. top_submission = top_submissions.next() - i+=1 + i += 1 print "Title of submission: \n", top_submission.title top_post_url = top_submission.url @@ -31,33 +31,13 @@ downvotes = [] contents = [] _all_comments = top_submission.comments -all_comments=[] +all_comments = [] for comment in _all_comments: - try: - upvotes.append( comment.ups ) - downvotes.append( comment.downs ) - contents.append( comment.body ) - except Exception as e: - continue - -votes = np.array( [ upvotes, downvotes] ).T - - - - - - - - - - - - - - - - - - - - + try: + upvotes.append(comment.ups) + downvotes.append(comment.downs) + contents.append(comment.body) + except Exception as e: + continue + +votes = np.array([upvotes, downvotes]).T diff --git a/Chapter5_LossFunctions/DarkWorldsMetric.py b/Chapter5_LossFunctions/DarkWorldsMetric.py index 0b59edf4..7284f56a 100644 --- a/Chapter5_LossFunctions/DarkWorldsMetric.py +++ b/Chapter5_LossFunctions/DarkWorldsMetric.py @@ -19,7 +19,8 @@ import string as st import random as rd -def calc_delta_r(x_predicted,y_predicted,x_true,y_true): + +def calc_delta_r(x_predicted, y_predicted, x_true, y_true): """ Compute the scalar distance between predicted halo centers and the true halo centers. Predictions are matched to the closest halo center. @@ -33,59 +34,78 @@ def calc_delta_r(x_predicted,y_predicted,x_true,y_true): measured_halo_indexes: vector containing indexes of the predicted halo position with the reference to the true halo position. e.g if true_halo_indexes=[0,1] and measured_halo_indexes=[1,0] then the first x,y coordinates of the true halo position matches the second input of the predicted x,y coordinates. """ - - num_halos=len(x_true) #Only works for number of halos > 1 - num_configurations=mt.factorial(num_halos) #The number of possible different comb - configurations=np.zeros([num_halos,num_configurations],int) #The array of combinations - #I will pass back - distances = np.zeros([num_configurations],float) #The array of the distances - #for all possible combinations - - radial_distance=[] #The vector of distances - #I will pass back - - #Pick a combination of true and predicted - a=['01','012'] #Input for the permutatiosn, 01 number halos or 012 - count=0 #For the index of the distances array - true_halo_indexes=[] #The tuples which will show the order of halos picked - predicted_halo_indexes=[] - distances_perm=np.zeros([num_configurations,num_halos],float) #The distance between each + + num_halos = len(x_true) # Only works for number of halos > 1 + # The number of possible different comb + num_configurations = mt.factorial(num_halos) + # The array of combinations + configurations = np.zeros([num_halos, num_configurations], int) + # I will pass + # back + # The array of the distances + distances = np.zeros([num_configurations], float) + # for all possible + # combinations + + radial_distance = [] # The vector of distances + # I will pass back + + # Pick a combination of true and predicted + a = ['01', '012'] # Input for the permutatiosn, 01 number halos or 012 + count = 0 # For the index of the distances array + # The tuples which will show the order of halos picked + true_halo_indexes = [] + predicted_halo_indexes = [] + # The distance between each + distances_perm = np.zeros([num_configurations, num_halos], float) #true and predicted - #halo for every comb - true_halo_indexes_perm=[] #log of all the permutations of true halos used - predicted_halo_indexes_perm=[] #log of all the predicted permutations - - for perm in it.permutations(a[num_halos-2],num_halos): - which_true_halos=[] - which_predicted_halos=[] - for j in xrange(num_halos): #loop through all the true halos with the - - distances_perm[count,j]=np.sqrt((x_true[j]-x_predicted[int(perm[j])])**2\ - +(y_true[j]-y_predicted[int(perm[j])])**2) - #This array logs the distance between true and - #predicted halo for ALL configurations - - which_true_halos.append(j) #log the order in which I try each true halo - which_predicted_halos.append(int(perm[j])) #log the order in which I true - #each predicted halo - true_halo_indexes_perm.append(which_true_halos) #this is a tuple of tuples of - #all of thifferent config - #true halo indexes + # halo for + # every comb + # log of all the permutations of true halos used + true_halo_indexes_perm = [] + predicted_halo_indexes_perm = [] # log of all the predicted permutations + + for perm in it.permutations(a[num_halos - 2], num_halos): + which_true_halos = [] + which_predicted_halos = [] + for j in xrange(num_halos): # loop through all the true halos with the + + distances_perm[count, j] = np.sqrt((x_true[j] - x_predicted[int(perm[j])]) ** 2 + + (y_true[j] - y_predicted[int(perm[j])]) ** 2) + # This array logs the distance between true and + # predicted halo for ALL configurations + + # log the order in which I try each true halo + which_true_halos.append(j) + # log the order in which I true + which_predicted_halos.append(int(perm[j])) + # each predicted halo + # this is a tuple of tuples of + true_halo_indexes_perm.append(which_true_halos) + # all of thifferent config + # true halo indexes predicted_halo_indexes_perm.append(which_predicted_halos) - - distances[count]=sum(distances_perm[count,0::]) #Find what the total distances - #are for each configuration - count=count+1 - - config = np.where(distances == min(distances))[0][0] #The configuration used is the one - #which has the smallest distance - radial_distance.append(distances_perm[config,0::]) #Find the tuple of distances that - #correspond to this smallest distance - true_halo_indexes=true_halo_indexes_perm[config] #Find the tuple of the index which refers - #to the smallest distance - predicted_halo_indexes=predicted_halo_indexes_perm[config] - - return radial_distance,true_halo_indexes,predicted_halo_indexes + + # Find what the total distances + distances[count] = sum(distances_perm[count, 0::]) + # are for each + # configuration + count = count + 1 + + # The configuration used is the one + config = np.where(distances == min(distances))[0][0] + # which has the + # smallest distance + # Find the tuple of distances that + radial_distance.append(distances_perm[config, 0::]) + # correspond to this + # smallest distance + # Find the tuple of the index which refers + true_halo_indexes = true_halo_indexes_perm[config] + # to the smallest distance + predicted_halo_indexes = predicted_halo_indexes_perm[config] + + return radial_distance, true_halo_indexes, predicted_halo_indexes def calc_theta(x_predicted, y_predicted, x_true, y_true, x_ref, y_ref): @@ -101,33 +121,38 @@ def calc_theta(x_predicted, y_predicted, x_true, y_true, x_ref, y_ref): with the vector joining the reference point and the halo as the zero line. """ - num_halos=len(x_predicted) - theta=np.zeros([num_halos+1],float) #Set up the array which will pass back the values - phi = np.zeros([num_halos],float) - - psi = np.arctan( (y_true-y_ref)/(x_true-x_ref) ) + num_halos = len(x_predicted) + # Set up the array which will pass back the values + theta = np.zeros([num_halos + 1], float) + phi = np.zeros([num_halos], float) + + psi = np.arctan((y_true - y_ref) / (x_true - x_ref)) - # Angle at which the halo is at - #with respect to the reference point - phi[x_true != x_ref] = np.arctan((y_predicted[x_true != x_predicted]-\ - y_true[x_true != x_predicted])\ - /(x_predicted[x_true != x_predicted]-\ - x_true[x_true != x_predicted])) # Angle of the estimate - #wrt true halo centre - - #Before finding the angle with the zero line as the line joiing the halo and the reference - #point I need to convert the angle produced by Python to an angle between 0 and 2pi - phi =convert_to_360(phi, x_predicted-x_true,\ - y_predicted-y_true) - psi = convert_to_360(psi, x_true-x_ref,\ - y_true-y_ref) - theta = phi-psi #The angle with the baseline as the line joing the ref and the halo - - - theta[theta< 0.0]=theta[theta< 0.0]+2.0*mt.pi #If the angle of the true pos wrt the ref is - #greater than the angle of predicted pos - #and the true pos then add 2pi + # with respect to the + # reference point + phi[x_true != x_ref] = np.arctan((y_predicted[x_true != x_predicted] - + y_true[x_true != x_predicted]) + / (x_predicted[x_true != x_predicted] - + x_true[x_true != x_predicted])) # Angle of the estimate + # wrt true halo + # centre + + # Before finding the angle with the zero line as the line joiing the halo and the reference + # point I need to convert the angle produced by Python to an angle between + # 0 and 2pi + phi = convert_to_360(phi, x_predicted - x_true, + y_predicted - y_true) + psi = convert_to_360(psi, x_true - x_ref, + y_true - y_ref) + # The angle with the baseline as the line joing the ref and the halo + theta = phi - psi + + # If the angle of the true pos wrt the ref is + theta[theta < 0.0] = theta[theta < 0.0] + 2.0 * mt.pi + # greater than the angle of predicted pos + # and the true pos then add + # 2pi return theta @@ -143,25 +168,24 @@ def convert_to_360(angle, x_in, y_in): n = len(x_in) for i in xrange(n): if x_in[i] < 0 and y_in[i] > 0: - angle[i] = angle[i]+mt.pi + angle[i] = angle[i] + mt.pi elif x_in[i] < 0 and y_in[i] < 0: - angle[i] = angle[i]+mt.pi + angle[i] = angle[i] + mt.pi elif x_in[i] > 0 and y_in[i] < 0: - angle[i] = angle[i]+2.0*mt.pi + angle[i] = angle[i] + 2.0 * mt.pi elif x_in[i] == 0 and y_in[i] == 0: angle[i] = 0 elif x_in[i] == 0 and y_in[i] > 0: - angle[i] = mt.pi/2. + angle[i] = mt.pi / 2. elif x_in[i] < 0 and y_in[i] == 0: angle[i] = mt.pi elif x_in[i] == 0 and y_in[i] < 0: - angle[i] = 3.*mt.pi/2. - - + angle[i] = 3. * mt.pi / 2. return angle -def get_ref(x_halo,y_halo,weight): + +def get_ref(x_halo, y_halo, weight): """ Gets the reference point of the system of halos by weighted averaging the x and y coordinates. Arguments: @@ -171,263 +195,277 @@ def get_ref(x_halo,y_halo,weight): Returns: x_ref, y_ref: The coordinates of the reference point for the metric """ - - #Find the weighted average of the x and y coordinates - x_ref = np.sum([x_halo*weight])/np.sum([weight]) - y_ref = np.sum([y_halo*weight])/np.sum([weight]) + # Find the weighted average of the x and y coordinates + x_ref = np.sum([x_halo * weight]) / np.sum([weight]) + y_ref = np.sum([y_halo * weight]) / np.sum([weight]) + return x_ref, y_ref - return x_ref,y_ref - -def main_score( nhalo_all, x_true_all, y_true_all, x_ref_all, y_ref_all, sky_prediction): +def main_score(nhalo_all, x_true_all, y_true_all, x_ref_all, y_ref_all, sky_prediction): """abstracts the score from the old command-line interface. sky_prediction is a dx2 array of predicted x,y positions - + -camdp""" - - r=np.array([],dtype=float) # The array which I will log all the calculated radial distances - angle=np.array([],dtype=float) #The array which I will log all the calculated angles - #Load in the sky_ids from the true - num_halos_total=0 #Keep track of how many halos are input into the metric - + # The array which I will log all the calculated radial distances + r = np.array([], dtype=float) + # The array which I will log all the calculated angles + angle = np.array([], dtype=float) + # Load in the sky_ids from the true + # Keep track of how many halos are input into the metric + num_halos_total = 0 - for selectskyinsolutions, sky in enumerate(sky_prediction): #Loop through each line in result.csv and analyse each one + # Loop through each line in result.csv and analyse each one + for selectskyinsolutions, sky in enumerate(sky_prediction): + nhalo = int(nhalo_all[selectskyinsolutions]) # How many halos in the + # selected sky? + x_true = x_true_all[selectskyinsolutions][0:nhalo] + y_true = y_true_all[selectskyinsolutions][0:nhalo] - nhalo=int(nhalo_all[selectskyinsolutions])#How many halos in the - #selected sky? - x_true=x_true_all[selectskyinsolutions][0:nhalo] - y_true=y_true_all[selectskyinsolutions][0:nhalo] - - x_predicted=np.array([],dtype=float) - y_predicted=np.array([],dtype=float) + x_predicted = np.array([], dtype=float) + y_predicted = np.array([], dtype=float) for i in xrange(nhalo): - x_predicted=np.append(x_predicted,float(sky[0])) #get the predicted values - y_predicted=np.append(y_predicted,float(sky[1])) - #The solution file for the test data provides masses - #to calculate the centre of mass where as the Training_halo.csv - #direct provides x_ref y_ref. So in the case of test data - #we need to calculate the ref point from the masses using - #Get_ref() - - x_ref=x_ref_all[selectskyinsolutions] - y_ref=y_ref_all[selectskyinsolutions] + # get the predicted values + x_predicted = np.append(x_predicted, float(sky[0])) + y_predicted = np.append(y_predicted, float(sky[1])) + # The solution file for the test data provides masses + # to calculate the centre of mass where as the Training_halo.csv + # direct provides x_ref y_ref. So in the case of test data + # we need to calculate the ref point from the masses using + # Get_ref() - num_halos_total=num_halos_total+nhalo + x_ref = x_ref_all[selectskyinsolutions] + y_ref = y_ref_all[selectskyinsolutions] + num_halos_total = num_halos_total + nhalo - #Single halo case, this needs to be separately calculated since + # Single halo case, this needs to be separately calculated since #x_ref = x_true if nhalo == 1: - #What is the radial distance between the true and predicted position - r=np.append(r,np.sqrt( (x_predicted-x_true)**2 \ - + (y_predicted-y_true)**2)) - #What is the angle between the predicted position and true halo position - if (x_predicted-x_true) != 0: - psi = np.arctan((y_predicted-y_true)/(x_predicted-x_true)) - else: psi=0. - theta = convert_to_360([psi], [x_predicted-x_true], [y_predicted-y_true]) - angle=np.append(angle,theta) - - - else: - #r_index_index, contains the radial distances of the predicted to - #true positions. These are found by matching up the true halos to - #the predicted halos such that the average of all the radial distances - #is optimal. it also contains indexes of the halos used which are used to - #show which halo has been mathced to which. - - r_index_index = calc_delta_r(x_predicted, y_predicted, x_true, \ + # What is the radial distance between the true and predicted + # position + r = np.append(r, np.sqrt((x_predicted - x_true) ** 2 + + (y_predicted - y_true) ** 2)) + # What is the angle between the predicted position and true halo + # position + if (x_predicted - x_true) != 0: + psi = np.arctan( + (y_predicted - y_true) / (x_predicted - x_true)) + else: + psi = 0. + theta = convert_to_360( + [psi], [x_predicted - x_true], [y_predicted - y_true]) + angle = np.append(angle, theta) + + else: + # r_index_index, contains the radial distances of the predicted to + # true positions. These are found by matching up the true halos to + # the predicted halos such that the average of all the radial distances + # is optimal. it also contains indexes of the halos used which are used to + # show which halo has been mathced to which. + + r_index_index = calc_delta_r(x_predicted, y_predicted, x_true, y_true) - - r=np.append(r,r_index_index[0][0]) - halo_index= r_index_index[1] #The true halos indexes matched with the - predicted_index=r_index_index[2] #predicted halo index - - angle=np.append(angle,calc_theta\ - (x_predicted[predicted_index],\ - y_predicted[predicted_index],\ - x_true[halo_index],\ - y_true[halo_index],x_ref,\ - y_ref)) # Find the angles of the predicted - #position wrt to the halo and + + r = np.append(r, r_index_index[0][0]) + # The true halos indexes matched with the + halo_index = r_index_index[1] + predicted_index = r_index_index[2] # predicted halo index + + angle = np.append(angle, calc_theta + (x_predicted[predicted_index], + y_predicted[predicted_index], + x_true[halo_index], + y_true[halo_index], x_ref, + y_ref)) # Find the angles of the predicted + # position wrt to the halo and # add to the vector angle - # Find what the average distance the estimate is from the halo position - av_r=sum(r)/len(r) - - #In order to quantify the orientation invariance we will express each angle + av_r = sum(r) / len(r) + + # In order to quantify the orientation invariance we will express each angle # as a vector and find the average vector - #R_bar^2=(1/N Sum^Ncos(theta))^2+(1/N Sum^Nsin(theta))**2 - + # R_bar^2=(1/N Sum^Ncos(theta))^2+(1/N Sum^Nsin(theta))**2 + N = float(num_halos_total) - angle_vec = np.sqrt(( 1.0/N * sum(np.cos(angle)) )**2 + \ - ( 1.0/N * sum(np.sin(angle)) )**2) - - W1=1./1000. #Weight the av_r such that < 1 is a good score > 1 is not so good. - W2=1. - metric = W1*av_r + W2*angle_vec #Weighted metric, weights TBD + angle_vec = np.sqrt((1.0 / N * sum(np.cos(angle))) ** 2 + + (1.0 / N * sum(np.sin(angle))) ** 2) + + # Weight the av_r such that < 1 is a good score > 1 is not so good. + W1 = 1. / 1000. + W2 = 1. + metric = W1 * av_r + W2 * angle_vec # Weighted metric, weights TBD print 'Your average distance in pixels you are away from the true halo is', av_r print 'Your average angular vector is', angle_vec print 'Your score for the training data is', metric return metric - - + + def main(user_fname, fname): """ Script to compute the evaluation metric for the Observing Dark Worlds competition. You can run it on your training data to understand how well you have done with the training data. """ - r=np.array([],dtype=float) # The array which I will log all the calculated radial distances - angle=np.array([],dtype=float) #The array which I will log all the calculated angles - #Load in the sky_ids from the true - - true_sky_id=[] - sky_loader = c.reader(open(fname, 'rb')) #Load in the sky_ids from the solution file + # The array which I will log all the calculated radial distances + r = np.array([], dtype=float) + # The array which I will log all the calculated angles + angle = np.array([], dtype=float) + # Load in the sky_ids from the true + + true_sky_id = [] + # Load in the sky_ids from the solution file + sky_loader = c.reader(open(fname, 'rb')) for row in sky_loader: true_sky_id.append(row[0]) - #Load in the true values from the solution file + # Load in the true values from the solution file - nhalo_all=np.loadtxt(fname,usecols=(1,),delimiter=',',skiprows=1) - x_true_all=np.loadtxt(fname,usecols=(4,6,8),delimiter=',',skiprows=1) - y_true_all=np.loadtxt(fname,usecols=(5,7,9),delimiter=',',skiprows=1) - x_ref_all=np.loadtxt(fname,usecols=(2,),delimiter=',',skiprows=1) - y_ref_all=np.loadtxt(fname,usecols=(3,),delimiter=',',skiprows=1) + nhalo_all = np.loadtxt(fname, usecols=(1,), delimiter=',', skiprows=1) + x_true_all = np.loadtxt( + fname, usecols=(4, 6, 8), delimiter=',', skiprows=1) + y_true_all = np.loadtxt( + fname, usecols=(5, 7, 9), delimiter=',', skiprows=1) + x_ref_all = np.loadtxt(fname, usecols=(2,), delimiter=',', skiprows=1) + y_ref_all = np.loadtxt(fname, usecols=(3,), delimiter=',', skiprows=1) - for row in sky_loader: true_sky_id.append(row[1]) - - - num_halos_total=0 #Keep track of how many halos are input into the metric + # Keep track of how many halos are input into the metric + num_halos_total = 0 + sky_prediction = c.reader(open(user_fname, 'rb')) # Open the result.csv - sky_prediction = c.reader(open(user_fname, 'rb')) #Open the result.csv - - try: #See if the input file from user has a header on it - #with open('JoyceTest/trivialUnitTest_Pred.txt', 'r') as f: - with open(user_fname, 'r') as f: - header = float((f.readline()).split(',')[1]) #try and make where the - #first input would be - #a float, if succeed it - #is not a header + try: # See if the input file from user has a header on it + # with open('JoyceTest/trivialUnitTest_Pred.txt', 'r') as f: + with open(user_fname, 'r') as f: + # try and make where the + header = float((f.readline()).split(',')[1]) + # first input would be + # a float, if succeed it + # is not a header print 'THE INPUT FILE DOES NOT APPEAR TO HAVE A HEADER' - except : + except: print 'THE INPUT FILE APPEARS TO HAVE A HEADER, SKIPPING THE FIRST LINE' skip_header = sky_prediction.next() - - - for sky in sky_prediction: #Loop through each line in result.csv and analyse each one - sky_id = str(sky[0]) #Get the sky_id of the input - does_it_exist=true_sky_id.count(sky_id) #Is the input sky_id - #from user a real one? - - if does_it_exist > 0: #If it does then find the matching solutions to the sky_id - selectskyinsolutions=true_sky_id.index(sky_id)-1 - else: #Otherwise exit - print 'Sky_id does not exist, formatting problem: ',sky_id + + # Loop through each line in result.csv and analyse each one + for sky in sky_prediction: + sky_id = str(sky[0]) # Get the sky_id of the input + does_it_exist = true_sky_id.count(sky_id) # Is the input sky_id + # from user a real one? + + # If it does then find the matching solutions to the sky_id + if does_it_exist > 0: + selectskyinsolutions = true_sky_id.index(sky_id) - 1 + else: # Otherwise exit + print 'Sky_id does not exist, formatting problem: ', sky_id sys.exit(2) + nhalo = int(nhalo_all[selectskyinsolutions]) # How many halos in the + # selected sky? + x_true = x_true_all[selectskyinsolutions][0:nhalo] + y_true = y_true_all[selectskyinsolutions][0:nhalo] - nhalo=int(nhalo_all[selectskyinsolutions])#How many halos in the - #selected sky? - x_true=x_true_all[selectskyinsolutions][0:nhalo] - y_true=y_true_all[selectskyinsolutions][0:nhalo] - - x_predicted=np.array([],dtype=float) - y_predicted=np.array([],dtype=float) + x_predicted = np.array([], dtype=float) + y_predicted = np.array([], dtype=float) for i in xrange(nhalo): - x_predicted=np.append(x_predicted,float(sky[2*i+1])) #get the predicted values - y_predicted=np.append(y_predicted,float(sky[2*i+2])) - #The solution file for the test data provides masses - #to calculate the centre of mass where as the Training_halo.csv - #direct provides x_ref y_ref. So in the case of test data - #we need to calculae the ref point from the masses using - #Get_ref() - - x_ref=x_ref_all[selectskyinsolutions] - y_ref=y_ref_all[selectskyinsolutions] + # get the predicted values + x_predicted = np.append(x_predicted, float(sky[2 * i + 1])) + y_predicted = np.append(y_predicted, float(sky[2 * i + 2])) + # The solution file for the test data provides masses + # to calculate the centre of mass where as the Training_halo.csv + # direct provides x_ref y_ref. So in the case of test data + # we need to calculae the ref point from the masses using + # Get_ref() - num_halos_total=num_halos_total+nhalo + x_ref = x_ref_all[selectskyinsolutions] + y_ref = y_ref_all[selectskyinsolutions] + num_halos_total = num_halos_total + nhalo - #Single halo case, this needs to be separately calculated since + # Single halo case, this needs to be separately calculated since #x_ref = x_true if nhalo == 1: - #What is the radial distance between the true and predicted position - r=np.append(r,np.sqrt( (x_predicted-x_true)**2 \ - + (y_predicted-y_true)**2)) - #What is the angle between the predicted position and true halo position - if (x_predicted-x_true) != 0: - psi = np.arctan((y_predicted-y_true)/(x_predicted-x_true)) - else: psi=0. - theta = convert_to_360([psi], [x_predicted-x_true], [y_predicted-y_true]) - angle=np.append(angle,theta) - - - else: - #r_index_index, contains the radial distances of the predicted to - #true positions. These are found by matching up the true halos to - #the predicted halos such that the average of all the radial distances - #is optimal. it also contains indexes of the halos used which are used to - #show which halo has been mathced to which. - - r_index_index = calc_delta_r(x_predicted, y_predicted, x_true, \ + # What is the radial distance between the true and predicted + # position + r = np.append(r, np.sqrt((x_predicted - x_true) ** 2 + + (y_predicted - y_true) ** 2)) + # What is the angle between the predicted position and true halo + # position + if (x_predicted - x_true) != 0: + psi = np.arctan( + (y_predicted - y_true) / (x_predicted - x_true)) + else: + psi = 0. + theta = convert_to_360( + [psi], [x_predicted - x_true], [y_predicted - y_true]) + angle = np.append(angle, theta) + + else: + # r_index_index, contains the radial distances of the predicted to + # true positions. These are found by matching up the true halos to + # the predicted halos such that the average of all the radial distances + # is optimal. it also contains indexes of the halos used which are used to + # show which halo has been mathced to which. + + r_index_index = calc_delta_r(x_predicted, y_predicted, x_true, y_true) - - r=np.append(r,r_index_index[0][0]) - halo_index= r_index_index[1] #The true halos indexes matched with the - predicted_index=r_index_index[2] #predicted halo index - - angle=np.append(angle,calc_theta\ - (x_predicted[predicted_index],\ - y_predicted[predicted_index],\ - x_true[halo_index],\ - y_true[halo_index],x_ref,\ - y_ref)) # Find the angles of the predicted - #position wrt to the halo and + + r = np.append(r, r_index_index[0][0]) + # The true halos indexes matched with the + halo_index = r_index_index[1] + predicted_index = r_index_index[2] # predicted halo index + + angle = np.append(angle, calc_theta + (x_predicted[predicted_index], + y_predicted[predicted_index], + x_true[halo_index], + y_true[halo_index], x_ref, + y_ref)) # Find the angles of the predicted + # position wrt to the halo and # add to the vector angle - # Find what the average distance the estimate is from the halo position - av_r=sum(r)/len(r) - - #In order to quantify the orientation invariance we will express each angle + av_r = sum(r) / len(r) + + # In order to quantify the orientation invariance we will express each angle # as a vector and find the average vector - #R_bar^2=(1/N Sum^Ncos(theta))^2+(1/N Sum^Nsin(theta))**2 - + # R_bar^2=(1/N Sum^Ncos(theta))^2+(1/N Sum^Nsin(theta))**2 + N = float(num_halos_total) - angle_vec = np.sqrt(( 1.0/N * sum(np.cos(angle)) )**2 + \ - ( 1.0/N * sum(np.sin(angle)) )**2) - - W1=1./1000. #Weight the av_r such that < 1 is a good score > 1 is not so good. - W2=1. - metric = W1*av_r + W2*angle_vec #Weighted metric, weights TBD + angle_vec = np.sqrt((1.0 / N * sum(np.cos(angle))) ** 2 + + (1.0 / N * sum(np.sin(angle))) ** 2) + + # Weight the av_r such that < 1 is a good score > 1 is not so good. + W1 = 1. / 1000. + W2 = 1. + metric = W1 * av_r + W2 * angle_vec # Weighted metric, weights TBD print 'Your average distance in pixels you are away from the true halo is', av_r print 'Your average angular vector is', angle_vec print 'Your score for the training data is', metric if __name__ == "__main__": - #For help just typed 'python DarkWorldsMetric.py -h' - - parser = ap.ArgumentParser(description='Work out the Metric for your input file') - parser.add_argument('inputfile',type=str,nargs=1,help='Input file of halo positions. Needs to be in the format SkyId,halo_x1,haloy1,halox_2,halo_y2,halox3,halo_y3 ') - parser.add_argument('reffile',type=str,nargs=1,help='This should point to Training_halos.csv') + # For help just typed 'python DarkWorldsMetric.py -h' + + parser = ap.ArgumentParser( + description='Work out the Metric for your input file') + parser.add_argument('inputfile', type=str, nargs=1, + help='Input file of halo positions. Needs to be in the format SkyId,halo_x1,haloy1,halox_2,halo_y2,halox3,halo_y3 ') + parser.add_argument( + 'reffile', type=str, nargs=1, help='This should point to Training_halos.csv') args = parser.parse_args() - user_fname=args.inputfile[0] + user_fname = args.inputfile[0] filename = (args.reffile[0]).count('Training_halos.csv') if filename == 0: - fname=args.reffile[0]+str('Training_halos.csv') + fname = args.reffile[0] + str('Training_halos.csv') else: - fname=args.reffile[0] + fname = args.reffile[0] main(user_fname, fname) - diff --git a/Chapter5_LossFunctions/draw_sky2.py b/Chapter5_LossFunctions/draw_sky2.py index 26b1f470..237533c6 100644 --- a/Chapter5_LossFunctions/draw_sky2.py +++ b/Chapter5_LossFunctions/draw_sky2.py @@ -2,22 +2,24 @@ from matplotlib.patches import Ellipse import numpy as np -def draw_sky( galaxies ): + +def draw_sky(galaxies): """adapted from Vishal Goklani""" size_multiplier = 45 - fig = plt.figure(figsize=(10,10)) - #fig.patch.set_facecolor("blue") + fig = plt.figure(figsize=(10, 10)) + # fig.patch.set_facecolor("blue") ax = fig.add_subplot(111, aspect='equal') n = galaxies.shape[0] for i in xrange(n): - _g = galaxies[i,:] - x,y = _g[0], _g[1] - d = np.sqrt( _g[2]**2 + _g[3]**2 ) - a = 1.0/ ( 1 - d ) - b = 1.0/( 1 + d) - theta = np.degrees( np.arctan2( _g[3], _g[2])*0.5 ) - - ax.add_patch( Ellipse(xy=(x, y), width=size_multiplier*a, height=size_multiplier*b, angle=theta) ) + _g = galaxies[i, :] + x, y = _g[0], _g[1] + d = np.sqrt(_g[2] ** 2 + _g[3] ** 2) + a = 1.0 / (1 - d) + b = 1.0 / (1 + d) + theta = np.degrees(np.arctan2(_g[3], _g[2]) * 0.5) + + ax.add_patch(Ellipse( + xy=(x, y), width=size_multiplier * a, height=size_multiplier * b, angle=theta)) ax.autoscale_view(tight=True) - - return fig \ No newline at end of file + + return fig diff --git a/Chapter6_Priorities/other_strats.py b/Chapter6_Priorities/other_strats.py index 430c08d7..2ff047e8 100644 --- a/Chapter6_Priorities/other_strats.py +++ b/Chapter6_Priorities/other_strats.py @@ -1,4 +1,4 @@ -#other strats. +# other strats. # TODO: UBC strat, epsilon-greedy import scipy.stats as stats @@ -9,16 +9,16 @@ beta = stats.beta -class GeneralBanditStrat( object ): +class GeneralBanditStrat(object): """ Implements a online, learning strategy to solve the Multi-Armed Bandit problem. - + parameters: bandits: a Bandit class with .pull method - choice_function: accepts a self argument (which gives access to all the variables), and - returns and int between 0 and n-1 + choice_function: accepts a self argument (which gives access to all the variables), and + returns and int between 0 and n-1 methods: sample_bandits(n): sample and train on n pulls. @@ -28,81 +28,84 @@ class GeneralBanditStrat( object ): bb_score: the historical score as a (N,) array """ - + def __init__(self, bandits, choice_function): - + self.bandits = bandits - n_bandits = len( self.bandits ) - self.wins = np.zeros( n_bandits ) - self.trials = np.zeros(n_bandits ) + n_bandits = len(self.bandits) + self.wins = np.zeros(n_bandits) + self.trials = np.zeros(n_bandits) self.N = 0 self.choices = [] self.score = [] self.choice_function = choice_function - def sample_bandits( self, n=1 ): - - score = np.zeros( n ) - choices = np.zeros( n ) - + def sample_bandits(self, n=1): + + score = np.zeros(n) + choices = np.zeros(n) + for k in range(n): - #sample from the bandits's priors, and select the largest sample + # sample from the bandits's priors, and select the largest sample choice = self.choice_function(self) - - #sample the chosen bandit - result = self.bandits.pull( choice ) - - #update priors and score - self.wins[ choice ] += result - self.trials[ choice ] += 1 - score[ k ] = result + + # sample the chosen bandit + result = self.bandits.pull(choice) + + # update priors and score + self.wins[choice] += result + self.trials[choice] += 1 + score[k] = result self.N += 1 - choices[ k ] = choice - - self.score = np.r_[ self.score, score ] - self.choices = np.r_[ self.choices, choices ] - return - - + choices[k] = choice + + self.score = np.r_[self.score, score] + self.choices = np.r_[self.choices, choices] + return + + def bayesian_bandit_choice(self): - return np.argmax( rbeta( 1 + self.wins, 1 + self.trials - self.wins) ) - -def max_mean( self ): + return np.argmax(rbeta(1 + self.wins, 1 + self.trials - self.wins)) + + +def max_mean(self): """pick the bandit with the current best observed proportion of winning """ - return np.argmax( self.wins / ( self.trials +1 ) ) + return np.argmax(self.wins / (self.trials + 1)) + -def lower_credible_choice( self ): +def lower_credible_choice(self): """pick the bandit with the best LOWER BOUND. See chapter 5""" - def lb(a,b): - return a/(a+b) - 1.65*np.sqrt( (a*b)/( (a+b)**2*(a+b+1) ) ) + def lb(a, b): + return a / (a + b) - 1.65 * np.sqrt((a * b) / ((a + b) ** 2 * (a + b + 1))) a = self.wins + 1 b = self.trials - self.wins + 1 - return np.argmax( lb(a,b) ) - -def upper_credible_choice( self ): + return np.argmax(lb(a, b)) + + +def upper_credible_choice(self): """pick the bandit with the best LOWER BOUND. See chapter 5""" - def lb(a,b): - return a/(a+b) + 1.65*np.sqrt( (a*b)/( (a+b)**2*(a+b+1) ) ) + def lb(a, b): + return a / (a + b) + 1.65 * np.sqrt((a * b) / ((a + b) ** 2 * (a + b + 1))) a = self.wins + 1 b = self.trials - self.wins + 1 - return np.argmax( lb(a,b) ) - -def random_choice( self): - return np.random.randint( 0, len( self.wins ) ) - - -def ucb_bayes( self ): - C = 0 - n = 10000 - alpha =1 - 1./( (self.N+1) ) - return np.argmax( beta.ppf( alpha, - 1 + self.wins, - 1 + self.trials - self.wins ) ) - - - - + return np.argmax(lb(a, b)) + + +def random_choice(self): + return np.random.randint(0, len(self.wins)) + + +def ucb_bayes(self): + C = 0 + n = 10000 + alpha = 1 - 1. / ((self.N + 1)) + return np.argmax(beta.ppf(alpha, + 1 + self.wins, + 1 + self.trials - self.wins)) + + class Bandits(object): + """ This class represents N bandits machines. @@ -113,13 +116,14 @@ class Bandits(object): pull( i ): return the results, 0 or 1, of pulling the ith bandit. """ + def __init__(self, p_array): self.p = p_array self.optimal = np.argmax(p_array) - - def pull( self, i ): - #i is which arm to pull + + def pull(self, i): + # i is which arm to pull return rand() < self.p[i] - + def __len__(self): return len(self.p) diff --git a/Chapter6_Priorities/ystockquote.py b/Chapter6_Priorities/ystockquote.py index 22e7234f..b2ff030a 100644 --- a/Chapter6_Priorities/ystockquote.py +++ b/Chapter6_Priorities/ystockquote.py @@ -167,4 +167,4 @@ def get_historical_prices(symbol, start_date, end_date): resp = urlopen(req) content = str(resp.read().decode('utf-8').strip()) days = content.splitlines() - return [day.split(',') for day in days] \ No newline at end of file + return [day.split(',') for day in days] diff --git a/Chapter7_BayesianMachineLearning/auc.py b/Chapter7_BayesianMachineLearning/auc.py index 0f2e68f0..897ba076 100644 --- a/Chapter7_BayesianMachineLearning/auc.py +++ b/Chapter7_BayesianMachineLearning/auc.py @@ -1,4 +1,4 @@ -#contributed by Ben Hammer, 2013 +# contributed by Ben Hammer, 2013 def tied_rank(x): @@ -17,21 +17,22 @@ def tied_rank(x): The tied rank f each element in x """ - sorted_x = sorted(zip(x,range(len(x)))) + sorted_x = sorted(zip(x, range(len(x)))) r = [0 for k in x] cur_val = sorted_x[0][0] last_rank = 0 for i in range(len(sorted_x)): if cur_val != sorted_x[i][0]: cur_val = sorted_x[i][0] - for j in range(last_rank, i): - r[sorted_x[j][1]] = float(last_rank+1+i)/2.0 + for j in range(last_rank, i): + r[sorted_x[j][1]] = float(last_rank + 1 + i) / 2.0 last_rank = i - if i==len(sorted_x)-1: - for j in range(last_rank, i+1): - r[sorted_x[j][1]] = float(last_rank+i+2)/2.0 + if i == len(sorted_x) - 1: + for j in range(last_rank, i + 1): + r[sorted_x[j][1]] = float(last_rank + i + 2) / 2.0 return r + def auc(actual, posterior): """ Computes the area under the receiver-operater characteristic (AUC) @@ -53,9 +54,9 @@ def auc(actual, posterior): """ r = tied_rank(posterior) - num_positive = len([0 for x in actual if x==1]) - num_negative = len(actual)-num_positive - sum_positive = sum([r[i] for i in range(len(r)) if actual[i]==1]) - auc = ((sum_positive - num_positive*(num_positive+1)/2.0) / - (num_negative*num_positive)) - return auc \ No newline at end of file + num_positive = len([0 for x in actual if x == 1]) + num_negative = len(actual) - num_positive + sum_positive = sum([r[i] for i in range(len(r)) if actual[i] == 1]) + auc = ((sum_positive - num_positive * (num_positive + 1) / 2.0) / + (num_negative * num_positive)) + return auc diff --git a/ExamplesFromChapters/Chapter3/ClusteringWithGaussians.py b/ExamplesFromChapters/Chapter3/ClusteringWithGaussians.py index 4cfac7db..77c1f2ce 100644 --- a/ExamplesFromChapters/Chapter3/ClusteringWithGaussians.py +++ b/ExamplesFromChapters/Chapter3/ClusteringWithGaussians.py @@ -20,12 +20,12 @@ @pm.deterministic def center_i(assignment=assignment, centers=centers): - return centers[assignment] + return centers[assignment] @pm.deterministic def tau_i(assignment=assignment, taus=taus): - return taus[assignment] + return taus[assignment] # and to combine it with the observations: observations = pm.Normal("obs", center_i, tau_i, diff --git a/sandbox/Chapter10_/github_datapull.py b/sandbox/Chapter10_/github_datapull.py index fdf568bc..668ce4a3 100644 --- a/sandbox/Chapter10_/github_datapull.py +++ b/sandbox/Chapter10_/github_datapull.py @@ -4,69 +4,68 @@ from requests import get from bs4 import BeautifulSoup - - - - stars_to_explore = ( 2**np.arange( -1, 16 ) ).astype("int") - forks_to_explore = ( 2**np.arange( -1, 16 ) ).astype("int") - repo_with_stars = np.ones_like( stars_to_explore ) - repo_with_forks = np.ones_like( forks_to_explore ) + stars_to_explore = (2 ** np.arange(-1, 16)).astype("int") + forks_to_explore = (2 ** np.arange(-1, 16)).astype("int") + repo_with_stars = np.ones_like(stars_to_explore) + repo_with_forks = np.ones_like(forks_to_explore) URL = "https://github.com/search" print "Scrapping data from Github. Sorry Github..." print "The data is contained in variables `foo_to_explore` and `repo_with_foo`" print print "stars first..." - payload = {"q":""} + payload = {"q": ""} for i, _star in enumerate(stars_to_explore): - payload["q"] = "stars:>=%d"%_star - r = get( URL, params = payload ) - soup = BeautifulSoup( r.text ) + payload["q"] = "stars:>=%d" % _star + r = get(URL, params=payload) + soup = BeautifulSoup(r.text) try: - h3 = soup.find( class_="sort-bar").find( "h3" ).text #hopefully the github search results page plays nicely. - value = int( h3.split(" ")[2].replace(",", "" ) ) + # hopefully the github search results page plays nicely. + h3 = soup.find(class_="sort-bar").find("h3").text + value = int(h3.split(" ")[2].replace(",", "")) except AttributeError as e: - #there might be less than 10 repos, so I'll count the number of display results - value = len( soup.findAll(class_= "mega-icon-public-repo" ) ) - + # there might be less than 10 repos, so I'll count the number of + # display results + value = len(soup.findAll(class_="mega-icon-public-repo")) + repo_with_stars[i] = value - print "number of repos with greater than or equal to %d stars: %d"%(_star, value ) - - #repo_with_stars = repo_with_stars.astype("float")/repo_with_stars[0] + print "number of repos with greater than or equal to %d stars: %d" % (_star, value) + #repo_with_stars = repo_with_stars.astype("float")/repo_with_stars[0] - print + print print "forks second..." - payload = {"q":""} + payload = {"q": ""} for i, _fork in enumerate(stars_to_explore): - payload["q"] = "forks:>=%d"%_fork - r = get( URL, params = payload ) - soup = BeautifulSoup( r.text ) + payload["q"] = "forks:>=%d" % _fork + r = get(URL, params=payload) + soup = BeautifulSoup(r.text) try: - h3 = soup.find( class_="sort-bar").find( "h3" ).text #hopefully the github search results page plays nicely. - value = int( h3.split(" ")[2].replace(",", "" ) ) + # hopefully the github search results page plays nicely. + h3 = soup.find(class_="sort-bar").find("h3").text + value = int(h3.split(" ")[2].replace(",", "")) except AttributeError as e: - #there might be less than 10 repos, so I'll count the number of display results - value = len( soup.findAll(class_= "mega-icon-public-repo" ) ) - + # there might be less than 10 repos, so I'll count the number of + # display results + value = len(soup.findAll(class_="mega-icon-public-repo")) + repo_with_forks[i] = value - print "number of repos with greater than or equal to %d forks: %d"%(_fork, value ) - + print "number of repos with greater than or equal to %d forks: %d" % (_fork, value) + #repo_with_forks = repo_with_forks.astype("float")/repo_with_forks[0] - - np.savetxt( "data/gh_forks.csv", np.concatenate( [forks_to_explore, repo_with_forks], axis=1) ) - np.savetxt( "data/gh_stars.csv", np.concatenate( [stars_to_explore, repo_with_stars], axis=1) ) + + np.savetxt("data/gh_forks.csv", + np.concatenate([forks_to_explore, repo_with_forks], axis=1)) + np.savetxt("data/gh_stars.csv", + np.concatenate([stars_to_explore, repo_with_stars], axis=1)) except ImportError as e: print e print "requests / BeautifulSoup not found. Using data pulled on Feburary 11, 2013" - _data = np.genfromtxt( "data/gh_forks.csv", delimiter = "," ) #cehck this. - forks_to_explore = _data[:,0] - repo_with_forks = _data[:,1] - - _data = np.genfromtxt( "data/gh_stars.csv", delimiter = "," ) #cehck this. - stars_to_explore = _data[:,0] - repo_with_stars = _data[:,1] - - - \ No newline at end of file + _data = np.genfromtxt("data/gh_forks.csv", delimiter=",") # cehck this. + forks_to_explore = _data[:, 0] + repo_with_forks = _data[:, 1] + + _data = np.genfromtxt("data/gh_stars.csv", delimiter=",") # cehck this. + stars_to_explore = _data[:, 0] + repo_with_stars = _data[:, 1] diff --git a/sandbox/github_events.py b/sandbox/github_events.py index 85dd9a4a..7560b1b5 100644 --- a/sandbox/github_events.py +++ b/sandbox/github_events.py @@ -1,4 +1,4 @@ -#github_events.py +# github_events.py try: from json import loads @@ -12,16 +12,17 @@ URL = "https://api.github.com/events" -#github allows up to 10 pages of 30 events, but we will only keep the unique ones. +# github allows up to 10 pages of 30 events, but we will only keep the +# unique ones. ids = np.empty(300, dtype=int) -k = 0 -for page in range(10,0, -1): - - r = get( URL, params = {"page":page} ) +k = 0 +for page in range(10, 0, -1): + + r = get(URL, params={"page": page}) data = loads(r.text) for event in data: - ids[k] = ( event["actor"]["id"] ) - k+=1 - -ids = np.unique( ids.astype(int) ) \ No newline at end of file + ids[k] = (event["actor"]["id"]) + k += 1 + +ids = np.unique(ids.astype(int))