#
# Pulls tweet data from Twitter because ToS prevents distributing it directly.
#
# - Niek Sanders
# October 20, 2011
#
#

# In Sanders' original form, the code was using Twitter API 1.0.
# Now that Twitter moved to 1.1, we had to make a few changes.
# Cf. twitterauth.py for the details.

# Regarding rate limiting, please check
# https://dev.twitter.com/rest/public/rate-limiting

3127import sys
3228import csv
3329import json
3834 import twitter
3935except ImportError :
4036 print ("""\
41- You need to install python-twitter:
42- pip install python- twitter
37+ You need to ...
38+ pip install twitter
4339If pip is not found you might have to install it using easy_install.
4440If it does not work on your system, you might want to follow instructions
45- at https://github.com/bear/python- twitter, most likely:
46- $ git clone https://github.com/bear/python- twitter
47- $ cd python- twitter
41+ at https://github.com/sixohsix/ twitter, most likely:
42+ $ git clone https://github.com/sixohsix/ twitter
43+ $ cd twitter
4844 $ sudo python setup.py install
4945""" )
5046
5147 sys .exit (1 )
5248
# OAuth credentials are kept out of the repository in twitterauth.py,
# which each user generates from their own Twitter developer account.
from twitterauth import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN_KEY, ACCESS_TOKEN_SECRET

# Twitter API 1.1 client from the `twitter` (sixohsix) package.
# NOTE(review): twitter.OAuth takes (token, token_secret, consumer_key,
# consumer_secret); keyword arguments keep the pairing unambiguous.
api = twitter.Twitter(auth=twitter.OAuth(consumer_key=CONSUMER_KEY,
                                         consumer_secret=CONSUMER_SECRET,
                                         token=ACCESS_TOKEN_KEY,
                                         token_secret=ACCESS_TOKEN_SECRET))

# Directory holding the fetch-list CSVs and the raw per-tweet JSON dumps.
DATA_PATH = "data"
6154
@@ -87,16 +80,15 @@ def get_user_params(DATA_PATH):
def dump_user_params(user_params):
    """Echo the chosen paths back to the user for confirmation.

    Expects `user_params` to be a dict with at least the keys
    'inList' (input CSV), 'outList' (output CSV) and 'rawDir'
    (directory for raw tweet JSON). Returns None.
    """
    # One line per setting, in the order the user supplied them.
    for label, key in (('Input', 'inList'),
                       ('Output', 'outList'),
                       ('Raw data', 'rawDir')):
        print(label + ': ' + user_params[key])
9486
9587
9688def read_total_list (in_filename ):
9789
9890 # read total fetch list csv
99- fp = open (in_filename , 'rb ' )
91+ fp = open (in_filename , 'rt ' )
10092 reader = csv .reader (fp , delimiter = ',' , quotechar = '"' )
10193
10294 if os .path .exists (MISSING_ID_FILE ):
@@ -111,10 +103,12 @@ def read_total_list(in_filename):
111103 else :
112104 not_authed_ids = []
113105
114- print "We will skip %i tweets that are not available/visible any more on twitter" % (len (missing_ids ) + len (not_authed_ids ))
106+ print ("We will skip %i tweets that are not available or visible any more on twitter" % (
107+ len (missing_ids ) + len (not_authed_ids )))
115108
116109 ignore_ids = set (missing_ids + not_authed_ids )
117110 total_list = []
111+
118112 for row in reader :
119113 if row [2 ] not in ignore_ids :
120114 total_list .append (row )
@@ -140,12 +134,12 @@ def purge_already_fetched(fetch_list, raw_dir):
140134 parse_tweet_json (tweet_file )
141135 count_done += 1
142136 except RuntimeError :
143- print "Error parsing" , item
137+ print ( "Error parsing" , item )
144138 rem_list .append (item )
145139 else :
146140 rem_list .append (item )
147141
148- print "We have already downloaded %i tweets." % count_done
142+ print ( "We have already downloaded %i tweets." % count_done )
149143
150144 return rem_list
151145
@@ -156,66 +150,50 @@ def download_tweets(fetch_list, raw_dir):
156150 if not os .path .exists (raw_dir ):
157151 os .mkdir (raw_dir )
158152
159- # stay within rate limits
160- download_pause_sec = 3600 / MAX_TWEETS_PER_HR
161-
162153 # download tweets
163154 for idx in range (0 , len (fetch_list )):
164- # stay in Twitter API rate limits
165- print 'Pausing %d sec to obey Twitter API rate limits' % \
166- (download_pause_sec )
167- time .sleep (download_pause_sec )
168-
169155 # current item
170156 item = fetch_list [idx ]
171- print item
172-
173- # print status
174- print '--> downloading tweet #%s (%d of %d)' % \
175- (item [2 ], idx + 1 , len (fetch_list ))
157+ print (item )
176158
177- # Old Twitter API 1.0
178- # pull data
179- # url = 'https://api.twitter.com/1/statuses/show.json?id=' + item[2]
180- # print url
181- # urllib.urlretrieve(url, raw_dir + item[2] + '.json')
159+ print ('--> downloading tweet #%s (%d of %d)' %
160+ (item [2 ], idx + 1 , len (fetch_list )))
182161
183- # New Twitter API 1.1
184162 try :
185- sec = api .GetSleepTime ('/statuses/show/:id' )
186- if sec > 0 :
187- print "Sleeping %i seconds to conform to Twitter's rate limiting" % sec
188- time .sleep (sec )
163+ #import pdb;pdb.set_trace()
164+ response = api .statuses .show (_id = item [2 ])
189165
190- result = api .GetStatus (item [2 ])
191- json_data = result .AsJsonString ()
166+ if response .rate_limit_remaining <= 0 :
167+ wait_seconds = response .rate_limit_reset - time .time ()
168+ print ("Rate limiting requests us to wait %f seconds" %
169+ wait_seconds )
170+ time .sleep (wait_seconds )
192171
193- except twitter .TwitterError , e :
172+ except twitter .TwitterError as e :
194173 fatal = True
195- for m in e .message :
174+ print (e )
175+ for m in json .loads (e .response_data .decode ())['errors' ]:
196176 if m ['code' ] == 34 :
197- print "Tweet missing: " , item
198- # [{u'message': u'Sorry, that page does not exist', u'code': 34}]
199- with open (MISSING_ID_FILE , "a" ) as f :
177+ print ("Tweet missing: " , item )
178+ with open (MISSING_ID_FILE , "at" ) as f :
200179 f .write (item [2 ] + "\n " )
201180
202181 fatal = False
203182 break
204183 elif m ['code' ] == 63 :
205- print "User of tweet '%s' has been suspended." % item
206- # [{u'message': u'Sorry, that page does not exist', u'code': 34}]
207- with open (MISSING_ID_FILE , "a" ) as f :
184+ print ("User of tweet '%s' has been suspended." % item )
185+ with open (MISSING_ID_FILE , "at" ) as f :
208186 f .write (item [2 ] + "\n " )
209187
210188 fatal = False
211189 break
212190 elif m ['code' ] == 88 :
213- print "Rate limit exceeded. Please lower max_tweets_per_hr."
191+ print ( "Rate limit exceeded." )
214192 fatal = True
215193 break
216194 elif m ['code' ] == 179 :
217- print "Not authorized to view this tweet."
218- with open (NOT_AUTHORIZED_ID_FILE , "a " ) as f :
195+ print ( "Not authorized to view this tweet." )
196+ with open (NOT_AUTHORIZED_ID_FILE , "at " ) as f :
219197 f .write (item [2 ] + "\n " )
220198 fatal = False
221199 break
@@ -225,21 +203,22 @@ def download_tweets(fetch_list, raw_dir):
225203 else :
226204 continue
227205
228- with open (raw_dir + item [2 ] + '.json' , "w " ) as f :
229- f .write (json_data + "\n " )
206+ with open (raw_dir + item [2 ] + '.json' , "wt " ) as f :
207+ f .write (json . dumps ( dict ( response )) + "\n " )
230208
231209 return
232210
233211
234212def parse_tweet_json (filename ):
235213
236214 # read tweet
237- fp = open (filename , 'rb ' )
215+ fp = open (filename , 'r ' )
238216
239217 # parse json
240218 try :
241219 tweet_json = json .load (fp )
242- except ValueError :
220+ except ValueError as e :
221+ print (e )
243222 raise RuntimeError ('error parsing json' )
244223
245224 # look for twitter api error msgs
@@ -281,36 +260,36 @@ def build_output_corpus(out_filename, raw_dir, total_list):
281260 writer .writerow (full_row )
282261
283262 except RuntimeError :
284- print '--> bad data in tweet #' + item [2 ]
263+ print ( '--> bad data in tweet #' + item [2 ])
285264 missing_count += 1
286265
287266 else :
288- print '--> missing tweet #' + item [2 ]
267+ print ( '--> missing tweet #' + item [2 ])
289268 missing_count += 1
290269
291270 # indicate success
292271 if missing_count == 0 :
293- print '\n Successfully downloaded corpus!'
294- print 'Output in: ' + out_filename + '\n '
272+ print ( '\n Successfully downloaded corpus!' )
273+ print ( 'Output in: ' + out_filename + '\n ' )
295274 else :
296- print '\n Missing %d of %d tweets!' % (missing_count , len (total_list ))
297- print 'Partial output in: ' + out_filename + '\n '
275+ print ( '\n Missing %d of %d tweets!' % (missing_count , len (total_list ) ))
276+ print ( 'Partial output in: ' + out_filename + '\n ' )
298277
299278 return
300279
301280
302281def main ():
303282 # get user parameters
304283 user_params = get_user_params (DATA_PATH )
305- print user_params
284+ print ( user_params )
306285 dump_user_params (user_params )
307286
308287 # get fetch list
309288 total_list = read_total_list (user_params ['inList' ])
310289
311290 # remove already fetched or missing tweets
312291 fetch_list = purge_already_fetched (total_list , user_params ['rawDir' ])
313- print "Fetching %i tweets..." % len (fetch_list )
292+ print ( "Fetching %i tweets..." % len (fetch_list ) )
314293
315294 if fetch_list :
316295 # start fetching data from twitter
@@ -319,10 +298,11 @@ def main():
319298 # second pass for any failed downloads
320299 fetch_list = purge_already_fetched (total_list , user_params ['rawDir' ])
321300 if fetch_list :
322- print '\n Starting second pass to retry %i failed downloads...' % len (fetch_list )
301+ print ('\n Starting second pass to retry %i failed downloads...' %
302+ len (fetch_list ))
323303 download_tweets (fetch_list , user_params ['rawDir' ])
324304 else :
325- print "Nothing to fetch any more."
305+ print ( "Nothing to fetch any more." )
326306
327307 # build output corpus
328308 build_output_corpus (user_params ['outList' ], user_params ['rawDir' ],
0 commit comments