Supporting Twitter API 1.1

wrichert · wrichert · commit 6775c6885714 · 2013-10-06T20:52:50.000+02:00
diff --git a/ch06/install.py b/ch06/install.py
@@ -26,13 +26,23 @@
 # Excuse the ugly code.  I threw this together as quickly as possible and I
 # don't normally code in Python.
 #
+
+# Sanders' original version of this script used Twitter API 1.0.
+# Twitter has since moved to API 1.1, which required a few changes here.
+# See twitterauth.py for the details.
+
 import csv
-import getpass
 import json
 import os
 import time
 import urllib
 
+from twitterauth import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN_KEY, ACCESS_TOKEN_SECRET
+
+import twitter
+api = twitter.Api(consumer_key=CONSUMER_KEY, consumer_secret=CONSUMER_SECRET,
+                  access_token_key=ACCESS_TOKEN_KEY, access_token_secret=ACCESS_TOKEN_SECRET)
+
 
 def get_user_params(data_path):
 
@@ -124,7 +134,6 @@ def download_tweets(fetch_list, raw_dir):
 
     # download tweets
     for idx in range(0, len(fetch_list)):
-
         # current item
         item = fetch_list[idx]
 
@@ -133,9 +142,16 @@ def download_tweets(fetch_list, raw_dir):
         print '--> downloading tweet #%s (%d of %d) (%s left)' % \
               (item[2], idx + 1, len(fetch_list), trem)
 
+        # Old Twitter API 1.0
         # pull data
-        url = 'http://api.twitter.com/1/statuses/show.json?id=' + item[2]
-        urllib.urlretrieve(url, raw_dir + item[2] + '.json')
+        # url = 'https://api.twitter.com/1/statuses/show.json?id=' + item[2]
+        # print url
+        # urllib.urlretrieve(url, raw_dir + item[2] + '.json')
+
+        # New Twitter API 1.1: fetch the status through the authenticated `api` client and save it as JSON
+        json_data = api.GetStatus(item[2]).AsJsonString()
+        with open(raw_dir + item[2] + '.json', "w") as f:
+            f.write(json_data + "\n")
 
         # stay in Twitter API rate limits
         print '    pausing %d sec to obey Twitter API rate limits' % \
@@ -236,8 +252,6 @@ def main(data_path):
     build_output_corpus(user_params['outList'], user_params['rawDir'],
                         total_list)
 
-    return
-
 
 if __name__ == '__main__':
-    main(os.path.join("..", "data"))
+    main("data")
diff --git a/ch06/utils.py b/ch06/utils.py
@@ -7,15 +7,21 @@
 
 import os
 import collections
+import csv
+import json
 
 from matplotlib import pylab
 import numpy as np
 
-DATA_DIR = os.path.join("..", "data")
-CHART_DIR = os.path.join("..", "charts")
 
-import csv
-import json
+DATA_DIR = "data"
+CHART_DIR = "charts"
+
+if not os.path.exists(DATA_DIR):
+    raise RuntimeError("Expecting directory 'data' in current path")
+
+if not os.path.exists(CHART_DIR):
+    os.mkdir(CHART_DIR)
 
 
 def tweak_labels(Y, pos_sent_list):
@@ -58,40 +64,9 @@ def load_sanders_data(dirname=".", line_count=-1):
     tweets = np.asarray(tweets)
     labels = np.asarray(labels)
 
-    # return topics, tweets, labels
     return tweets, labels
 
 
-def load_kaggle_data(filename="kaggle/training.txt", line_count=-1):
-    count = 0
-
-    labels = []
-    texts = []
-
-    read_texts = set([])
-
-    for line in open(os.path.join(DATA_DIR, filename), "r"):
-        count += 1
-        if line_count > 0 and count > line_count:
-            break
-
-        label, text = line.split("\t")
-
-        # Some tweets occur multiple times, so we have to
-        # remove them to not bias the training set.
-        if text in read_texts:
-            continue
-        read_texts.add(text)
-
-        labels.append(label)
-        texts.append(text)
-
-    texts = np.asarray(texts)
-    labels = np.asarray(labels, dtype=np.int)
-
-    return texts, labels
-
-
 def plot_pr(auc_score, name, phase, precision, recall, label=None):
     pylab.clf()
     pylab.figure(num=None, figsize=(5, 4))