ENH Better plotting for wikipedia topics

luispedro · luispedro · commit 28b8273961ce · 2014-09-01T20:14:22.000+02:00
diff --git a/ch04/wikitopics_plot.py b/ch04/wikitopics_plot.py
@@ -28,29 +28,29 @@
 # Load the precomputed model
 model = gensim.models.ldamodel.LdaModel.load('wiki_lda.pkl')
 
-# Compute topics for all elements of the model:
-topics = [model[doc] for doc in mm]
-lens = np.array([len(t) for t in topics])
+topics = np.load('topics.npy', mmap_mode='r')
+
+# Compute the number of topics mentioned in each document
+lens = (topics > 0).sum(1)
 print('Mean number of topics mentioned: {0:.3}'.format(np.mean(lens)))
 print('Percentage of articles mentioning less than 10 topics: {0:.1%}'.format(np.mean(lens <= 10)))
 
-
-counts = np.zeros(100)
-for doc_top in topics:
-    for ti,tv in doc_top:
-        counts[ti] += tv
+# Weights are the total weight of each topic, summed over all documents
+weights = topics.sum(0)
 
 # Retrieve the most heavily used topic and plot it as a word cloud:
-words = model.show_topic(counts.argmax(), 64)
-create_cloud('Wikipedia_most.png', words)
+words = model.show_topic(weights.argmax(), 64)
+
+# The parameter ``maxsize`` often needs some manual tuning to make it look nice.
+create_cloud('Wikipedia_most.png', words, maxsize=410, fontname='Neucha')
 print(words)
 print()
 print()
 print()
 
 # Retrieve the **least** heavily used topic and plot it as a word cloud:
-words = model.show_topic(counts.argmin(), 64)
-create_cloud('Wikipedia_least.png', words)
+words = model.show_topic(weights.argmin(), 64)
+create_cloud('Wikipedia_least.png', words, maxsize=180, fontname='Neucha')
 print(words)
 print()
 print()
diff --git a/ch04/wordcloud.py b/ch04/wordcloud.py
@@ -1,7 +1,19 @@
 from __future__ import print_function
 warned_of_error = False
 
-def create_cloud(oname, words):
+def create_cloud(oname, words,maxsize=120, fontname='Lobster'):
+    '''Creates a word cloud (when pytagcloud is installed)
+
+    Parameters
+    ----------
+    oname : output filename
+    words : list of (value,str)
+    maxsize : int, optional
+        Size of maximum word. The best setting for this parameter will often
+        require some manual tuning for each input.
+    fontname : str, optional
+        Font to use.
+    '''
     try:
         from pytagcloud import create_tag_image, make_tags
     except ImportError:
@@ -15,5 +27,5 @@ def create_cloud(oname, words):
     # We also need to flip the order as gensim returns (value, word), whilst
     # pytagcloud expects (word, value):
     words = [(w,int(v*10000)) for v,w in words]
-    tags = make_tags(words, maxsize=120)
-    create_tag_image(tags, oname, size=(1800, 1200), fontname='Lobster')
+    tags = make_tags(words, maxsize=maxsize)
+    create_tag_image(tags, oname, size=(1800, 1200), fontname=fontname)