Skip to content

Commit 28b8273

Browse files
committed
ENH Better plotting for wikipedia topics
1 parent 245e04e commit 28b8273

File tree

2 files changed

+27
-15
lines changed

2 files changed

+27
-15
lines changed

ch04/wikitopics_plot.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,29 +28,29 @@
2828
# Load the precomputed model
2929
model = gensim.models.ldamodel.LdaModel.load('wiki_lda.pkl')
3030

31-
# Compute topics for all elements of the model:
32-
topics = [model[doc] for doc in mm]
33-
lens = np.array([len(t) for t in topics])
31+
topics = np.load('topics.npy', mmap_mode='r')
32+
33+
# Compute the number of topics mentioned in each document
34+
lens = (topics > 0).sum(1)
3435
print('Mean number of topics mentioned: {0:.3}'.format(np.mean(lens)))
3536
print('Percentage of articles mentioning less than 10 topics: {0:.1%}'.format(np.mean(lens <= 10)))
3637

37-
38-
counts = np.zeros(100)
39-
for doc_top in topics:
40-
for ti,tv in doc_top:
41-
counts[ti] += tv
38+
# weights[i] is the total weight of topic i, summed over all documents
39+
weights = topics.sum(0)
4240

4341
# Retrieve the most heavily used topic and plot it as a word cloud:
44-
words = model.show_topic(counts.argmax(), 64)
45-
create_cloud('Wikipedia_most.png', words)
42+
words = model.show_topic(weights.argmax(), 64)
43+
44+
# The parameter ``maxsize`` often needs some manual tuning to make it look nice.
45+
create_cloud('Wikipedia_most.png', words, maxsize=410, fontname='Neucha')
4646
print(words)
4747
print()
4848
print()
4949
print()
5050

5151
# Retrieve the **least** heavily used topic and plot it as a word cloud:
52-
words = model.show_topic(counts.argmin(), 64)
53-
create_cloud('Wikipedia_least.png', words)
52+
words = model.show_topic(weights.argmin(), 64)
53+
create_cloud('Wikipedia_least.png', words, maxsize=180, fontname='Neucha')
5454
print(words)
5555
print()
5656
print()

ch04/wordcloud.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,19 @@
11
from __future__ import print_function
22
warned_of_error = False
33

4-
def create_cloud(oname, words):
4+
def create_cloud(oname, words,maxsize=120, fontname='Lobster'):
5+
'''Creates a word cloud (when pytagcloud is installed)
6+
7+
Parameters
8+
----------
9+
oname : output filename
10+
words : list of (value,str)
11+
maxsize : int, optional
12+
Size of maximum word. The best setting for this parameter will often
13+
require some manual tuning for each input.
14+
fontname : str, optional
15+
Font to use.
16+
'''
517
try:
618
from pytagcloud import create_tag_image, make_tags
719
except ImportError:
@@ -15,5 +27,5 @@ def create_cloud(oname, words):
1527
# We also need to flip the order as gensim returns (value, word), whilst
1628
# pytagcloud expects (word, value):
1729
words = [(w,int(v*10000)) for v,w in words]
18-
tags = make_tags(words, maxsize=120)
19-
create_tag_image(tags, oname, size=(1800, 1200), fontname='Lobster')
30+
tags = make_tags(words, maxsize=maxsize)
31+
create_tag_image(tags, oname, size=(1800, 1200), fontname=fontname)

0 commit comments

Comments
 (0)