Skip to content

Commit be5afa9

Browse files
committed
RFCT Separate script for model creation & saving
1 parent 1d08594 commit be5afa9

File tree

1 file changed

+9
-25
lines changed

1 file changed

+9
-25
lines changed

ch04/wikitopics.py renamed to ch04/wikitopics_create.py

Lines changed: 9 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -6,44 +6,28 @@
66
# It is made available under the MIT License
77

88
from __future__ import print_function
9-
import numpy as np
109
import logging
1110
import gensim
11+
12+
# Set up logging in order to get progress information as the model is being built:
1213
logging.basicConfig(
1314
format='%(asctime)s : %(levelname)s : %(message)s',
1415
level=logging.INFO)
16+
17+
# Load the preprocessed corpus (id2word & mm):
1518
id2word = gensim.corpora.Dictionary.load_from_text(
1619
'data/wiki_en_output_wordids.txt.bz2')
1720
mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm')
21+
22+
# Calling the constructor is enough to build the model
23+
# This call will take a few hours!
1824
model = gensim.models.ldamodel.LdaModel(
1925
corpus=mm,
2026
id2word=id2word,
2127
num_topics=100,
2228
update_every=1,
2329
chunksize=10000,
2430
passes=1)
25-
model.save('wiki_lda.pkl')
26-
topics = [model[doc] for doc in mm]
27-
lens = np.array([len(t) for t in topics])
28-
print(np.mean(lens <= 10))
29-
print(np.mean(lens))
3031

31-
counts = np.zeros(100)
32-
for doc_top in topics:
33-
for ti, _ in doc_top:
34-
counts[ti] += 1
35-
36-
for doc_top in topics:
37-
for ti, _ in doc_top:
38-
counts[ti] += 1
39-
40-
words = model.show_topic(counts.argmax(), 64)
41-
print(words)
42-
print()
43-
print()
44-
print()
45-
words = model.show_topic(counts.argmin(), 64)
46-
print(words)
47-
print()
48-
print()
49-
print()
32+
# Save the model so we do not need to learn it again.
33+
model.save('wiki_lda.pkl')

0 commit comments

Comments
 (0)