RFCT Separate script for model creation & saving

luispedro · luispedro · commit be5afa9e5626 · 2014-08-27T15:06:22.000+02:00
diff --git a/ch04/wikitopics_create.py b/ch04/wikitopics_create.py
@@ -6,44 +6,28 @@
 # It is made available under the MIT License
 
 from __future__ import print_function
-import numpy as np
 import logging
 import gensim
+
+# Set up logging in order to get progress information as the model is being built:
 logging.basicConfig(
     format='%(asctime)s : %(levelname)s : %(message)s',
     level=logging.INFO)
+
+# Load the preprocessed corpus (id2word dictionary & TF-IDF mm corpus):
 id2word = gensim.corpora.Dictionary.load_from_text(
     'data/wiki_en_output_wordids.txt.bz2')
 mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm')
+
+# Calling the constructor with a corpus is enough to build the model.
+# This call will take a few hours!
 model = gensim.models.ldamodel.LdaModel(
     corpus=mm,
     id2word=id2word,
     num_topics=100,
     update_every=1,
     chunksize=10000,
     passes=1)
-model.save('wiki_lda.pkl')
-topics = [model[doc] for doc in mm]
-lens = np.array([len(t) for t in topics])
-print(np.mean(lens <= 10))
-print(np.mean(lens))
 
-counts = np.zeros(100)
-for doc_top in topics:
-    for ti, _ in doc_top:
-        counts[ti] += 1
-
-for doc_top in topics:
-    for ti, _ in doc_top:
-        counts[ti] += 1
-
-words = model.show_topic(counts.argmax(), 64)
-print(words)
-print()
-print()
-print()
-words = model.show_topic(counts.argmin(), 64)
-print(words)
-print()
-print()
-print()
+# Save the model so we do not need to learn it again.
+model.save('wiki_lda.pkl')