2 files changed: +43 −0 lines changed

@@ -49,3 +49,7 @@ Scripts
 
 blei_lda.py
     Computes LDA using the AP Corpus.
+wikitopics_create.py
+    Create the topic model for Wikipedia using LDA (must download wikipedia database first)
+wikitopics_create_hdp.py
+    Create the topic model for Wikipedia using HDP (must download wikipedia database first)
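The other added script, wikitopics_create.py, is not shown in this diff. Judging from its README entry and from the HDP script below, a minimal sketch of the LDA variant could look like the following; the topic count of 100 and the output file name 'wiki_lda.pkl' are assumptions, and only the corpus paths are taken from the script in this pull request. The practical difference is that LDA needs the number of topics fixed up front, while HDP infers it from the data.

# Hypothetical sketch of wikitopics_create.py (not part of this diff):
# build an LDA topic model on the preprocessed Wikipedia corpus.
from __future__ import print_function
import logging
import gensim

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)

# Load the same preprocessed Wikipedia corpus (id2word & mm) used by the HDP script:
id2word = gensim.corpora.Dictionary.load_from_text(
    'data/wiki_en_output_wordids.txt.bz2')
mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm')

# Unlike HDP, LDA takes the number of topics as a parameter (100 is an assumed value):
model = gensim.models.ldamodel.LdaModel(
    corpus=mm,
    id2word=id2word,
    num_topics=100,
    chunksize=10000)

# Save the trained model (file name is an assumption):
model.save('wiki_lda.pkl')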
wikitopics_create_hdp.py (new file):

# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

from __future__ import print_function
import logging
import gensim
import numpy as np

# Set up logging in order to get progress information as the model is being built:
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)

# Load the preprocessed corpus (id2word & mm):
id2word = gensim.corpora.Dictionary.load_from_text(
    'data/wiki_en_output_wordids.txt.bz2')
mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm')

# Calling the constructor is enough to build the model.
# This call will take a few hours!
model = gensim.models.hdpmodel.HdpModel(
    corpus=mm,
    id2word=id2word,
    chunksize=10000)

# Save the model so we do not need to learn it again.
model.save('wiki_hdp.pkl')

# Compute the document/topic matrix
topics = np.zeros((len(mm), model.num_topics))
for di, doc in enumerate(mm):
    doc_top = model[doc]
    for ti, tv in doc_top:
        topics[di, ti] += tv
np.save('topics_hdp.npy', topics)
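As a follow-up sketch (not part of this pull request), the saved document/topic matrix can be loaded and inspected later without rebuilding the model; the particular statistics below are illustrative choices, not taken from the diff.

# Hypothetical follow-up: inspect the document/topic matrix saved above.
import numpy as np

topics = np.load('topics_hdp.npy')

# Total weight assigned to each topic across all Wikipedia documents:
weights = topics.sum(axis=0)
print('Most prominent topic index:', weights.argmax())

# Average number of topics with non-zero weight per document:
print('Mean topics per document:', (topics > 0).sum(axis=1).mean())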