Skip to content

Commit 245e04e

Browse files
committed
ENH Save topics matrix in numpy format
1 parent 864a706 commit 245e04e

File tree

1 file changed

+19
-1
lines changed

1 file changed

+19
-1
lines changed

ch04/wikitopics_create.py

Lines changed: 19 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,9 @@
88
from __future__ import print_function
99
import logging
1010
import gensim
11+
import numpy as np
12+
13+
NR_OF_TOPICS = 100
1114

1215
# Set up logging in order to get progress information as the model is being built:
1316
logging.basicConfig(
@@ -24,10 +27,25 @@
2427
model = gensim.models.ldamodel.LdaModel(
2528
corpus=mm,
2629
id2word=id2word,
27-
num_topics=100,
30+
num_topics=NR_OF_TOPICS,
2831
update_every=1,
2932
chunksize=10000,
3033
passes=1)
3134

3235
# Save the model so we do not need to learn it again.
3336
model.save('wiki_lda.pkl')
37+
38+
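# Compute the document/topic matrix: one row per document in the corpus, one column per topic, filled with the per-document topic weights returned by the model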
39+
topics = np.zeros((len(mm), model.num_topics))
40+
for di,doc in enumerate(mm):
41+
doc_top = model[doc]
42+
for ti,tv in doc_top:
43+
topics[di,ti] += tv
44+
np.save('topics.npy', topics)
45+
46+
# Alternatively, we could create a sparse matrix and save that instead. This alternative
47+
# saves disk space, at the cost of slightly more complex code (left commented out below):
48+
49+
## from scipy import sparse, io
50+
## sp = sparse.csr_matrix(topics)
51+
## io.savemat('topics.mat', {'topics': sp})

0 commit comments

Comments
 (0)