import argparse
import json
import nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')  # required by nltk.word_tokenize below

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import LdaModel


def parse_arguments():
    parser = argparse.ArgumentParser(description="Topic Model of Papers in ML4Code")

    parser.add_argument("json", help="the path to the JSON file containing all papers.")
    parser.add_argument("outpath", help="the target path for the visualization data.")
    parser.add_argument("--num-topics", default=20, type=int, help="The number of topics.")
    return parser.parse_args()

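# Example invocation (script and file names below are illustrative, not part of the repo):
#   python topic_model.py papers.json topic-data.json --num-topics 20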
if __name__ == "__main__":
    args = parse_arguments()
    with open(args.json) as f:
        data = json.load(f)

    print(f"Num papers: {len(data)}")


    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    stop_words.update(["one", "two", "using"])

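    # Each paper record is assumed to provide at least the fields used below:
    # "key", "year", "title", "abstract" (possibly wrapped in <p> tags), and "tags".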
    tokens_per_paper = []
    for paper_info in data:
        text = (paper_info["title"] + " "
                + paper_info["abstract"].replace("<p>", " ").replace("</p>", " ")
                + " " + " ".join(paper_info["tags"]))
        lemmatized_tokens = [lemmatizer.lemmatize(w).lower() for w in nltk.word_tokenize(text)
                             if w.lower() not in stop_words and w.isalpha()]
        tokens_per_paper.append(lemmatized_tokens)

    dictionary = Dictionary(tokens_per_paper)
    dictionary.filter_extremes(no_below=20, no_above=0.5)

    corpus = [dictionary.doc2bow(doc) for doc in tokens_per_paper]

    passes = 100
    iterations = 1000

    temp = dictionary[0]  # This is needed to "load" the dictionary so that dictionary.id2token is populated.

    model = LdaModel(
        corpus=corpus,
        id2word=dictionary.id2token,
        chunksize=1000,
        alpha='asymmetric',
        eta='auto',
        iterations=iterations,
        num_topics=args.num_topics,
        passes=passes,
        eval_every=None
    )

    # For each topic keep the two most probable terms, plus any further term
    # (up to four in total) whose probability exceeds 0.025.
    topic_tokens = []
    for topicid in range(args.num_topics):
        topic_tokens.append([dictionary.id2token[k[0]]
                             for i, k in enumerate(model.get_topic_terms(topicid, topn=4))
                             if i < 2 or k[1] > 0.025])

    paper_topic_data = []
    for paper, paper_bow in zip(data, corpus):
        topic_distr = model.get_document_topics(paper_bow, minimum_probability=0)
        paper_topic_data.append({
            "key": paper["key"],
            "year": paper["year"],
            "title": paper["title"],
            "topic_distr": {t: float(p) for t, p in topic_distr}
        })

    with open(args.outpath, 'w') as f:
        json.dump({
            "topics": topic_tokens,
            "paper_data": paper_topic_data
        }, f)