Skip to content

Commit c1d52da

Browse files
author
Miltos Allamanis
committed
Add a topic-level visualization that is auto-computed.
1 parent dcc4c19 commit c1d52da

File tree

4 files changed

+147
-0
lines changed

4 files changed

+147
-0
lines changed

.github/workflows/deploy.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ jobs:
2626
python -m pip install transformers sklearn numpy
2727
python -m pip install torch==1.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
2828
python ${{ github.workspace }}/etc/compute_embeddings.py ${{ github.workspace }}/_site/paper-abstracts.json ${{ github.workspace }}/_site/tsne.json
29+
- name: Compute topics
30+
run: |
31+
python -m pip install nltk gensim
32+
python ${{ github.workspace }}/etc/compute_topics.py ${{ github.workspace }}/_site/paper-abstracts.json ${{ github.workspace }}/_site/topics.json
2933
- name: Deploy
3034
uses: peaceiris/actions-gh-pages@v3
3135
with:

_includes/sidebar.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ <h1>
1515
<a class="sidebar-nav-item{% if page.url == "/papers.html" %} active{% endif %}" href="{% link papers.html %}">List of Papers</a>
1616
<a class="sidebar-nav-item{% if page.url == "/tags.html" %} active{% endif %}" href="{% link tags.html %}">Papers by Tag</a>
1717
<a class="sidebar-nav-item{% if page.url == "/tsne-viz.html" %} active{% endif %}" href="{% link tsne-viz.html %}">2D Map of Papers</a>
18+
<a class="sidebar-nav-item{% if page.url == "/topic-viz.html" %} active{% endif %}" href="{% link topic-viz.html %}">Topic-based Explorer</a>
1819

1920
<a class="sidebar-nav-item{% if page.url == "/base-taxonomy/" %} active{% endif %}" href="{% link base-taxonomy/index.md %}">Core Taxonomy</a>
2021

etc/compute_topics.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
"""Compute an LDA topic model over the ML4Code paper abstracts.

Reads the papers JSON (one record per paper with title/abstract/tags),
fits a gensim LDA model, and writes a JSON file containing the top tokens
of each topic plus the per-paper topic distribution, for the site's
topic-based explorer page.
"""
import argparse
import json

import nltk

# All NLTK data used below must be fetched up front:
#  - 'stopwords' for the English stop-word list,
#  - 'wordnet' for WordNetLemmatizer,
#  - 'punkt' for nltk.word_tokenize (without it the script fails with a
#    LookupError the first time a paper is tokenized).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import LdaModel


def parse_arguments():
    """Parse CLI arguments: input papers JSON, output path, topic count."""
    parser = argparse.ArgumentParser(description="Topic Model of Papers in ML4Code")

    parser.add_argument("json", help="the path of the json containing all papers.")
    parser.add_argument("outpath", help="the target path of the visualization data.")
    parser.add_argument("--num-topics", default=20, help="The number of topics.", type=int)
    return parser.parse_args()


def _tokenize_papers(data, stop_words):
    """Return a list of lemmatized, lowercased, alphabetic token lists.

    One token list per paper, built from its title, abstract (with HTML
    paragraph tags stripped), and tags; stop-words are removed.
    """
    lemmatizer = WordNetLemmatizer()
    tokens_per_paper = []
    for paper_info in data:
        text = paper_info["title"] + " " + paper_info["abstract"].replace("<p>", " ").replace("</p>", " ") + " ".join(paper_info["tags"])
        lemmatized_tokens = [lemmatizer.lemmatize(w).lower() for w in nltk.word_tokenize(text)
                             if w.lower() not in stop_words and w.isalpha()]
        tokens_per_paper.append(lemmatized_tokens)
    return tokens_per_paper


def main():
    """Fit the topic model and dump topics + per-paper distributions to JSON."""
    args = parse_arguments()
    with open(args.json) as f:
        data = json.load(f)

    print(f"Num papers: {len(data)}")

    # Renamed from the original module name to avoid shadowing
    # nltk.corpus.stopwords; extended with corpus-specific noise words.
    stop_words = set(stopwords.words('english'))
    stop_words.update(["one", "two", "using"])

    tokens_per_paper = _tokenize_papers(data, stop_words)

    dictionary = Dictionary(tokens_per_paper)
    # Drop tokens appearing in <20 papers or in >50% of all papers.
    dictionary.filter_extremes(no_below=20, no_above=0.5)

    corpus = [dictionary.doc2bow(doc) for doc in tokens_per_paper]

    passes = 100
    iterations = 1000

    _ = dictionary[0]  # This is needed to "load" the dictionary (populates id2token).

    model = LdaModel(
        corpus=corpus,
        id2word=dictionary.id2token,
        chunksize=1000,
        alpha='asymmetric',
        eta='auto',
        iterations=iterations,
        num_topics=args.num_topics,
        passes=passes,
        eval_every=None
    )

    # Top tokens per topic: always keep the two highest-probability tokens,
    # then any further token (up to 4 total) with probability > 0.025.
    topic_tokens = []
    for topic_id in range(args.num_topics):
        topic_tokens.append([dictionary.id2token[term[0]]
                             for rank, term in enumerate(model.get_topic_terms(topic_id, topn=4))
                             if rank < 2 or term[1] > 0.025])

    paper_topic_data = []
    for paper, paper_bow in zip(data, corpus):
        topic_distr = model.get_document_topics(paper_bow, minimum_probability=0)
        paper_topic_data.append({
            "key": paper["key"],
            "year": paper["year"],
            "title": paper["title"],
            # NOTE: json.dump converts the int topic ids to string keys; the
            # JS consumer indexes with numbers, which coerce to the same keys.
            "topic_distr": {t: float(p) for t, p in topic_distr}
        })

    with open(args.outpath, 'w') as f:
        json.dump({
            "topics": topic_tokens,
            "paper_data": paper_topic_data
        }, f)


if __name__ == "__main__":
    main()

topic-viz.html

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
---
layout: default
title: Explore ML4Code papers with Topics
description: A topic model for the papers in the ML4Code survey
---
<h2>Topic-based Explorer</h2>
<p>Using topic modelling, the following topics have been extracted. The top stemmed words appear below.
Please move the sliders to show the papers most related to the selected topics.</p>
<div id="topicslider">
</div>
<ul id="toppapers">
  <li>Please move the sliders to look at the papers.</li>
</ul>


<script>
// Populated once /topics.json loads: the per-paper topic distributions and
// the number of topics in the fitted model.
var all_papers = null;
var num_topics = -1;

$(document).ready(
    function() {
        $.getJSON('/topics.json', function(data) {
            all_papers = data.paper_data;
            num_topics = data.topics.length;
            // Build one labelled slider per topic.
            let html = "";
            for (let i = 0; i < num_topics; i++) {
                html += '<span style="white-space: nowrap;">' + data.topics[i].join(", ") + ' <input type="range" min="0" max="10" value="0" style="width:50px" id="topicSlider' + i + '"></span> ';
            }
            $("#topicslider").append(html);
            for (let i = 0; i < num_topics; i++) {
                $("#topicSlider" + i).on("change", renderPapers);
            }
        });
    });

// Weighted relevance of one paper: sum over topics of
// (slider value * paper's probability for that topic).
function scorePaper(paper_id) {
    let score = 0;
    const topic_dist = all_papers[paper_id].topic_distr;
    for (let i = 0; i < num_topics; i++) {
        score += $("#topicSlider" + i).val() * topic_dist[i];
    }
    return score;
}

// Re-rank all papers by their slider-weighted score and list the top 20.
function renderPapers(e, u) {
    const paper_idxs = [];
    for (let i = 0; i < all_papers.length; i++) {
        paper_idxs.push([i, scorePaper(i)]);
    }
    paper_idxs.sort(function(a, b) { return b[1] - a[1]; });

    $("#toppapers").text("");
    // Guard against corpora with fewer than 20 papers.
    const num_shown = Math.min(20, paper_idxs.length);
    for (let i = 0; i < num_shown; i++) {
        const paper = all_papers[paper_idxs[i][0]];
        $("#toppapers").append("<li><a href='/publications/" + paper.key + "'>" + paper.title + "</a>. " + paper.year + "</li>");
    }
}
</script>

0 commit comments

Comments
 (0)