diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000..4a8183d7 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,7 @@ + +Thank you for submitting a PR! Please replace this text with a high-level description of the PR. Also ensure the following for new publications: + + +- [ ] Files for new publications are in the `_publications` folder. +- [ ] The name of each file is `lastnameYEARfirstword.markdown`, _e.g._ `smith2019neural` for a Smith _et al._ paper titled "A neural approach to the Universe". +- [ ] Consider using tags that already exist. We aim to avoid variations or introducing new ones when possible. This helps with searching across this literature review. diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index f13646cb..b6e8d907 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -19,13 +19,18 @@ jobs: - name: Setup Python uses: actions/setup-python@v2 with: - python-version: '3.7' + python-version: '3.8' architecture: x64 - name: Compute tSNE Embeddings run: | - python -m pip install transformers sklearn numpy - python -m pip install torch==1.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html + python -m pip install transformers scikit-learn numpy + python -m pip install torch==1.10.0+cpu -f https://download.pytorch.org/whl/torch_stable.html python ${{ github.workspace }}/etc/compute_embeddings.py ${{ github.workspace }}/_site/paper-abstracts.json ${{ github.workspace }}/_site/tsne.json + - name: Compute topics + run: | + python -m pip install nltk gensim scipy + python ${{ github.workspace }}/etc/compute_topics.py ${{ github.workspace }}/_site/paper-abstracts.json ${{ github.workspace }}/_site/topics.json + python ${{ github.workspace }}/etc/compute_related.py ${{ github.workspace }}/_site/paper-abstracts.json ${{ github.workspace }}/_site/publications-metadata/ - name: Deploy uses: peaceiris/actions-gh-pages@v3 with: diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..c3200447 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,22 @@ +# YAML 1.2 +--- +authors: + - + family-names: Allamanis + given-names: Miltiadis + orcid: "https://orcid.org/0000-0002-5819-9900" + - + family-names: Barr + given-names: "Earl T" + - + family-names: Devanbu + given-names: Premkumar + - + family-names: Sutton + given-names: Charles +cff-version: "1.1.0" +doi: "10.1145/3212695" +message: "For this live survey and the associated paper, please cite as below." +repository-code: "https://ml4code.github.io/" +title: "A survey of machine learning for big code and naturalness" +... \ No newline at end of file diff --git a/_config.yml b/_config.yml index b99a8666..2b7f88d2 100644 --- a/_config.yml +++ b/_config.yml @@ -13,3 +13,6 @@ collections: plugins_dir: - jekyll-sitemap - jekyll-seo-tag + +sass: + style: compressed diff --git a/_includes/sidebar.html b/_includes/sidebar.html index 1535bd8c..6e5e71b6 100644 --- a/_includes/sidebar.html +++ b/_includes/sidebar.html @@ -1,3 +1,4 @@ +Contribute to ML4Code
{% for additional_link in page.additional_links %}
[{{ additional_link.name }}]
{% endfor %}
-
{% for tag in page.tags %}
{{ content }}
+{{ content }}
+ ++
diffs from which NMT can generate high-quality messages are similar to one or more training
diffs at the token level. (2) About 16% of the commit messages in Jiang et al.’s dataset are noisy because they are automatically generated or describe repetitive trivial changes. (3) The performance of NMT declines considerably after removing such noisy commit messages. In addition, NMT is complicated and time-consuming. Inspired by our first finding, we proposed a simpler and faster approach, named NNGen (Nearest Neighbor Generator), to generate concise commit messages using the nearest neighbor algorithm (sketched below). Our experimental results show that NNGen is over 2,600 times faster than NMT, and outperforms NMT in terms of BLEU (an accuracy measure that is widely used to evaluate machine translation systems) by 21%. Finally, we also discuss some observations for the road ahead for automated commit message generation to inspire other researchers. diff --git a/_publications/liu2019deepfuzz.markdown b/_publications/liu2019deepfuzz.markdown index 7c8a302d..2466aa6d 100644 --- a/_publications/liu2019deepfuzz.markdown +++ b/_publications/liu2019deepfuzz.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "DeepFuzz: Automatic Generation of Syntax Valid C Programs for Fuzz Testing" -authors: X. Liu, X. Li, R. Prajapati, D. Wu +authors: Xiao Liu, Xiaoting Li, Rupesh Prajapati, Dinghao Wu conference: AAAI year: 2019 -bibkey: liu2019deepfuzz -tags: ["fuzzing", "generation"] +tags: ["fuzzing", "code generation"] --- Compilers are among the most fundamental programming tools for building software. However, production compilers diff --git a/_publications/liu2019generating.markdown b/_publications/liu2019generating.markdown index e1d2e9ff..027d3a2b 100644 --- a/_publications/liu2019generating.markdown +++ b/_publications/liu2019generating.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Generating commit messages from diffs using pointer-generator network" -authors: Q. Liu, Z. Liu, H. Zhu, H. Fan, B. Du, Y. Qian +authors: Qin Liu, Zihe Liu, Hongming Zhu, Hongfei Fan, Bowen Du, Yu Qian conference: MSR year: 2019 -bibkey: liu2019generating tags: ["edit"] --- The commit messages in source code repositories are valuable but not easy to generate manually in time for tracking issues, reporting bugs, and understanding code. Recently published works indicate that deep neural machine translation approaches have drawn considerable attention for the automatic generation of commit messages. However, they could not deal with out-of-vocabulary (OOV) words, which are essential context-specific identifiers such as class names and method names in code diffs. In this paper, we propose PtrGNCMsg, a novel approach which is based on an improved sequence-to-sequence model with the pointer-generator network to translate code diffs into commit messages. By searching the smallest identifier set with the highest probability, PtrGNCMsg outperforms recent approaches based on neural machine translation, and first enables the prediction of OOV words. The experimental results based on the corpus of diffs and manual commit messages from the top 2,000 Java projects in GitHub show that PtrGNCMsg outperforms the state-of-the-art approach with improved BLEU by 1.02, ROUGE-1 by 4.00 and ROUGE-L by 3.78, respectively.
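The retrieval idea behind NNGen (liu2018neural, above) is simple enough to sketch. The following is a minimal illustration, not the authors' implementation: assuming scikit-learn is available, it represents diffs as bag-of-words vectors and returns the commit message of the most similar training diff, whereas the paper additionally re-ranks the top candidates by BLEU. The toy diffs and messages are made up.

```python
# Minimal sketch of NNGen-style nearest-neighbor commit message retrieval.
# Illustrative only: the real NNGen re-ranks top-k candidates by BLEU.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

train_diffs = ["- int a = 0 + int a = 1", "+ public void close() throws IOException"]
train_msgs = ["Fix initial value of a", "Add close method"]

vectorizer = CountVectorizer(token_pattern=r"\S+", lowercase=False)
train_vecs = vectorizer.fit_transform(train_diffs)  # bag-of-words over diff tokens

def nngen(query_diff: str) -> str:
    """Return the message of the training diff most similar to query_diff."""
    sims = cosine_similarity(vectorizer.transform([query_diff]), train_vecs)[0]
    return train_msgs[sims.argmax()]

print(nngen("- int a = 0 + int a = 2"))  # -> "Fix initial value of a"
```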
diff --git a/_publications/liu2019learning.markdown b/_publications/liu2019learning.markdown index cca027d5..56c8754b 100644 --- a/_publications/liu2019learning.markdown +++ b/_publications/liu2019learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning to Spot and Refactor Inconsistent Method Names" -authors: K. Liu, D. Kim, T. F. Bissyandé, T. Kim, K. Kim, A. Koyuncu, S. Kim, Y. Le Traon +authors: Kui Liu, Dongsun Kim, Tegawendé F. Bissyandé, Taeyoung Kim, Kisub Kim, Anil Koyuncu, Suntae Kim, Yves Le Traon conference: ICSE year: 2019 -bibkey: liu2019learning tags: ["naming"] --- To ensure code readability and facilitate software maintenance, program methods must be named properly. In particular, method names must be consistent with the corresponding method implementations. Debugging method names remains an important topic in the literature, where various approaches analyze commonalities among method names in a large dataset to detect inconsistent method names and suggest better ones. We note that the state-of-the-art does not analyze the implemented code itself to assess consistency. We thus propose a novel automated approach to debugging method names based on the analysis of consistency between method names and method code. The approach leverages deep feature representation techniques adapted to the nature of each artifact. Experimental results on over 2.1 million Java methods show that we can achieve up to 15 percentage points improvement over the state-of-the-art, establishing a record performance of 67.9% F1-measure in identifying inconsistent method names. We further demonstrate that our approach yields up to 25% accuracy in suggesting full names, while the state-of-the-art lags far behind at 1.1% accuracy. Finally, we report on our success in fixing 66 inconsistent method names in a live study on projects in the wild. \ No newline at end of file diff --git a/_publications/liu2019neural.markdown b/_publications/liu2019neural.markdown index 53de5147..cacfc801 100644 --- a/_publications/liu2019neural.markdown +++ b/_publications/liu2019neural.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Neural query expansion for code search" -authors: J. Liu, S. Kim, V. Murali, S. Chaudhuri, S. Chandra +authors: Jason Liu, Seohyun Kim, Vijayaraghavan Murali, Swarat Chaudhuri, Satish Chandra conference: MAPL year: 2019 -bibkey: liu2019neural tags: ["search"] --- Searching repositories of existing source code for code snippets is a key task in software engineering. Over the years, many approaches to this problem have been proposed. One recent tool, called NCS, takes in a natural language query and outputs relevant code snippets, often being able to correctly answer Stack Overflow questions. But what happens when the developer doesn’t provide a query with a clear intent? What if shorter queries are used to demonstrate a more vague intent? diff --git a/_publications/liu2020automating.markdown b/_publications/liu2020automating.markdown new file mode 100644 index 00000000..e149d68d --- /dev/null +++ b/_publications/liu2020automating.markdown @@ -0,0 +1,9 @@ +--- +layout: publication +title: "Automating Just-In-Time Comment Updating" +authors: Zhongxin Liu, Xin Xia, Meng Yan, Shanping Li +conference: ASE +year: 2020 +tags: ["documentation"] +--- +Code comments are valuable for program comprehension and software maintenance, and also require maintenance with code evolution.
However, when changing code, developers sometimes neglect updating the related comments, bringing in inconsistent or obsolete comments (a.k.a. bad comments). Such comments are detrimental since they may mislead developers and lead to future bugs. Therefore, it is necessary to fix and avoid bad comments. In this work, we argue that bad comments can be reduced and even avoided by automatically performing comment updates with code changes. We refer to this task as “Just-In-Time (JIT) Comment Updating” and propose an approach named CUP (Comment UPdater) to automate this task. CUP can be used to assist developers in updating comments during code changes and can consequently help avoid the introduction of bad comments. Specifically, CUP leverages a novel neural sequence-to-sequence model to learn comment update patterns from extant code-comment co-changes (a toy co-change pair is sketched after the next entry) and can automatically generate a new comment based on its corresponding old comment and code change. Several customized enhancements, such as a special tokenizer and a novel co-attention mechanism, are introduced in CUP to handle the characteristics of this task. We build a dataset with over 108K comment-code co-change samples and evaluate CUP on it. The evaluation results show that CUP outperforms an information-retrieval-based baseline and a rule-based baseline by substantial margins, and can reduce developers' edits required for JIT comment updating. In addition, the comments generated by our approach are identical to those updated by developers in 1612 (16.7%) test samples, 7 times more than the best-performing baseline. diff --git a/_publications/liu2022open.markdown b/_publications/liu2022open.markdown new file mode 100644 index 00000000..1ff11cdb --- /dev/null +++ b/_publications/liu2022open.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: Open-ended Knowledge Tracing +authors: Naiming Liu, Zichao Wang, Richard G. Baraniuk, Andrew Lan +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2203.03716"} + - {name: "code", url: "https://github.com/lucy66666/OKT"} +tags: ["education", "code generation"] +--- +In education applications, knowledge tracing refers to the problem of estimating students' time-varying concept/skill mastery level from their past responses to questions and predicting their future performance. One key limitation of most existing knowledge tracing methods is that they treat student responses to questions as binary-valued, i.e., whether they are correct or incorrect. Response correctness analysis/prediction ignores important information on student knowledge contained in the exact content of the responses, especially for open-ended questions. In this paper, we conduct the first exploration into open-ended knowledge tracing (OKT) by studying the new task of predicting students' exact open-ended responses to questions. Our work is grounded in the domain of computer science education with programming questions. We develop an initial solution to the OKT problem, a student knowledge-guided code generation approach, which combines program synthesis methods using language models with student knowledge tracing methods. We also conduct a series of quantitative and qualitative experiments on a real-world student code dataset to validate OKT and demonstrate its promise in educational applications.
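To make the co-change pair mentioned in the CUP entry concrete, here is a rough illustration of what one training example might look like. The serialization below is an assumption for illustration, not CUP's actual input format: the code edit is linearized as a token-level diff and concatenated with the old comment, and the target is the developer-updated comment.

```python
# Hypothetical CUP-style training pair: (code edit + old comment) -> new comment.
# The <SEP> convention and the token-level diff serialization are assumptions.
import difflib

old_code = "int sum(int a, int b)"
new_code = "long sum(long a, long b)"
old_comment = "Returns the int sum of a and b."
new_comment = "Returns the long sum of a and b."  # what the developer wrote

edit = " ".join(difflib.unified_diff(old_code.split(), new_code.split(), lineterm=""))
source = f"{edit} <SEP> {old_comment}"  # model input
target = new_comment                    # model output
print(source, "=>", target)
```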
diff --git a/_publications/liu2023code.markdown b/_publications/liu2023code.markdown new file mode 100644 index 00000000..2009fd2d --- /dev/null +++ b/_publications/liu2023code.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Code Execution with Pre-trained Language Models" +authors: Chenxiao Liu, Shuai Lu, Weizhu Chen, Daxin Jiang, Alexey Svyatkovskiy, Shengyu Fu, Neel Sundaresan, Nan Duan +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2305.05383"} +tags: ["Transformer", "execution"] +--- +Code execution is a fundamental aspect of programming language semantics that reflects the exact behavior of the code. However, most pre-trained models for code intelligence ignore the execution trace and only rely on source code and syntactic structures. In this paper, we investigate how well pre-trained models can understand and perform code execution. We develop a mutation-based data augmentation technique to create a large-scale and realistic Python dataset and task for code execution, which challenges existing models such as Codex. We then present CodeExecutor, a Transformer model that leverages code execution pre-training and curriculum learning to enhance its semantic comprehension. We evaluate CodeExecutor on code execution and show its promising performance and limitations. We also demonstrate its potential benefits for code intelligence tasks such as zero-shot code-to-code search and text-to-code generation. Our analysis provides insights into the learning and generalization abilities of pre-trained models for code execution. diff --git a/_publications/lomshakov2023fine.markdown b/_publications/lomshakov2023fine.markdown new file mode 100644 index 00000000..b38a2ff2 --- /dev/null +++ b/_publications/lomshakov2023fine.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: Fine-Tuning Large Language Models for Answering Programming Questions with Code Snippets +authors: V. Lomshakov, S. Kovalchuk, M. Omelchenko, S. Nikolenko, A. Aliev +conference: ICCS +year: 2023 +additional_links: + - {name: "LNCS", url: "https://link.springer.com/chapter/10.1007/978-3-031-36021-3_15"} + - {name: "Papers with Code ", url: "https://paperswithcode.com/paper/fine-tuning-large-language-models-for"} +tags: ["program synthesis", "question answering", "large language models"] +--- +We study the ability of pretrained large language models (LLM) to answer questions from online question answering fora such as Stack Overflow. We consider question-answer pairs where the main part of the answer consists of source code. On two benchmark datasets — CoNaLa and a newly collected dataset based on Stack Overflow — we investigate how a closed-book question answering system can be improved by fine-tuning the LLM for the downstream task, prompt engineering, and data preprocessing. We use publicly available autoregressive language models such as GPT-Neo, CodeGen, and PanGu-Coder, and after the proposed fine-tuning achieve a BLEU score of 0.4432 on the CoNaLa test set, significantly exceeding previous state of the art for this task. \ No newline at end of file diff --git a/_publications/louis2018deep.markdown b/_publications/louis2018deep.markdown index b5278ddb..3c92a2c3 100644 --- a/_publications/louis2018deep.markdown +++ b/_publications/louis2018deep.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Deep Learning to Detect Redundant Method Comments" -authors: A. Louis, S. K. Dash, E. T. Barr, C. 
Sutton +authors: Annie Louis, Santanu Kumar Dash, Earl T. Barr, Charles Sutton conference: year: 2018 -bibkey: louis2018deep additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1806.04616"} tags: ["bimodal", "documentation"] diff --git a/_publications/louis2020where.markdown b/_publications/louis2020where.markdown index 02e539b8..cad2c83a 100644 --- a/_publications/louis2020where.markdown +++ b/_publications/louis2020where.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Where should I comment my code? A dataset and model for predicting locations that need comments" -authors: A. Louis, S.K. Dash, E.T. Barr, M.D. Ernst, and C. Sutton +authors: Annie Louis, Santanu Kumar Dash, Earl T. Barr, Michael D. Ernst, Charles Sutton conference: International Conference on Software Engineering (ICSE; NIER track) year: 2020 -bibkey: louis2020where additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1806.04616"} - {name: "Data", url: "http://groups.inf.ed.ac.uk/cup/comment-locator"} diff --git a/_publications/loyola2017neural.markdown b/_publications/loyola2017neural.markdown index 0166a1dd..4db50bf2 100644 --- a/_publications/loyola2017neural.markdown +++ b/_publications/loyola2017neural.markdown @@ -1,10 +1,10 @@ --- layout: publication title: "A Neural Architecture for Generating Natural Language Descriptions from Source Code Changes" -authors: P. Loyola, E. Marrese-Taylor, Y. Matsuo -conference: ArXiV 1704.04856 +authors: Pablo Loyola, Edison Marrese-Taylor, Yutaka Matsuo year: 2017 -bibkey: loyola2017neural +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1704.04856"} tags: ["edit", "summarization"] --- We propose a model to automatically describe changes introduced in the source code of a program using natural language. Our method receives as input a set of code commits, which contains both the modifications and message introduced by a user. These two modalities are used to train an encoder-decoder architecture. We evaluated our approach on twelve real world open source projects from four different programming languages. Quantitative and qualitative results showed that the proposed approach can generate feasible and semantically sound descriptions not only in standard in-project settings, but also in a cross-project setting. diff --git a/_publications/loyola2018content.markdown b/_publications/loyola2018content.markdown index c1748420..f2dc4412 100644 --- a/_publications/loyola2018content.markdown +++ b/_publications/loyola2018content.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Content Aware Source Code Change Description Generation" -authors: P. Loyola, E. Marrese-Taylor, J.A. Balazs, Y. Matsuo, F. Satoh +authors: Pablo Loyola, Edison Marrese-Taylor, Jorge Balazs, Yutaka Matsuo, Fumiko Satoh conference: International Natural Language Generation Conference year: 2018 -bibkey: loyola2018content tags: ["edit", "summarization"] --- We propose to study the generation of descriptions from source code changes by integrating the messages included on code diff --git a/_publications/lu2019program.markdown b/_publications/lu2019program.markdown index 11e30e6d..09402580 100644 --- a/_publications/lu2019program.markdown +++ b/_publications/lu2019program.markdown @@ -1,10 +1,11 @@ --- layout: publication title: "Program Classification Using Gated Graph Attention Neural Network for Online Programming Service" -authors: M. Lu, D. Tan, N. Xiong, Z. Chen, H.
Li +authors: Mingming Lu, Dingwu Tan, Naixue Xiong, Zailiang Chen, Haifeng Li conference: year: 2019 -bibkey: lu2019program +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1903.03804"} tags: ["GNN", "representation"] --- The online programming services, such as GitHub, TopCoder, and EduCoder, have promoted a lot of social interaction among the service users. However, the existing social interactions are rather limited and inefficient due to the rapid increase of source-code repositories, which are difficult to explore manually. The emergence of source-code mining provides a promising way to analyze those source codes, so that those source codes can be relatively easy to understand and share among those service users. Among all the source-code mining attempts, program classification lays a foundation for various tasks related to source-code understanding, because it is impossible for a machine to understand a computer program if it cannot classify the program correctly. Although numerous machine learning models, such as the Natural Language Processing (NLP) based models and the Abstract Syntax Tree (AST) based models, have been proposed to classify computer programs based on their corresponding source codes, the existing works cannot fully characterize the source codes from the perspective of both the syntax and semantic information. To address this problem, we proposed a Graph Neural Network (GNN) based model, which integrates data flow and function call information to the AST, and applies an improved GNN model to the integrated graph, so as to achieve state-of-the-art program classification accuracy. The experimental results show that the proposed work can classify programs with accuracy over 97%. \ No newline at end of file diff --git a/_publications/lu2021codexglue.markdown b/_publications/lu2021codexglue.markdown new file mode 100644 index 00000000..bb852c47 --- /dev/null +++ b/_publications/lu2021codexglue.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation" +authors: Shuai Lu, Daya Guo, Shuo Ren, Junjie Huang, Alexey Svyatkovskiy, Ambrosio Blanco, Colin Clement, Dawn Drain, Daxin Jiang, Duyu Tang, Ge Li, Lidong Zhou, Linjun Shou, Long Zhou, Michele Tufano, Ming Gong, Ming Zhou, Nan Duan, Neel Sundaresan, Shao Kun Deng, Shengyu Fu, Shujie Liu +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2102.04664"} +tags: ["benchmark", "Transformer"] +--- +Benchmark datasets have a significant impact on accelerating research in programming language tasks. In this paper, we introduce CodeXGLUE, a benchmark dataset to foster machine learning research for program understanding and generation. CodeXGLUE includes a collection of 10 tasks across 14 datasets and a platform for model evaluation and comparison. CodeXGLUE also features three baseline systems, including the BERT-style, GPT-style, and Encoder-Decoder models, to make it easy for researchers to use the platform. The availability of such data and baselines can help the development and validation of new methods that can be applied to various program understanding and generation problems.
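For readers who want to experiment with CodeXGLUE (above), its tasks are mirrored as Hugging Face datasets. A minimal sketch follows, assuming the `datasets` package and the `code_x_glue_ct_code_to_text` mirror; the dataset id, config name, and field names are assumptions that should be verified against the hub.

```python
# Hypothetical: load the CodeXGLUE code-to-text task from its HF hub mirror.
# Requires `pip install datasets`; ids and fields should be checked on the hub.
from datasets import load_dataset

ds = load_dataset("code_x_glue_ct_code_to_text", "python", split="train")
example = ds[0]
print(example["code"][:80])       # a Python function
print(example["docstring"][:80])  # its natural-language summary
```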
diff --git a/_publications/lu2022reacc.markdown b/_publications/lu2022reacc.markdown new file mode 100644 index 00000000..06cc08e5 --- /dev/null +++ b/_publications/lu2022reacc.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "ReACC: A Retrieval-Augmented Code Completion Framework" +authors: Shuai Lu, Nan Duan, Hojae Han, Daya Guo, Seung-won Hwang, Alexey Svyatkovskiy +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2203.07722"} +tags: ["Transformer", "autocomplete"] +--- +Code completion, which aims to predict the following code token(s) according to the code context, can improve the productivity of software development. Recent work has proved that statistical language modeling with transformers can greatly improve the performance in the code completion task via learning from large-scale source code datasets. However, current approaches focus only on code context within the file or project, i.e. internal context. Our distinction is utilizing "external" context, inspired by human behaviors of copying from the related code snippets when writing code. Specifically, we propose a retrieval-augmented code completion framework, leveraging both lexical copying and referring to code with similar semantics by retrieval. We adopt a stage-wise training approach that combines a source code retriever and an auto-regressive language model for programming language. We evaluate our approach in the code completion task in Python and Java programming languages, achieving a state-of-the-art performance on the CodeXGLUE benchmark. diff --git a/_publications/luan2019aroma.markdown b/_publications/luan2019aroma.markdown new file mode 100644 index 00000000..ec4eeb4b --- /dev/null +++ b/_publications/luan2019aroma.markdown @@ -0,0 +1,9 @@ +--- +layout: publication +title: "Aroma: code recommendation via structural code search" +authors: Sifei Luan, Di Yang, Celeste Barnaby, Koushik Sen, Satish Chandra +conference: PACMPL +year: 2019 +tags: ["search"] +--- +Programmers often write code that has similarity to existing code written somewhere. A tool that could help programmers to search such similar code would be immensely useful. Such a tool could help programmers to extend partially written code snippets to completely implement necessary functionality, help to discover extensions to the partial code which are commonly included by other programmers, help to cross-check against similar code written by other programmers, or help to add extra code which would fix common mistakes and errors. We propose Aroma, a tool and technique for code recommendation via structural code search. Aroma indexes a huge code corpus including thousands of open-source projects, takes a partial code snippet as input, searches the corpus for method bodies containing the partial code snippet, and clusters and intersects the results of the search to recommend a small set of succinct code snippets which both contain the query snippet and appear as part of several methods in the corpus. We evaluated Aroma on 2000 randomly selected queries created from the corpus, as well as 64 queries derived from code snippets obtained from Stack Overflow, a popular website for discussing code. We implemented Aroma for 4 different languages, and developed an IDE plugin for Aroma. Furthermore, we conducted a study where we asked 12 programmers to complete programming tasks using Aroma, and collected their feedback.
Our results indicate that Aroma is capable of retrieving and recommending relevant code snippets efficiently. diff --git a/_publications/maddison2014structured.markdown b/_publications/maddison2014structured.markdown index bac89749..f875d449 100644 --- a/_publications/maddison2014structured.markdown +++ b/_publications/maddison2014structured.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "Structured Generative Models of Natural Source Code" -authors: C.J. Maddison, D. Tarlow +authors: Chris J. Maddison, Daniel Tarlow conference: ICML year: 2014 -bibkey: maddison2014structured -tags: ["language model", "generation", "grammar", "AST"] +tags: ["language model", "code generation", "grammar"] --- We study the problem of building generative models of natural source code (NSC); that is, diff --git a/_publications/mahmud2021code.markdown b/_publications/mahmud2021code.markdown new file mode 100644 index 00000000..f364f7b1 --- /dev/null +++ b/_publications/mahmud2021code.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Code to Comment Translation: A Comparative Study on Model Effectiveness & Errors" +authors: Junayed Mahmud, Fahim Faisal, Raihan Islam Arnob, Antonios Anastasopoulos, Kevin Moran +conference: NLP4Prog +year: 2021 +additional_links: + - {name: "PDF", url: "https://aclanthology.org/2021.nlp4prog-1.1.pdf"} +tags: ["survey", "summarization", "Transformer"] +--- +Automated source code summarization is a popular software engineering research topic wherein machine translation models are employed to “translate” code snippets into relevant natural language descriptions. Most evaluations of such models are conducted using automatic reference-based metrics. However, given the relatively large semantic gap between programming languages and natural language, we argue that this line of research would benefit from a qualitative investigation into the various error modes of current state-of-the-art models. Therefore, in this work, we perform both a quantitative and qualitative comparison of three recently proposed source code summarization models. In our quantitative evaluation, we compare the models based on the smoothed BLEU-4, METEOR, and ROUGE-L machine translation metrics, and in our qualitative evaluation, we perform a manual open-coding of the most common errors committed by the models when compared to ground truth captions. Our investigation reveals new insights into the relationship between metric-based performance and model prediction errors grounded in an error taxonomy that can be used to drive future research efforts. diff --git a/_publications/malik2019nl2type.markdown b/_publications/malik2019nl2type.markdown index 234f011f..27dd35ef 100644 --- a/_publications/malik2019nl2type.markdown +++ b/_publications/malik2019nl2type.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "NL2Type: Inferring JavaScript Function Types from Natural Language Information" -authors: R.S. Malik, J. Patra, M.
Pradel +authors: Rabee Sohail Malik, Jibesh Patra, Michael Pradel conference: ICSE year: 2019 -bibkey: malik2019nl2type tags: ["bimodal", "types"] --- JavaScript is dynamically typed and hence lacks the type safety of statically typed languages, diff --git a/_publications/mammadli2020static.markdown b/_publications/mammadli2020static.markdown new file mode 100644 index 00000000..aba4d2f9 --- /dev/null +++ b/_publications/mammadli2020static.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Static Neural Compiler Optimization via Deep Reinforcement Learning" +authors: Rahim Mammadli, Ali Jannesari, Felix Wolf +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2008.08951"} +tags: ["compilation"] +--- +The phase-ordering problem of modern compilers has received a lot of attention from the research community over the years, yet remains largely unsolved. Various optimization sequences exposed to the user are manually designed by compiler developers. In designing such a sequence developers have to choose the set of optimization passes, their parameters and ordering within a sequence. Resulting sequences usually fall short of achieving optimal runtime for a given source code and may sometimes even degrade the performance when compared to the unoptimized version. In this paper, we employ a deep reinforcement learning approach to the phase-ordering problem. Provided with sub-sequences constituting LLVM's O3 sequence, our agent learns to outperform the O3 sequence on the set of source codes used for training and achieves competitive performance on the validation set, gaining up to 1.32x speedup on previously-unseen programs. Notably, our approach differs from autotuning methods by not depending on one or more test runs of the program for making successful optimization decisions. It has no dependence on any dynamic feature, but only on the statically-attainable intermediate representation of the source code. We believe that the models trained using our approach can be integrated into modern compilers as neural optimization agents, at first to complement, and eventually replace the hand-crafted optimization sequences. diff --git a/_publications/mangal2015user.markdown b/_publications/mangal2015user.markdown index 5895108d..5f84f066 100644 --- a/_publications/mangal2015user.markdown +++ b/_publications/mangal2015user.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "A User-Guided Approach to Program Analysis" -authors: R. Mangal, X. Zhang, A. V. Nori, M. Naik +authors: Ravi Mangal, Xin Zhang, Aditya V. Nori, Mayur Naik conference: FSE year: 2015 -bibkey: mangal2015user tags: ["program analysis"] --- Program analysis tools often produce undesirable output diff --git a/_publications/markovtsev2017topic.markdown b/_publications/markovtsev2017topic.markdown index 5d7902fe..50734201 100644 --- a/_publications/markovtsev2017topic.markdown +++ b/_publications/markovtsev2017topic.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Topic modeling of public repositories at scale using names in source code" -authors: V. Markovtsev, E.
Kant -conference: ArXiV 1704.00135 +authors: Vadim Markovtsev, Eiso Kant +conference: year: 2017 -bibkey: markovtsev2017topic additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1704.00135"} - {name: "website", url: "https://blog.sourced.tech/post/github_topic_modeling"} diff --git a/_publications/markovtsev2018public.markdown b/_publications/markovtsev2018public.markdown index 3ddbf914..12e55d1c 100644 --- a/_publications/markovtsev2018public.markdown +++ b/_publications/markovtsev2018public.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Public Git Archive: a Big Code dataset for all" -authors: V. Markovtsev, W. Long +authors: Vadim Markovtsev, Waren Long conference: MSR year: 2018 -bibkey: markovtsev2018public additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1803.10144"} - {name: "GitHub", url: "https://github.com/src-d/datasets/tree/master/PublicGitArchive"} diff --git a/_publications/markovtsev2019style.markdown b/_publications/markovtsev2019style.markdown index 6a9af904..8b890c96 100644 --- a/_publications/markovtsev2019style.markdown +++ b/_publications/markovtsev2019style.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "STYLE-ANALYZER: fixing code style inconsistencies with interpretable unsupervised algorithms" -authors: V. Markovtsev, W. Long, H. Mougard, K. Slavnov, E. Bulychev +authors: Vadim Markovtsev, Waren Long, Hugo Mougard, Konstantin Slavnov, Egor Bulychev conference: MSR year: 2019 -bibkey: markovtsev2019style additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1904.00935"} tags: ["style"] diff --git a/_publications/mastropaolo2022using.markdown b/_publications/mastropaolo2022using.markdown new file mode 100644 index 00000000..630b56ac --- /dev/null +++ b/_publications/mastropaolo2022using.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Using Deep Learning to Generate Complete Log Statements" +authors: Antonio Mastropaolo, Luca Pascarella, Gabriele Bavota +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2201.04837"} +tags: ["Transformer", "logging"] +--- +Logging is a practice widely adopted in several phases of the software lifecycle. For example, during software development log statements allow engineers to verify and debug the system by exposing fine-grained information of the running software. While the benefits of logging are undisputed, taking proper decisions about where to inject log statements, what information to log, and at which log level (e.g., error, warning) is crucial for the logging effectiveness. In this paper, we present LANCE (Log stAtemeNt reCommEnder), the first approach supporting developers in all these decisions. LANCE features a Text-To-Text-Transfer-Transformer (T5) model that has been trained on 6,894,456 Java methods. LANCE takes as input a Java method and injects in it a full log statement, including a human-comprehensible logging message and properly choosing the needed log level and the statement location. Our results show that LANCE is able to (i) properly identify the location in the code where to inject the statement in 65.9% of Java methods requiring it; (ii) select the proper log level in 66.2% of cases; and (iii) generate a completely correct log statement including a meaningful logging message in 15.2% of cases.
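LANCE (above) casts log-statement injection as text-to-text generation with T5. The snippet below sketches only that framing: the generic `t5-small` checkpoint is a stand-in, and the `inject log:` prompt is an assumption, so the output is not meaningful until a model is actually fine-tuned for the task.

```python
# Sketch of LANCE's text-to-text framing: method without log -> method with log.
# `t5-small` is an untrained stand-in, not the LANCE model.
# Requires `pip install transformers sentencepiece torch`.
from transformers import T5ForConditionalGeneration, T5Tokenizer

tok = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

method = "public void save(File f) { writer.write(f); }"
input_ids = tok("inject log: " + method, return_tensors="pt").input_ids
output = model.generate(input_ids, max_length=64)
print(tok.decode(output[0], skip_special_tokens=True))
```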
diff --git a/_publications/mehrotra2020modeling.markdown b/_publications/mehrotra2020modeling.markdown new file mode 100644 index 00000000..5e5def39 --- /dev/null +++ b/_publications/mehrotra2020modeling.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "Modeling Functional Similarity in Source Code with Graph-Based Siamese Networks" +authors: Nikita Mehrotra, Navdha Agarwal, Piyush Gupta, Saket Anand, David Lo, Rahul Purandare +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2011.11228"} +tags: ["clone", "GNN"] +--- +Code clones are duplicate code fragments that share (nearly) similar syntax or semantics. Code clone detection plays an important role in software maintenance, code refactoring, and reuse. A substantial amount of research has been conducted in the past to detect clones. A majority of these approaches use lexical and syntactic information to detect clones. However, only a few of them target semantic clones. Recently, motivated by the success of deep learning models in other fields, including natural language processing and computer vision, researchers have attempted to adopt deep learning techniques to detect code clones. These approaches use lexical information (tokens) and(or) syntactic structures like abstract syntax trees (ASTs) to detect code clones. However, they do not make sufficient use of the available structural and semantic information hence, limiting their capabilities. + +This paper addresses the problem of semantic code clone detection using program dependency graphs and geometric neural networks, leveraging the structured syntactic and semantic information. We have developed a prototype tool HOLMES, based on our novel approach, and empirically evaluated it on popular code clone benchmarks. Our results show that HOLMES performs considerably better than the other state-of-the-art tool, TBCCD. We also evaluated HOLMES on unseen projects and performed cross dataset experiments to assess the generalizability of HOLMES. Our results affirm that HOLMES outperforms TBCCD since most of the pairs that HOLMES detected were either undetected or suboptimally reported by TBCCD. diff --git a/_publications/menon2013machine.markdown b/_publications/menon2013machine.markdown index 90d1a6ab..806a4cdf 100644 --- a/_publications/menon2013machine.markdown +++ b/_publications/menon2013machine.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "A Machine Learning Framework for Programming by Example" -authors: A. K. Menon, O. Tamuz, S. Gulwani, B. Lampson, A.T. Kalai +authors: Aditya Menon, Omer Tamuz, Sumit Gulwani, Butler Lampson, Adam Kalai conference: ICML year: 2013 -bibkey: menon2013machine -tags: ["generation"] +tags: ["code generation"] --- Learning programs is a timely and interesting challenge. In Programming by Example (PBE), a system attempts to infer a program diff --git a/_publications/mesbah2019deepdelta.markdown b/_publications/mesbah2019deepdelta.markdown index b8f78c51..fd572924 100644 --- a/_publications/mesbah2019deepdelta.markdown +++ b/_publications/mesbah2019deepdelta.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "DeepDelta: Learning to Repair Compilation Errors" -authors: A. Mesbah, A. Rice, E. Johnstin, N. Glorioso +authors: Ali Mesbah, Andrew Rice, Emily Johnston, Nick Glorioso, Edward Aftandilian. 
conference: year: 2019 -bibkey: mesbah2019deepdelta tags: ["repair", "edit", "compilation"] --- Programmers spend a substantial amount of time manually repairing diff --git a/_publications/mir2021manytypes4py.markdown b/_publications/mir2021manytypes4py.markdown new file mode 100644 index 00000000..539f9985 --- /dev/null +++ b/_publications/mir2021manytypes4py.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "ManyTypes4Py: A Benchmark Python Dataset for Machine Learning-based Type Inference" +authors: Amir M. Mir, Evaldas Latoskinas, Georgios Gousios +conference: MSR +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2104.04706"} + - {name: "Dataset", url: "https://zenodo.org/record/4479714"} +tags: ["dataset", "types"] +--- +In this paper, we present ManyTypes4Py, a large Python dataset for machine learning (ML)-based type inference. The dataset contains a total of 5,382 Python projects with more than 869K type annotations. Duplicate source code files were removed to eliminate the negative effect of the duplication bias. To facilitate training and evaluation of ML models, the dataset was split into training, validation and test sets by files. To extract type information from abstract syntax trees (ASTs), a lightweight static analyzer pipeline is developed and accompanied with the dataset. Using this pipeline, the collected Python projects were analyzed and the results of the AST analysis were stored in JSON-formatted files. The ManyTypes4Py dataset is shared on zenodo and its tools are publicly available on GitHub. diff --git a/_publications/mir2021type4py.markdown b/_publications/mir2021type4py.markdown new file mode 100644 index 00000000..fb8922a4 --- /dev/null +++ b/_publications/mir2021type4py.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Type4Py: Deep Similarity Learning-Based Type Inference for Python" +authors: Amir M. Mir, Evaldas Latoskinas, Sebastian Proksch, Georgios Gousios +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2101.04470"} + - {name: "GitHub", url: "https://github.com/saltudelft/type4py"} +tags: ["types"] +--- +Dynamic languages, such as Python and JavaScript, trade static typing for developer flexibility. While this allegedly enables greater productivity, lack of static typing can cause runtime exceptions, type inconsistencies, and is a major factor for weak IDE support. To alleviate these issues, PEP 484 introduced optional type annotations for Python. As retrofitting types to existing codebases is error-prone and laborious, learning-based approaches have been proposed to enable automatic type annotations based on existing, partially annotated codebases. However, the prediction of rare and user-defined types is still challenging. In this paper, we present Type4Py, a deep similarity learning-based type inference model for Python. We design a hierarchical neural network model that learns to discriminate between types of the same kind and dissimilar types in a high-dimensional space, which results in clusters of types. Nearest neighbor search suggests likely type signatures of given Python functions. The types visible to analyzed modules are surfaced using lightweight dependency analysis. The results of quantitative and qualitative evaluation indicate that Type4Py significantly outperforms state-of-the-art approaches at the type prediction task.
Considering the Top-1 prediction, Type4Py obtains 19.33% and 13.49% higher precision than Typilus and TypeWriter, respectively, while utilizing a much bigger vocabulary. diff --git a/_publications/mohajer2023skipanalyzer.markdown b/_publications/mohajer2023skipanalyzer.markdown new file mode 100644 index 00000000..cbf424e7 --- /dev/null +++ b/_publications/mohajer2023skipanalyzer.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "SkipAnalyzer: A Tool for Static Code Analysis with Large Language Models" +authors: Mohammad Mahdi Mohajer, Reem Aleithan, Nima Shiri Harzevili, Moshi Wei, Alvine Boaye Belle, Hung Viet Pham, Song Wang +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2310.18532"} +tags: ["repair"] +--- +We introduce SkipAnalyzer, a large language model (LLM)-powered tool for static code analysis. SkipAnalyzer has three components: 1) an LLM-based static bug detector that scans source code and reports specific types of bugs, 2) an LLM-based false-positive filter that can identify false-positive bugs in the results of static bug detectors (e.g., the result of step 1) to improve detection accuracy, and 3) an LLM-based patch generator that can generate patches for the detected bugs above. As a proof-of-concept, SkipAnalyzer is built on ChatGPT, which has exhibited outstanding performance in various software engineering tasks. To evaluate SkipAnalyzer, we focus on two types of typical and critical bugs that are targeted by static bug detection, i.e., Null Dereference and Resource Leak as subjects. We employ Infer to aid the gathering of these two bug types from 10 open-source projects. Consequently, our experiment dataset contains 222 instances of Null Dereference bugs and 46 instances of Resource Leak bugs. Our study demonstrates that SkipAnalyzer achieves remarkable performance in the mentioned static analysis tasks, including bug detection, false-positive warning removal, and bug repair. In static bug detection, SkipAnalyzer achieves accuracy values of up to 68.37% for detecting Null Dereference bugs and 76.95% for detecting Resource Leak bugs, improving the precision of the current leading bug detector, Infer, by 12.86% and 43.13%, respectively. For removing false-positive warnings, SkipAnalyzer can reach a precision of up to 93.88% for Null Dereference bugs and 63.33% for Resource Leak bugs. Additionally, SkipAnalyzer surpasses state-of-the-art false-positive warning removal tools. Furthermore, in bug repair, SkipAnalyzer can generate syntactically correct patches to fix its detected bugs with a success rate of up to 97.30%. diff --git a/_publications/monperrus2021megadiff.markdown b/_publications/monperrus2021megadiff.markdown new file mode 100644 index 00000000..9a3bb4d9 --- /dev/null +++ b/_publications/monperrus2021megadiff.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Megadiff: A Dataset of 600k Java Source Code Changes Categorized by Diff Size" +authors: Martin Monperrus, Matias Martinez, He Ye, Fernanda Madeiral, Thomas Durieux, Zhongxing Yu +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2108.04631"} + - {name: "Dataset", url: "https://zenodo.org/record/5013515"} +tags: ["dataset", "edit"] +--- +This paper presents Megadiff, a dataset of source code diffs. It focuses on Java, with strict inclusion criteria based on commit message and diff size. 
Megadiff contains 663 029 Java diffs that can be used for research on commit comprehension, fault localization, automated program repair, and machine learning on code changes. diff --git a/_publications/mou2014building.markdown b/_publications/mou2014building.markdown index 81ec0c42..b29e9093 100644 --- a/_publications/mou2014building.markdown +++ b/_publications/mou2014building.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "Building Program Vector Representations for Deep Learning" -authors: L. Mou, G. Li, Y. Liu, H. Peng, Z. Jin, Y. Xu, L. Zhang +authors: Hao Peng, Lili Mou, Ge Li, Yuxuan Liu, Lu Zhang, Zhi Jin. conference: International Conference on Knowledge Science, Engineering and Management year: 2014 -bibkey: mou2014building -tags: ["representation", "AST"] +tags: ["representation", "grammar"] --- Deep learning has made significant breakthroughs in various fields of artificial intelligence. Advantages of deep diff --git a/_publications/mou2016convolutional.markdown b/_publications/mou2016convolutional.markdown index c36878d8..5df86b1a 100644 --- a/_publications/mou2016convolutional.markdown +++ b/_publications/mou2016convolutional.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "Convolutional Neural Networks over Tree Structures for Programming Language Processing" -authors: L. Mou, G. Li, L. Zhang, T. Wang, Z. Jin +authors: Lili Mou, Ge Li, Lu Zhang, Tao Wang, Zhi Jin conference: AAAI year: 2016 -bibkey: mou2016convolutional -tags: ["representation", "AST"] +tags: ["representation", "grammar"] --- Programming language processing (similar to natural language processing) is a hot research topic in the field of software engineering; it has also aroused growing interest in the artificial intelligence community. However, different from a diff --git a/_publications/movshovitz2013natural.markdown b/_publications/movshovitz2013natural.markdown index 0b4eab5b..1e734eec 100644 --- a/_publications/movshovitz2013natural.markdown +++ b/_publications/movshovitz2013natural.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Natural Language Models for Predicting Programming Comments" -authors: D. Movshovitz-Attias, W.W. Cohen +authors: Dana Movshovitz-Attias, William W. Cohen conference: ACL year: 2013 -bibkey: movshovitz2013natural tags: ["bimodal", "documentation", "summarization"] --- Statistical language models have successfully been used to describe and analyze diff --git a/_publications/movshovitz2015kb.markdown b/_publications/movshovitz2015kb.markdown index caa5a2b1..de0926b2 100644 --- a/_publications/movshovitz2015kb.markdown +++ b/_publications/movshovitz2015kb.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "KB-LDA: Jointly Learning a Knowledge Base of Hierarchy, Relations, and Facts" -authors: D. Movshovitz-Attias, W. W. Cohen +authors: Dana Movshovitz-Attias, William W. 
Cohen conference: ACL year: 2015 -bibkey: movshovitz2015kb tags: ["pattern mining"] --- Many existing knowledge bases (KBs), including Freebase, Yago, and NELL, rely diff --git a/_publications/muennighoff2023octopack.markdown b/_publications/muennighoff2023octopack.markdown new file mode 100644 index 00000000..718e7c30 --- /dev/null +++ b/_publications/muennighoff2023octopack.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "OctoPack: Instruction Tuning Code Large Language Models" +authors: Niklas Muennighoff, Qian Liu, Armel Zebaze, Qinkai Zheng, Binyuan Hui, Terry Yue Zhuo, Swayam Singh, Xiangru Tang, Leandro von Werra, Shayne Longpre +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2308.07124"} +tags: ["dataset", "instruction tuning"] +--- +Finetuning large language models (LLMs) on instructions leads to vast performance improvements on natural language tasks. We apply instruction tuning using code, leveraging the natural structure of Git commits, which pair code changes with human instructions. We compile CommitPack: 4 terabytes of Git commits across 350 programming languages. We benchmark CommitPack against other natural and synthetic code instructions (xP3x, Self-Instruct, OASST) on the 16B parameter StarCoder model, and achieve state-of-the-art performance among models not trained on OpenAI outputs, on the HumanEval Python benchmark (46.2% pass@1). We further introduce HumanEvalPack, expanding the HumanEval benchmark to a total of 3 coding tasks (Code Repair, Code Explanation, Code Synthesis) across 6 languages (Python, JavaScript, Java, Go, C++, Rust). Our models, OctoCoder and OctoGeeX, achieve the best performance across HumanEvalPack among all permissive models, demonstrating CommitPack's benefits in generalizing to a wider set of languages and natural coding tasks. Code, models and data are freely available at https://github.com/bigcode-project/octopack. diff --git a/_publications/mukherjee2020searching.markdown b/_publications/mukherjee2020searching.markdown index c7bd98f1..7ee9d482 100644 --- a/_publications/mukherjee2020searching.markdown +++ b/_publications/mukherjee2020searching.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Searching a Database of Source Codes Using Contextualized Code Search" -authors: R. Mukherjee, S. Chaudhuri, C. Jermaine +authors: Rohan Mukherjee, Swarat Chaudhuri, Chris Jermaine conference: year: 2020 -bibkey: mukherjee2020searching additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2001.03277"} tags: ["search", "representation"] diff --git a/_publications/mukherjee2021neural.markdown b/_publications/mukherjee2021neural.markdown new file mode 100644 index 00000000..a3e07641 --- /dev/null +++ b/_publications/mukherjee2021neural.markdown @@ -0,0 +1,22 @@ +--- +layout: publication +title: "Neural Program Generation Modulo Static Analysis" +authors: Rohan Mukherjee, Yeming Wen, Dipak Chaudhari, Thomas W. Reps, Swarat Chaudhuri, Chris Jermaine +conference: NeurIPS +year: 2021 +additional_links: + - {name: "Preprint", url: "https://www.cs.utexas.edu/~swarat/pubs/neurips21-nsg.pdf"} +tags: ["synthesis", "language model"] +--- +State-of-the-art neural models of source code tend to be evaluated on the generation +of individual expressions and lines of code, and commonly fail on long-horizon +tasks such as the generation of entire method bodies. We propose to address this +deficiency using weak supervision from a static program analyzer. 
Our neurosymbolic method allows a deep generative model to symbolically compute, using calls +to a static-analysis tool, long-distance semantic relationships in the code that it +has already generated. During training, the model observes these relationships +and learns to generate programs conditioned on them. We apply our approach to +the problem of generating entire Java methods given the remainder of the class +that contains the method. Our experiments show that the approach substantially +outperforms state-of-the-art transformers and a model that explicitly tries to learn +program semantics on this task, both in terms of producing programs free of basic +semantic errors and in terms of syntactically matching the ground truth. diff --git a/_publications/murali2017bayesian.markdown b/_publications/murali2017bayesian.markdown index 6e077221..29100b79 100644 --- a/_publications/murali2017bayesian.markdown +++ b/_publications/murali2017bayesian.markdown @@ -1,11 +1,12 @@ --- layout: publication title: "Bayesian Sketch Learning for Program Synthesis" -authors: V. Murali, S. Chaudhuri, C. Jermaine -conference: arXiv preprint 1703.05698 -year: 2017 -bibkey: murali2017bayesian -tags: ["generation", "API"] +authors: Vijayaraghavan Murali, Letao Qi, Swarat Chaudhuri, Chris Jermaine +conference: ICLR +year: 2018 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1703.05698"} +tags: ["code generation", "API"] --- We present a Bayesian statistical approach to the problem of automatic program synthesis. Our synthesizer starts by learning, offline and from an existing corpus, a probabilistic model of real-world programs. During synthesis, diff --git a/_publications/murali2017finding.markdown b/_publications/murali2017finding.markdown index 367cd9e5..9b0e9a55 100644 --- a/_publications/murali2017finding.markdown +++ b/_publications/murali2017finding.markdown @@ -1,10 +1,11 @@ --- layout: publication title: "Finding Likely Errors with Bayesian Specifications" -authors: V. Murali, S. Chaudhuri, C. Jermaine -conference: arXiv preprint 1703.01370 +authors: Vijayaraghavan Murali, Swarat Chaudhuri, Chris Jermaine +conference: +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1703.01370"} year: 2017 -bibkey: murali2017finding tags: ["program analysis", "API"] --- We present a Bayesian framework for learning probabilistic specifications from large, unstructured code corpora, and diff --git a/_publications/nadeem2022codedsi.markdown b/_publications/nadeem2022codedsi.markdown new file mode 100644 index 00000000..224c2e8b --- /dev/null +++ b/_publications/nadeem2022codedsi.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "CodeDSI: Differentiable Code Search" +authors: Usama Nadeem, Noah Ziems, Shaoen Wu +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2210.00328"} +tags: ["search"] +--- +Reimplementing solutions to previously solved software engineering problems is not only inefficient but also introduces inadequate and error-prone code. Many existing methods achieve impressive performance on this issue by using autoregressive text-generation models trained on code. However, these methods are not without their flaws. The generated code from these models can be buggy, lack documentation, and introduce vulnerabilities that may go unnoticed by developers. 
An alternative to code generation -- neural code search -- is a field of machine learning where a model takes natural language queries as input and, in turn, relevant code samples from a database are returned. Due to the nature of this pre-existing database, code samples can be documented, tested, licensed, and checked for vulnerabilities before being used by developers in production. In this work, we present CodeDSI, an end-to-end unified approach to code search. CodeDSI is trained to directly map natural language queries to their respective code samples, which can be retrieved later. In an effort to improve the performance of code search, we have investigated docid representation strategies, the impact of tokenization on docid structure, and the effect of dataset size on overall code search performance. Our results demonstrate CodeDSI's strong performance, exceeding conventional robust baselines by 2-6% across varying dataset sizes. \ No newline at end of file diff --git a/_publications/naik2022probing.markdown b/_publications/naik2022probing.markdown new file mode 100644 index 00000000..7945b28b --- /dev/null +++ b/_publications/naik2022probing.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "Probing Semantic Grounding in Language Models of Code with Representational Similarity Analysis" +authors: Shounak Naik, Rajaswa Patil, Swati Agarwal, Veeky Baths +conference: International Conference on Advanced Data Mining and Applications (ADMA 2022) +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2207.07706"} + - {name: "PDF", url: "https://link.springer.com/chapter/10.1007/978-3-031-22137-8_29"} + - {name: "Code", url: "https://github.com/shounaknaik/Probing-Semantic-Grounding-in-Language-Models-of-Code-with-Representational-Similarity-Analysis"} +tags: ["interpretability", "language model", "evaluation", "Transformer"] +--- +Representational Similarity Analysis is a method from cognitive neuroscience, which helps in comparing representations from two different sources of data. In this paper, we propose using Representational Similarity Analysis to probe the semantic grounding in language models of code. We probe representations from the CodeBERT model for semantic grounding by using the data from the IBM CodeNet dataset. Through our experiments, we show that current pre-training methods do not induce semantic grounding in language models of code, and instead focus on optimizing form-based patterns. We also show that even a little amount of fine-tuning on semantically relevant tasks increases the semantic grounding in CodeBERT significantly. Our ablations with the input modality to the CodeBERT model show that using bimodal inputs (code and natural language) over unimodal inputs (only code) gives better semantic grounding and sample efficiency during semantic fine-tuning. Finally, our experiments with semantic perturbations in code reveal that CodeBERT is able to robustly distinguish between semantically correct and incorrect code.
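Representational Similarity Analysis, the probing method used in the entry above, is compact enough to show end-to-end: embed the same inputs with two models, form each model's pairwise-dissimilarity matrix, and rank-correlate the two. The sketch below uses random stand-in embeddings; in the paper these would be CodeBERT representations of the same code snippets.

```python
# Minimal RSA sketch: correlate two models' representational dissimilarity matrices.
import numpy as np
from scipy.spatial.distance import pdist
from scipy.stats import spearmanr

rng = np.random.default_rng(0)
emb_a = rng.normal(size=(20, 128))  # stand-in: 20 snippets embedded by model A
emb_b = rng.normal(size=(20, 256))  # the same 20 snippets embedded by model B

rdm_a = pdist(emb_a, metric="cosine")  # condensed pairwise dissimilarities
rdm_b = pdist(emb_b, metric="cosine")
rho, _ = spearmanr(rdm_a, rdm_b)       # RSA score: rank correlation of the RDMs
print(f"RSA (Spearman rho) = {rho:.3f}")
```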
diff --git a/_publications/nair2020funcgnn.markdown b/_publications/nair2020funcgnn.markdown new file mode 100644 index 00000000..7f7932d7 --- /dev/null +++ b/_publications/nair2020funcgnn.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "funcGNN: A Graph Neural Network Approach to Program Similarity" +authors: Aravind Nair, Avijit Roy, Karl Meinke +conference: ESEM +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2007.13239"} +tags: ["GNN", "clone"] +--- +Program similarity is a fundamental concept, central to the solution of software engineering tasks such as software plagiarism, clone identification, code refactoring and code search. Accurate similarity estimation between programs requires an in-depth understanding of their structure, semantics and flow. A control flow graph (CFG) is a graphical representation of a program which captures its logical control flow and hence its semantics. A common approach is to estimate program similarity by analysing CFGs using graph similarity measures, e.g. graph edit distance (GED). However, graph edit distance is an NP-hard problem and computationally expensive, making the application of graph similarity techniques to complex software programs impractical. This study examines the effectiveness of graph neural networks for estimating program similarity by analysing the associated control flow graphs. We introduce funcGNN, a graph neural network trained on labeled CFG pairs to predict the GED between unseen program pairs by utilizing an effective embedding vector. To our knowledge, this is the first time graph neural networks have been applied on labeled CFGs for estimating the similarity between high-level language programs. We demonstrate the effectiveness of funcGNN in estimating the GED between programs: our experimental analysis shows that it achieves a lower error rate (0.00194), runs faster (23 times faster than the quickest traditional GED approximation method), and scales better than state-of-the-art methods. funcGNN possesses the inductive learning ability to infer program structure and generalise to unseen programs. The graph embedding of a program proposed by our methodology could be applied to several related software engineering problems (such as code plagiarism and clone identification), thus opening multiple research directions. diff --git a/_publications/nguyen2013lexical.markdown b/_publications/nguyen2013lexical.markdown index 146364fb..c9ae218e 100644 --- a/_publications/nguyen2013lexical.markdown +++ b/_publications/nguyen2013lexical.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Lexical Statistical Machine Translation for Language Migration" -authors: A. T. Nguyen, T. T. Nguyen, T. N. Nguyen +authors: Anh Tuan Nguyen, Tung Thanh Nguyen, Tien N. Nguyen conference: FSE year: 2013 -bibkey: nguyen2013lexical tags: ["migration", "API"] --- Prior research has shown that source code also exhibits naturalness, i.e. it is written by humans and is likely to be diff --git a/_publications/nguyen2013statistical.markdown b/_publications/nguyen2013statistical.markdown index 33171ac5..d78f4953 100644 --- a/_publications/nguyen2013statistical.markdown +++ b/_publications/nguyen2013statistical.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "A Statistical Semantic Language Model for Source Code" -authors: T.T. Nguyen, A.T. Nguyen, H.A. Nguyen, T.N. Nguyen +authors: Tung Thanh Nguyen, Anh Tuan Nguyen, Hoan Anh Nguyen, Tien N.
Nguyen conference: FSE year: 2013 -bibkey: nguyen2013statistical tags: ["language model"] --- Recent research has successfully applied the statistical n-gram language model to show that source code exhibits a diff --git a/_publications/nguyen2013study.markdown b/_publications/nguyen2013study.markdown index 74baa75f..114b9c5f 100644 --- a/_publications/nguyen2013study.markdown +++ b/_publications/nguyen2013study.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "A Study of Repetitiveness of Code Changes in Software Evolution" -authors: H.A. Nguyen, A.T. Nguyen, T.T. Nguyen, T.N. Nguyen, H. Rajan +authors: Hoan Anh Nguyen, Anh Tuan Nguyen, Tung Thanh Nguyen, Tien N. Nguyen, Hridesh Rajan conference: ASE year: 2013 -bibkey: nguyen2013study tags: ["edit"] --- In this paper, we present a large-scale study of diff --git a/_publications/nguyen2014statistical.markdown b/_publications/nguyen2014statistical.markdown index fec01c7f..5c791ab2 100644 --- a/_publications/nguyen2014statistical.markdown +++ b/_publications/nguyen2014statistical.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Statistical Learning Approach for Mining API Usage Mappings for Code Migration" -authors: A.T. Nguyen, H.A. Nguyen, T.T. Nguyen, T.N. Nguyen +authors: Anh Tuan Nguyen, Hoan Anh Nguyen, Tung Thanh Nguyen, Tien N. Nguyen conference: ASE year: 2014 -bibkey: nguyen2014statistical tags: ["migration", "API"] --- The same software product nowadays could appear in multiple platforms and devices. To address business needs, software companies diff --git a/_publications/nguyen2015divide.markdown b/_publications/nguyen2015divide.markdown index 565206b1..13c993d3 100644 --- a/_publications/nguyen2015divide.markdown +++ b/_publications/nguyen2015divide.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Divide-and-Conquer Approach for Multi-phase Statistical Migration for Source Code" -authors: A.T. Nguyen, T.T. Nguyen, T.N. Nguyen +authors: Anh Tuan Nguyen, Tung Thanh Nguyen, Tien N. Nguyen conference: ASE year: 2015 -bibkey: nguyen2015divide tags: ["migration"] --- Prior research shows that directly applying phrase-based SMT on lexical tokens to migrate Java to C# produces diff --git a/_publications/nguyen2015graph.markdown b/_publications/nguyen2015graph.markdown index c35743de..5c9a8e07 100644 --- a/_publications/nguyen2015graph.markdown +++ b/_publications/nguyen2015graph.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Graph-based Statistical Language Model for Code" -authors: A.T. Nguyen, T.N. Nguyen +authors: Anh Tuan Nguyen, Tien N. Nguyen conference: ICSE year: 2015 -bibkey: nguyen2015graph tags: ["representation", "language model", "autocomplete"] --- n-gram statistical language model has been successfully applied to capture programming patterns to support code diff --git a/_publications/nguyen2016learning.markdown b/_publications/nguyen2016learning.markdown index a40942e7..7af8d204 100644 --- a/_publications/nguyen2016learning.markdown +++ b/_publications/nguyen2016learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning API Usages from Bytecode: A Statistical Approach" -authors: T.T. Nguyen, H.V. Pham, P.M. Vu, T.T. Nguyen +authors: Tam The Nguyen, Hung Viet Pham, Phong Minh Vu, Tung Thanh Nguyen conference: ICSE year: 2016 -bibkey: nguyen2016learning tags: ["representation", "API"] --- Mobile app developers rely heavily on standard API frameworks and libraries.
However, learning API usages is often challenging due to the fast-changing nature of API frameworks for mobile systems and the insufficiency of API documentation and source code examples. In this paper, we propose a novel approach to learn API usages from bytecode of Android mobile apps. Our core contributions include HAPI, a statistical model of API usages, and three algorithms to extract method call sequences from apps' bytecode, to train HAPI based on those sequences, and to recommend method calls in code completion using the trained HAPIs. Our empirical evaluation shows that our prototype tool can effectively learn API usages from 200 thousand apps containing 350 million method sequences. It recommends the next method calls with a top-3 accuracy of 90% and outperforms baseline approaches by 10-20% on average. diff --git a/_publications/nguyen2016mapping.markdown b/_publications/nguyen2016mapping.markdown index a4078884..39212e21 100644 --- a/_publications/nguyen2016mapping.markdown +++ b/_publications/nguyen2016mapping.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Mapping API Elements for Code Migration with Vector Representations" -authors: T.D. Nguyen, A.T. Nguyen, T.N. Nguyen +authors: Trong Duc Nguyen, Anh Tuan Nguyen, Tien N. Nguyen conference: ICSE year: 2016 -bibkey: nguyen2016mapping tags: ["migration", "API"] --- - +Mapping API elements has a significant role in software development, especially in code migration. A manual process of defining the migration is tedious and error-prone, while recent approaches to automatically mine API mappings are limited to discovering mappings between APIs with textually similar names. This leads to low accuracy in existing migration tools. We propose an approach to automatically mine API mappings which overcomes the lexical mismatch problem. We represent an API by its usages instead of its name. To characterize an API with its context consisting of surrounding APIs in its usages, we take advantage of the Word2Vec model to project the APIs of Java JDK and C# .NET into corresponding continuous vector spaces. The semantic relations among APIs can then be observed in those continuous spaces as the geometric arrangements between their representation vectors. We use a learning approach to derive the linear (e.g., rotating and scaling) transformation function between the two vector spaces. The transformation function is trained from human-defined pairs of API mappings from Java to C#. To find the C# API mapping for a given Java API, we use the learned function to compute its transformed vector in the C# vector space. Then, the C# API whose vector is most similar to the transformed vector is taken as the result. Our experiment shows that with just one suggestion, we are able to correctly derive the API in C# in almost 43% of the cases. With 5 suggestions, we suggest the correct C# API in almost 3 out of 4 cases (73.2%). \ No newline at end of file diff --git a/_publications/nguyen2017exploring.markdown b/_publications/nguyen2017exploring.markdown index 22197b71..dc298c22 100644 --- a/_publications/nguyen2017exploring.markdown +++ b/_publications/nguyen2017exploring.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Exploring API Embedding for API Usages and Applications" -authors: T.D. Nguyen, A.T. Nguyen, H.D. Phan, T.N. Nguyen +authors: Trong Duc Nguyen, Anh Tuan Nguyen, Hung Dang Phan, Tien N.
Nguyen conference: ICSE year: 2017 -bibkey: nguyen2017exploring tags: ["API", "representation"] --- Word2Vec is a class of neural network models that diff --git a/_publications/nguyen2019graph.markdown b/_publications/nguyen2019graph.markdown index 4918b518..1e586aac 100644 --- a/_publications/nguyen2019graph.markdown +++ b/_publications/nguyen2019graph.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Graph-based Mining of In-the-Wild, Fine-grained, Semantic Code Change Patterns" -authors: H. Nguyen, T. Nguyen, D. Dig, S. Nguyen, H. Tran, M. Hilton +authors: Hoan Anh Nguyen, Tien N. Nguyen, Danny Dig, Son Nguyen, Hieu Tran, Michael Hilton conference: ICSE year: 2019 -bibkey: nguyen2019graph tags: ["edit", "pattern mining"] --- Existing approaches for detecting repetitive code changes relying on syntactic similarity cannot effectively detect semantic change patterns. In this work, we introduce a novel graph-based mining approach, CPatMiner, which is capable of detecting semantic code change patterns from a large number of open-source repositories by capturing dependencies between fine-grained change elements. We evaluated CPatMiner by mining change patterns in a diverse corpus of 5,000+ open-source projects from GitHub with 170,000+ developers. We use three complementary methods. First, we sent the mined patterns to the authors and received 108 responses. 70% of respondents recognized those patterns as their meaningful frequent changes. 79% of respondents even named the patterns, and 44% wanted IDEs to automate such repetitive changes. The mined patterns belong to various activities: adaptive (9%), perfective (20%), corrective (35%) and preventive (36%). Second, we compared CPatMiner with the state-of-the-art, AST-based technique, and reported that CPatMiner detects 2.1x more meaningful patterns. Third, we used CPatMiner to search for patterns in a corpus of 88 GitHub projects with longer histories consisting of 164M SLOCs. It constructed 322K fine-grained change graphs containing 3M nodes, and detected 17K change patterns which provide unique insights into the practice of change patterns among individuals and teams. We found that a large percentage (75%) of the patterns from individual developers are commonly shared with others, and this holds true for teams. Moreover, we found that the patterns spread widely over time. Thus, we call for a community-based change pattern database to provide important resources in novel applications. \ No newline at end of file diff --git a/_publications/nguyen2020suggesting.markdown b/_publications/nguyen2020suggesting.markdown index 1a93964e..2c895539 100644 --- a/_publications/nguyen2020suggesting.markdown +++ b/_publications/nguyen2020suggesting.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Suggesting Natural Method Names to Check Name Consistencies" -authors: S. Nguyen, H. Phan, T. Le, T. N. Nguyen +authors: Son Nguyen, Hung Phan, Trinh Le, Tien N. Nguyen conference: ICSE year: 2020 -bibkey: nguyen2020suggesting additional_links: - {name: "Preprint", url: "https://sonvnguyen.github.io/publications/icse20-final.pdf"} tags: ["naming"] diff --git a/_publications/nie2021evaluation.markdown b/_publications/nie2021evaluation.markdown new file mode 100644 index 00000000..c2bb2abd --- /dev/null +++ b/_publications/nie2021evaluation.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Impact of Evaluation Methodologies on Code Summarization" +authors: Pengyu Nie, Jiyang Zhang, Junyi Jessy Li, Raymond J.
Mooney, Milos Gligoric +conference: ACL +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2108.09619"} +tags: ["evaluation", "dataset"] +--- +There has been a growing interest in developing machine learning (ML) models for code summarization tasks, e.g., comment generation and method naming. Despite a substantial increase in the effectiveness of ML models, the evaluation methodologies, i.e., the way people split datasets into training, validation, and test sets, were not well studied. Specifically, no prior work on code summarization considered the timestamps of code and comments during evaluation. This may lead to evaluations that are inconsistent with the intended use cases. In this paper, we introduce the time-segmented evaluation methodology, which is novel to the code summarization research community, and compare it with the mixed-project and cross-project methodologies that have been commonly used. Each methodology can be mapped to some use cases, and the time-segmented methodology should be adopted in the evaluation of ML models for code summarization. To assess the impact of methodologies, we collect a dataset of (code, comment) pairs with timestamps to train and evaluate several recent ML models for code summarization. Our experiments show that different methodologies lead to conflicting evaluation results. We invite the community to expand the set of methodologies used in evaluations. diff --git a/_publications/nijkamp2022conversational.markdown b/_publications/nijkamp2022conversational.markdown new file mode 100644 index 00000000..5d3e1a72 --- /dev/null +++ b/_publications/nijkamp2022conversational.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "A Conversational Paradigm for Program Synthesis" +authors: Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2203.13474"} +tags: ["Transformer", "synthesis"] +--- +Program synthesis strives to generate a computer program as a solution to a given problem specification. We propose a conversational program synthesis approach via large language models, which addresses the challenges of searching over a vast program space and user intent specification faced in prior approaches. Our new approach casts the process of writing a specification and program as a multi-turn conversation between a user and a system. It treats program synthesis as a sequence prediction problem, in which the specification is expressed in natural language and the desired program is conditionally sampled. We train a family of large language models, called CodeGen, on natural language and programming language data. With weak supervision in the data and the scaling up of data size and model size, conversational capacities emerge from simple autoregressive language modeling. To study the model behavior on conversational program synthesis, we develop a multi-turn programming benchmark (MTPB), where solving each problem requires multi-step synthesis via multi-turn conversation between the user and the model. Our findings show the emergence of conversational capabilities and the effectiveness of the proposed conversational program synthesis paradigm. In addition, our model CodeGen (with up to 16B parameters trained on TPU-v4) outperforms OpenAI's Codex on the HumanEval benchmark.
We make the training library JaxFormer including checkpoints available as an open-source contribution: https://github.com/salesforce/CodeGen. diff --git a/_publications/nijkamp2023codegen2.markdown b/_publications/nijkamp2023codegen2.markdown new file mode 100644 index 00000000..ab8f7e4f --- /dev/null +++ b/_publications/nijkamp2023codegen2.markdown @@ -0,0 +1,15 @@ +--- +layout: publication +title: "CodeGen2: Lessons for Training LLMs on Programming and Natural Languages" +authors: Erik Nijkamp, Hiroaki Hayashi, Caiming Xiong, Silvio Savarese, Yingbo Zhou +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2305.02309"} +tags: ["Transformer"] +--- +Large language models (LLMs) have demonstrated remarkable abilities in representation learning for program synthesis and understanding tasks. The quality of the learned representations appears to be dictated by the neural scaling laws as a function of the number of model parameters and observations, while the amount of available data and compute, which is costly, imposes upper bounds on model performance. + +In this study, we attempt to render the training of LLMs for program synthesis more efficient by unifying four key components: (1) model architectures, (2) learning methods, (3) infill sampling, and (4) data distributions. Specifically, for the model architecture, we attempt to unify encoder- and decoder-based models into a single prefix-LM. For learning methods, (i) causal language modeling, (ii) span corruption, and (iii) infilling are unified into a simple learning algorithm. For infill sampling, we explore the claim of a "free lunch" hypothesis. For data distributions, the effect of a mixture distribution of programming and natural languages on model performance is explored. + +We conduct a comprehensive series of empirical experiments on 1B LLMs, for which failures and successes of this exploration are distilled into four lessons. We provide a final recipe for training and release CodeGen2 models in sizes of 1B, 3.7B, 7B, and 16B parameters, along with the training framework as open-source: https://github.com/salesforce/CodeGen2 diff --git a/_publications/nitin2021direct.markdown b/_publications/nitin2021direct.markdown new file mode 100644 index 00000000..03a9b529 --- /dev/null +++ b/_publications/nitin2021direct.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "DIRECT : A Transformer-based Model for Decompiled Identifier Renaming" +authors: Vikram Nitin, Anthony Saieva, Baishakhi Ray, Gail Kaiser +conference: NLP4Prog +year: 2021 +additional_links: + - {name: "PDF", url: "https://aclanthology.org/2021.nlp4prog-1.6.pdf"} +tags: ["Transformer", "decompilation"] +--- +Decompiling binary executables to high-level code is an important step in reverse engineering scenarios, such as malware analysis and legacy code maintenance. However, the generated high-level code is difficult to understand since the original variable names are lost. In this paper, we leverage transformer models to reconstruct the original variable names from decompiled code. Inherent differences between code and natural language present certain challenges in applying conventional transformer-based architectures to variable name recovery. We propose DIRECT, a novel transformer-based architecture customized specifically for the task at hand. We evaluate our model on a dataset of decompiled functions and find that DIRECT outperforms the previous state-of-the-art model by up to 20%.
We also present ablation studies evaluating the impact of each of our modifications. We make the source code of DIRECT available to encourage reproducible research. diff --git a/_publications/niu2022spt-code.markdown b/_publications/niu2022spt-code.markdown new file mode 100644 index 00000000..8a42fa41 --- /dev/null +++ b/_publications/niu2022spt-code.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "SPT-Code: Sequence-to-Sequence Pre-Training for Learning Source Code Representations" +authors: Changan Niu, Chuanyi Li, Vincent Ng, Jidong Ge, Liguo Huang, Bin Luo +conference: ICSE +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2201.01549"} + - {name: "code", url: "https://github.com/NougatCA/SPT-Code"} +tags: ["Transformer", "representation"] +--- +Recent years have seen the successful application of large pre-trained models to code representation learning, resulting in substantial improvements on many code-related downstream tasks. But there are issues surrounding their application to SE tasks. First, the majority of the pre-trained models focus on pre-training only the encoder of the Transformer. For generation tasks that are addressed using models with the encoder-decoder architecture, however, there is no reason why the decoder should be left out during pre-training. Second, many existing pre-trained models, including state-of-the-art models such as T5-learning, simply reuse the pre-training tasks designed for natural languages. Moreover, to learn the natural language description of source code needed eventually for code-related tasks such as code summarization, existing pre-training tasks require a bilingual corpus composed of source code and the associated natural language description, which severely limits the amount of data for pre-training. To this end, we propose SPT-Code, a sequence-to-sequence pre-trained model for source code. In order to pre-train SPT-Code in a sequence-to-sequence manner and address the aforementioned weaknesses associated with existing pre-training tasks, we introduce three pre-training tasks that are specifically designed to enable SPT-Code to learn knowledge of source code, the corresponding code structure, as well as a natural language description of the code without relying on any bilingual corpus, and eventually exploit these three sources of information when it is applied to downstream tasks. Experimental results demonstrate that SPT-Code achieves state-of-the-art performance on five code-related downstream tasks after fine-tuning. diff --git a/_publications/nye2021program.markdown b/_publications/nye2021program.markdown new file mode 100644 index 00000000..9cd7e576 --- /dev/null +++ b/_publications/nye2021program.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Program Synthesis with Large Language Models" +authors: Jacob Austin, Augustus Odena, Maxwell Nye, Maarten Bosma, Henryk Michalewski, David Dohan, Ellen Jiang, Carrie Cai, Michael Terry, Quoc Le, Charles Sutton +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2108.07732"} +tags: ["Transformer", "synthesis"] +--- +This paper explores the limits of the current generation of large language models for program synthesis in general-purpose programming languages. We evaluate a collection of such models (with between 244M and 137B parameters) on two new benchmarks, MBPP and MathQA-Python, in both the few-shot and fine-tuning regimes.
Our benchmarks are designed to measure the ability of these models to synthesize short Python programs from natural language descriptions. The Mostly Basic Programming Problems (MBPP) dataset contains 974 programming tasks, designed to be solvable by entry-level programmers. The MathQA-Python dataset, a Python version of the MathQA benchmark, contains 23914 problems that evaluate the ability of the models to synthesize code from more complex text. On both datasets, we find that synthesis performance scales log-linearly with model size. Our largest models, even without finetuning on a code dataset, can synthesize solutions to 59.6 percent of the problems from MBPP using few-shot learning with a well-designed prompt. Fine-tuning on a held-out portion of the dataset improves performance by about 10 percentage points across most model sizes. On the MathQA-Python dataset, the largest fine-tuned model achieves 83.8 percent accuracy. Going further, we study the model's ability to engage in dialog about code, incorporating human feedback to improve its solutions. We find that natural language feedback from a human halves the error rate compared to the model's initial prediction. Additionally, we conduct an error analysis to shed light on where these models fall short and what types of programs are most difficult to generate. Finally, we explore the semantic grounding of these models by fine-tuning them to predict the results of program execution. We find that even our best models are generally unable to predict the output of a program given a specific input. diff --git a/_publications/nye2021show.markdown b/_publications/nye2021show.markdown new file mode 100644 index 00000000..3bb58a6f --- /dev/null +++ b/_publications/nye2021show.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Show Your Work: Scratchpads for Intermediate Computation with Language Models" +authors: Maxwell Nye, Anders Johan Andreassen, Guy Gur-Ari, Henryk Michalewski, Jacob Austin, David Bieber, David Dohan, Aitor Lewkowycz, Maarten Bosma, David Luan, Charles Sutton, Augustus Odena +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2112.00114"} +tags: ["Transformer", "execution"] +--- +Large pre-trained language models perform remarkably well on tasks that can be done "in one pass", such as generating realistic text or synthesizing computer programs. However, they struggle with tasks that require unbounded multi-step computation, such as adding integers or executing programs. Surprisingly, we find that these same models are able to perform complex multi-step computations -- even in the few-shot regime -- when asked to perform the operation "step by step", showing the results of intermediate computations. In particular, we train transformers to perform multi-step computations by asking them to emit intermediate computation steps into a "scratchpad". On a series of increasingly complex tasks ranging from long addition to the execution of arbitrary programs, we show that scratchpads dramatically improve the ability of language models to perform multi-step computations. diff --git a/_publications/oda2015learning.markdown b/_publications/oda2015learning.markdown index 0e2cb897..7e1a8897 100644 --- a/_publications/oda2015learning.markdown +++ b/_publications/oda2015learning.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "Learning to Generate Pseudo-code from Source Code using Statistical Machine Translation" -authors: Y. Oda, H. Fudaba, G. Neubig, H. Hata, S. 
Sakti, T. Toda, and S. Nakamura +authors: Yusuke Oda, Hiroyuki Fudaba, Graham Neubig, Hideaki Hata, Sakriani Sakti, Tomoki Toda, Satoshi Nakamura conference: ASE year: 2015 -bibkey: oda2015learning -tags: ["representation", "bimodal", "AST"] +tags: ["representation", "bimodal", "grammar"] --- Pseudo-code written in natural language can aid the comprehension of source code in unfamiliar programming diff --git a/_publications/oh2015learning.markdown b/_publications/oh2015learning.markdown index a3075012..e4b91de3 100644 --- a/_publications/oh2015learning.markdown +++ b/_publications/oh2015learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning a Strategy for Adapting a Program Analysis via Bayesian Optimisation" -authors: H. Oh, H. Yang, K, Yi +authors: Hakjoo Oh, Hongseok Yang, Kwangkeun Yi conference: OOPSLA year: 2015 -bibkey: oh2015learning tags: ["program analysis"] --- Building a cost-effective static analyser for real-world programs is still regarded as an art. One key contributor to this diff --git a/_publications/olausson2023demystifying.markdown b/_publications/olausson2023demystifying.markdown new file mode 100644 index 00000000..8f89853a --- /dev/null +++ b/_publications/olausson2023demystifying.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Demystifying GPT Self-Repair for Code Generation" +authors: Theo X. Olausson, Jeevana Priya Inala, Chenglong Wang, Jianfeng Gao, Armando Solar-Lezama +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2306.09896"} +tags: ["repair"] +--- +Large Language Models (LLMs) have shown remarkable aptitude in code generation but still struggle on challenging programming tasks. Self-repair -- in which the model debugs and fixes mistakes in its own code -- has recently become a popular way to boost performance in these settings. However, the literature contains only very limited studies of how and when self-repair works effectively, and one might wonder to what extent a model is really capable of providing accurate feedback on why the code is wrong when that code was generated by the same model. In this paper, we analyze GPT-3.5 and GPT-4's ability to perform self-repair on APPS, a challenging dataset consisting of diverse coding challenges. To do so, we first establish a new evaluation strategy dubbed pass@t that measures the pass rate of the tasks against the total number of tokens sampled from the model, enabling a fair comparison to purely sampling-based approaches. With this evaluation strategy, we find that the effectiveness of self-repair is only seen in GPT-4. We also observe that self-repair is bottlenecked by the feedback stage; using GPT-4 to give feedback on the programs generated by GPT-3.5 and using expert human programmers to give feedback on the programs generated by GPT-4, we unlock significant performance gains. diff --git a/_publications/omar2013structured.markdown b/_publications/omar2013structured.markdown index d01b769b..6c03c8af 100644 --- a/_publications/omar2013structured.markdown +++ b/_publications/omar2013structured.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "Structured Statistical Syntax Tree Prediction" -authors: C.
Omar +authors: Cyrus Omar conference: SPLASH year: 2013 -bibkey: omar2013structured -tags: ["language model", "AST"] +tags: ["language model", "grammar"] --- Statistical models of source code can be used to improve code completion systems, assistive interfaces, and code diff --git a/_publications/orlanski2021reading.markdown b/_publications/orlanski2021reading.markdown new file mode 100644 index 00000000..a3c31c09 --- /dev/null +++ b/_publications/orlanski2021reading.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Reading StackOverflow Encourages Cheating: Adding Question Text Improves Extractive Code Generation" +authors: Gabriel Orlanski, Alex Gittens +conference: NLP4Prog +year: 2021 +additional_links: + - {name: "PDF", url: "https://aclanthology.org/2021.nlp4prog-1.8.pdf"} +tags: ["dataset", "Transformer"] +--- +Answering a programming question with only its title is difficult as salient contextual information is left out. To address this, we present a corpus of over 40,000 StackOverflow question texts to be used in conjunction with the corresponding intents from the CoNaLa dataset (Yin et al., 2018). Using both the intent and the question body, we use BART to establish a baseline BLEU score of 34.35 for this new task. We then find further improvements of 2.8% by combining the mined CoNaLa data with the labeled data to achieve a 35.32 BLEU score. We then evaluate the prior state-of-the-art CoNaLa models with this additional data. We find that our proposed method of using the body and mined data beats the previous state-of-the-art by a 71.96% BLEU score. Finally, we perform ablations that prove that BART is an unsupervised multimodal learner and examine its extractive behavior. diff --git a/_publications/ott2018deep.markdown b/_publications/ott2018deep.markdown index 08e9777b..a117bce0 100644 --- a/_publications/ott2018deep.markdown +++ b/_publications/ott2018deep.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "A Deep Learning Approach to Identifying Source Code in Images and Video" -authors: J. Ott, A. Atchison, P. Harnack, A. Bergh, E. Linstead +authors: Jordan Ott, Abigail Atchison, Paul Harnack, Adrienne Bergh, Erik Linstead conference: MSR year: 2018 -bibkey: ott2018deep tags: ["information extraction"] --- While substantial progress has been made in mining code on an diff --git a/_publications/pandi2020opttyper.markdown b/_publications/pandi2020opttyper.markdown index 27287bb0..b662c530 100644 --- a/_publications/pandi2020opttyper.markdown +++ b/_publications/pandi2020opttyper.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "OptTyper: Probabilistic Type Inference by Optimising Logical and Natural Constraints" -authors: I. V. Pandi, E.T. Barr, A.D. Gordon, C. Sutton +authors: Irene Vlassi Pandi, Earl T. Barr, Andrew D. Gordon, Charles Sutton conference: year: 2020 -bibkey: pandi2020opttyper additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2004.00348"} tags: ["types", "bimodal"] diff --git a/_publications/panthaplackel2020associating.markdown b/_publications/panthaplackel2020associating.markdown index 74c177fb..c021aef4 100644 --- a/_publications/panthaplackel2020associating.markdown +++ b/_publications/panthaplackel2020associating.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Associating Natural Language Comment and Source Code Entities" -authors: S. Panthaplackel, M. Gligoric, R. J. Mooney, J. J. Li +authors: Sheena Panthaplackel, Milos Gligoric, Raymond J.
Mooney, Junyi Jessy Li conference: AAAI year: 2020 -bibkey: panthaplackel2020associating additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1912.06728"} tags: ["dataset", "bimodal"] diff --git a/_publications/panthaplackel2020copy.markdown b/_publications/panthaplackel2020copy.markdown index b4605ca6..d09c2960 100644 --- a/_publications/panthaplackel2020copy.markdown +++ b/_publications/panthaplackel2020copy.markdown @@ -4,7 +4,6 @@ title: "Copy that! Editing Sequences by Copying Spans" authors: Sheena Panthaplackel, Miltiadis Allamanis, Marc Brockschmidt conference: year: 2020 -bibkey: panthaplackel2020copy additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2006.04771"} tags: ["edit"] diff --git a/_publications/panthaplackel2020deep.markdown b/_publications/panthaplackel2020deep.markdown new file mode 100644 index 00000000..30e722e1 --- /dev/null +++ b/_publications/panthaplackel2020deep.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Deep Just-In-Time Inconsistency Detection Between Comments and Source Code" +authors: Sheena Panthaplackel, Junyi Jessy Li, Milos Gligoric, Raymond J. Mooney +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2010.01625"} +tags: ["edit", "bimodal", "documentation"] +--- +Natural language comments convey key aspects of source code such as implementation, usage, and pre- and post-conditions. Failure to update comments accordingly when the corresponding code is modified introduces inconsistencies, which is known to lead to confusion and software bugs. In this paper, we aim to detect whether a comment becomes inconsistent as a result of changes to the corresponding body of code, in order to catch potential inconsistencies just-in-time, i.e., before they are committed to a version control system. To achieve this, we develop a deep-learning approach that learns to correlate a comment with code changes. By evaluating on a large corpus of comment/code pairs spanning various comment types, we show that our model outperforms multiple baselines by significant margins. For extrinsic evaluation, we show the usefulness of our approach by combining it with a comment update model to build a more comprehensive automatic comment maintenance system which can both detect and resolve inconsistent comments based on code changes. diff --git a/_publications/panthaplackel2020learning.markdown b/_publications/panthaplackel2020learning.markdown index ea998df9..5fb3b7a2 100644 --- a/_publications/panthaplackel2020learning.markdown +++ b/_publications/panthaplackel2020learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning to Update Natural Language Comments Based on Code Changes" -authors: S. Panthaplackel, P.Nie, M. Gligoric, J. J. Li, R. J. Mooney +authors: Sheena Panthaplackel, Pengyu Nie, Milos Gligoric, Raymond J. Mooney, Junyi Jessy Li conference: ACL year: 2020 -bibkey: panthaplackel2020learning additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2004.12169"} tags: ["bimodal", "edit", "documentation"] diff --git a/_publications/panthaplackel2021learning.markdown b/_publications/panthaplackel2021learning.markdown new file mode 100644 index 00000000..4c33b959 --- /dev/null +++ b/_publications/panthaplackel2021learning.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Learning to Describe Solutions for Bug Reports Based on Developer Discussions" +authors: Sheena Panthaplackel, Junyi Jessy Li, Milos Gligoric, Raymond J. 
Mooney +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2110.04353"} +tags: ["summarization", "documentation"] +--- +When a software bug is reported, developers engage in a discussion to collaboratively resolve it. While the solution is likely formulated within the discussion, it is often buried in a large amount of text, making it difficult to comprehend, which delays its implementation. To expedite bug resolution, we propose generating a concise natural language description of the solution by synthesizing relevant content within the discussion, which encompasses both natural language and source code. Furthermore, to support generating an informative description during an ongoing discussion, we propose a secondary task of determining when sufficient context about the solution emerges in real-time. We construct a dataset for these tasks with a novel technique for obtaining noisy supervision from repository changes linked to bug reports. We establish baselines for generating solution descriptions, and develop a classifier which makes a prediction following each new utterance on whether or not the necessary context for performing generation is available. Through automated and human evaluation, we find these tasks to form an ideal testbed for complex reasoning in long, bimodal dialogue context. diff --git a/_publications/panthaplackel2022using.markdown b/_publications/panthaplackel2022using.markdown new file mode 100644 index 00000000..1597adcc --- /dev/null +++ b/_publications/panthaplackel2022using.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Using Developer Discussions to Guide Fixing Bugs in Software" +authors: Sheena Panthaplackel, Milos Gligoric, Junyi Jessy Li, Raymond J. Mooney +conference: EMNLP +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2211.06335"} +tags: ["Transformer", "repair"] +--- +Automatically fixing software bugs is a challenging task. While recent work showed that natural language context is useful in guiding bug-fixing models, the approach required prompting developers to provide this context, which was simulated through commit messages written after the bug-fixing code changes were made. We instead propose using bug report discussions, which are available before the task is performed and are also naturally occurring, avoiding the need for any additional information from developers. For this, we augment standard bug-fixing datasets with bug report discussions. Using these newly compiled datasets, we demonstrate that various forms of natural language context derived from such discussions can aid bug-fixing, even leading to improved performance over using commit messages corresponding to the oracle bug-fixing commits. 
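As a concrete illustration of the kind of input augmentation studied here, the sketch below prepends discussion-derived context to the buggy code before it reaches a sequence-to-sequence repair model; the separator token and truncation policy are assumptions for illustration, not the authors' format.

```python
def build_repair_input(buggy_code, discussion, max_utterances=5):
    # Hypothetical formatting: join the first few discussion utterances
    # and prepend them to the buggy code as natural-language context.
    context = " </s> ".join(u.strip() for u in discussion[:max_utterances])
    return f"{context} </s> {buggy_code}"

print(build_repair_input(
    "if (i <= n) { ... }",
    ["The loop reads one element past the end of the array.",
     "Looks like the bound check should be strict."],
))
```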
diff --git a/_publications/parisi2021source.markdown b/_publications/parisi2021source.markdown new file mode 100644 index 00000000..4cff09c3 --- /dev/null +++ b/_publications/parisi2021source.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Source Code Classification for Energy Efficiency in Parallel Ultra Low-Power Microcontrollers" +authors: Emanuele Parisi, Francesco Barchi, Andrea Bartolini, Giuseppe Tagliavini, Andrea Acquaviva +conference: DATE +year: 2021 +additional_links: + - {name: "IEEE", url: "https://ieeexplore.ieee.org/document/9474085"} + - {name: "ArXiV", url: "https://arxiv.org/abs/2012.06836"} +tags: ["optimization", "program analysis"] +--- +The analysis of source code through machine learning techniques is an increasingly explored research topic aiming to make the software toolchain smart enough to exploit modern architectures in the best possible way. In the case of low-power, parallel embedded architectures, this means finding the configuration, for instance in terms of the number of cores, leading to minimum energy consumption. Depending on the kernel to be executed, identifying the energy-optimal scaling configuration is not trivial. While recent work has focused on general-purpose systems to learn and predict the best execution target in terms of the execution time of a snippet of code or kernel (e.g. offloading an OpenCL kernel onto a multicore CPU or GPU), in this work we focus on static compile-time features to assess if they can be successfully used to predict the minimum energy configuration on PULP, an ultra-low-power architecture featuring an on-chip cluster of RISC-V processors. Experiments show that using machine learning models on the source code to select the best energy scaling configuration automatically is viable and has the potential to be used in the context of automatic system configuration for energy minimisation. \ No newline at end of file diff --git a/_publications/parisi2022making.markdown b/_publications/parisi2022making.markdown new file mode 100644 index 00000000..0c1efc18 --- /dev/null +++ b/_publications/parisi2022making.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Making the Most of Scarce Input Data in Deep Learning-Based Source Code Classification for Heterogeneous Device Mapping" +authors: Emanuele Parisi, Francesco Barchi, Andrea Bartolini, Andrea Acquaviva +journal: IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems +year: 2022 +additional_links: + - {name: "IEEE", url: "https://ieeexplore.ieee.org/document/9544064"} + - {name: "code", url: "https://gitlab.com/ecs-lab/deepllvm"} +tags: ["optimization", "program analysis", "static analysis", "language model"] +--- +Despite its relatively recent history, deep learning (DL)-based source code analysis is already a cornerstone in machine learning for compiler optimization. When applied to the classification of pieces of code to identify the best computational unit in a heterogeneous Systems-on-Chip, it can be effective in supporting decisions that a programmer would otherwise have to take manually. Several techniques have been proposed exploiting different networks and input information, prominently sequence-based and graph-based representations, complemented by auxiliary information typically related to payload and device configuration.
While the accuracy of DL methods strongly depends on the training and test datasets, so far no exhaustive and statistically meaningful analysis has been done on its impact on the results and on how to effectively extract the available information. This is relevant also considering the scarce availability of source code datasets that can be labeled by profiling on heterogeneous compute units. In this article, we first present such a study, which leads us to assess the contribution of code sequences and auxiliary inputs separately. Starting from this analysis, we then demonstrate that by using the normalization of auxiliary information, it is possible to improve state-of-the-art results in terms of accuracy. Finally, we propose a novel approach exploiting Siamese networks that further improves mapping accuracy by increasing the cardinality of the dataset, thus compensating for its relatively small size. \ No newline at end of file diff --git a/_publications/parvez2018building.markdown b/_publications/parvez2018building.markdown index 53b5f93e..955eb000 100644 --- a/_publications/parvez2018building.markdown +++ b/_publications/parvez2018building.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Building Language Models for Text with Named Entities" -authors: M.R. Parvez, S. Chakraborty, B. Ray, KW Chang +authors: Md Rizwan Parvez, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang conference: ACL year: 2018 -bibkey: parvez2018building tags: ["language model"] --- Text in many domains involves a significant amount of named entities. Predicting the entity names is often challenging diff --git a/_publications/parvez2021retrieval.markdown b/_publications/parvez2021retrieval.markdown new file mode 100644 index 00000000..78c36878 --- /dev/null +++ b/_publications/parvez2021retrieval.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Retrieval Augmented Code Generation and Summarization" +authors: Md Rizwan Parvez, Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang +conference: EMNLP-Findings +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2108.11601"} +tags: ["Transformer", "summarization", "code generation"] +--- +Software developers write a lot of source code and documentation during software development. Intrinsically, developers often recall parts of source code or code summaries that they had written in the past while implementing software or documenting them. To mimic developers' code or summary generation behavior, we propose a retrieval augmented framework, REDCODER, that retrieves relevant code or summaries from a retrieval database and provides them as a supplement to code generation or summarization models. REDCODER has two unique features. First, it extends the state-of-the-art dense retrieval technique to search for relevant code or summaries. Second, it can work with retrieval databases that include unimodal (only code or natural language description) or bimodal instances (code-description pairs). We conduct experiments and extensive analysis on two benchmark datasets of code generation and summarization in Java and Python, and the promising results endorse the effectiveness of our proposed retrieval augmented framework.
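To make the retrieval-augmented setup concrete, here is a minimal sketch of the two steps such a framework combines: dense retrieval over an embedded database, then supplementing the generator's input with the hits. The encoder, separator, and input format are illustrative assumptions, not REDCODER's actual components.

```python
import numpy as np

def top_k_neighbors(query_vec, db_vecs, k=3):
    # Dense retrieval: cosine similarity between the query embedding and
    # every code/summary embedding in the database; return top-k indices.
    q = query_vec / np.linalg.norm(query_vec)
    db = db_vecs / np.linalg.norm(db_vecs, axis=1, keepdims=True)
    return np.argsort(-(db @ q))[:k]

def augment_input(nl_query, retrieved_snippets):
    # Supplement the generation model's input with the retrieved items.
    return " <sep> ".join([nl_query, *retrieved_snippets])

# Toy usage with random vectors standing in for a trained dense encoder.
rng = np.random.default_rng(1)
db_vecs = rng.normal(size=(100, 256))
snippets = [f"snippet_{i}" for i in range(100)]
query_vec = rng.normal(size=256)
hits = top_k_neighbors(query_vec, db_vecs)
print(augment_input("sort a list of pairs by the second item",
                    [snippets[i] for i in hits]))
```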
diff --git a/_publications/pashakhanloo2022codetrek.markdown b/_publications/pashakhanloo2022codetrek.markdown new file mode 100644 index 00000000..bac7858f --- /dev/null +++ b/_publications/pashakhanloo2022codetrek.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "CodeTrek: Flexible Modeling of Code using an Extensible Relational Representation" +authors: Pardis Pashakhanloo, Aaditya Naik, Yuepeng Wang, Hanjun Dai, Petros Maniatis, Mayur Naik +conference: ICLR +year: 2022 +additional_links: + - {name: "OpenReview", url: "https://openreview.net/forum?id=WQc075jmBmf"} +tags: ["representation", "variable misuse"] +--- +Designing a suitable representation for code-reasoning tasks is challenging in aspects such as the kinds of program information to model, how to combine them, and how much context to consider. We propose CodeTrek, a deep learning approach that addresses these challenges by representing codebases as databases that conform to rich relational schemas. The relational representation not only allows CodeTrek to uniformly represent diverse kinds of program information, but also to leverage program-analysis queries to derive new semantic relations, which can be readily incorporated without further architectural engineering. CodeTrek embeds this relational representation using a set of walks that can traverse different relations in an unconstrained fashion, and incorporates all relevant attributes along the way. We evaluate CodeTrek on four diverse and challenging Python tasks: variable misuse, exception prediction, unused definition, and variable shadowing. +CodeTrek achieves an accuracy of 91%, 63%, 98%, and 94% on these tasks respectively, and outperforms state-of-the-art neural models by 2-19 percentage points. diff --git a/_publications/patil2022exploring.markdown b/_publications/patil2022exploring.markdown new file mode 100644 index 00000000..be5a7c12 --- /dev/null +++ b/_publications/patil2022exploring.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Exploring Dimensions of Generalizability and Few-shot Transfer for Text-to-SQL Semantic Parsing" +authors: Rajaswa Patil, Manasi Patwardhan, Shirish Karande, Lovekesh Vig, Gautam Shroff +conference: The 1st Transfer Learning for Natural Language Processing Workshop (TL4NLP 2022) +year: 2022 +additional_links: + - {name: "PDF", url: "https://proceedings.mlr.press/v203/patil23a.html"} + - {name: "Data", url: "https://github.com/ManasiPat/Spider-Gen"} +tags: ["dataset", "evaluation", "Transformer", "benchmark", "generalizability"] +--- +Existing work on generalization in Text-to-SQL semantic parsing has been restricted to a zero-shot cross-domain setting. In this paper, we introduce Spider-Gen: a Text-to-SQL benchmark to develop a paradigm of transfer learning across distinct dimensions of generalization in Text-to-SQL semantic parsing. The Spider-Gen benchmark focuses on few-shot adaption for Cross-domain, Lexical, and Structural generalization of Text-to-SQL models. Through our experiments with the Spider-Gen dataset, we show that Seq2Seq language models struggle to generalize against changes in data distribution, lexical changes in database schema, and changes in SQL query complexity. Our experiments also reveal that performing few-shot fine-tuning helps Text-to-SQL models to generalize across these changes. However, such few-shot adaptation comes with a negative effect on the knowledge learnt during training.
Hence, we also explore parameter-efficient fine-tuning methods to overcome the limitations of Seq2Seq Text-to-SQL models. We release the Spider-Gen dataset publicly to facilitate further research in generalization and transfer learning across various dimensions in Text-to-SQL semantic parsing. diff --git a/_publications/patra2016learning.markdown b/_publications/patra2016learning.markdown index 4816b918..ca22ea7c 100644 --- a/_publications/patra2016learning.markdown +++ b/_publications/patra2016learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning to Fuzz: Application-Independent Fuzz Testing with Probabilistic, Generative Models of Input Data" -authors: J. Patra, M. Pradel +authors: Jibesh Patra, Michael Pradel conference: year: 2016 -bibkey: patra2016learning tags: ["fuzzing"] --- Fuzzing is a popular technique to create test inputs for software that processes structured data. It has been successfully diff --git a/_publications/patra2021semantic.markdown b/_publications/patra2021semantic.markdown new file mode 100644 index 00000000..0ac60a9a --- /dev/null +++ b/_publications/patra2021semantic.markdown @@ -0,0 +1,32 @@ +--- +layout: publication +title: "Semantic Bug Seeding: A Learning-Based Approach for Creating Realistic Bugs" +authors: Jibesh Patra, Michael Pradel +conference: FSE +year: 2021 +tags: ["repair", "edit"] +--- +When working on techniques to address the widespread problem +of software bugs, one often faces the need for a large number of +realistic bugs in real-world programs. Such bugs can either help +evaluate an approach, e.g., in the form of a bug benchmark or a suite +of program mutations, or even help build the technique, e.g., in +learning-based bug detection. Because gathering a large number of real bugs is difficult, +a common approach is to rely on automatically +seeded bugs. Prior work seeds bugs based on syntactic transformation patterns, +which often results in unrealistic bugs and typically +cannot introduce new, application-specific code tokens. This paper +presents SemSeed, a technique for automatically seeding bugs in +a semantics-aware way. The key idea is to imitate how a given +real-world bug would look in other programs by semantically +adapting the bug pattern to the local context. To reason about the +semantics of pieces of code, our approach builds on learned token embeddings +that encode the semantic similarities of identifiers and literals. Our +evaluation with real-world JavaScript software +shows that the approach effectively reproduces real bugs and clearly +outperforms a semantics-unaware approach. The seeded bugs are +useful as training data for learning-based bug detection, where +they significantly improve the bug detection ability. Moreover, we +show that SemSeed-created bugs complement existing mutation +testing operators, and that our approach is efficient enough to seed +hundreds of thousands of bugs within an hour.
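The key idea above, adapting a real bug's token-level pattern to a new context via embedding similarity, fits in a few lines. The following is a hypothetical sketch, not SemSeed itself: the pattern format, the embedding table, and all identifiers are invented for illustration.

```python
import numpy as np

def cosine(u, v):
    return float(u @ v / (np.linalg.norm(u) * np.linalg.norm(v)))

def closest_token(anchor, candidates, emb):
    # Local identifier whose learned embedding is most similar to the
    # token that the real bug introduced.
    return max(candidates, key=lambda c: cosine(emb[c], emb[anchor]))

def seed_bug(line, token_to_mutate, bug_pattern, local_identifiers, emb):
    # bug_pattern is a (correct, buggy) token pair harvested from a real
    # bug; imitate it by swapping in the semantically closest local
    # identifier for the token being mutated.
    _, buggy = bug_pattern
    replacement = closest_token(buggy, local_identifiers, emb)
    return line.replace(token_to_mutate, replacement)

# Toy embedding table; a real system learns these from a code corpus.
rng = np.random.default_rng(42)
emb = {t: rng.normal(size=8)
       for t in ["offset", "length", "start", "end", "count"]}
print(seed_bug("copy(buf, start, end)", "end",
               ("offset", "length"), ["start", "count"], emb))
```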
diff --git a/_publications/pearce2021empirical.markdown b/_publications/pearce2021empirical.markdown new file mode 100644 index 00000000..426ca0ee --- /dev/null +++ b/_publications/pearce2021empirical.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "An Empirical Cybersecurity Evaluation of GitHub Copilot's Code Contributions" +authors: Hammond Pearce, Baleegh Ahmad, Benjamin Tan, Brendan Dolan-Gavitt, Ramesh Karri +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2108.09293"} +tags: ["Transformer", "language model"] +--- +There is burgeoning interest in designing AI-based systems to assist humans in designing computing systems, including tools that automatically generate computer code. The most notable of these comes in the form of the first self-described `AI pair programmer', GitHub Copilot, a language model trained over open-source GitHub code. However, code often contains bugs - and so, given the vast quantity of unvetted code that Copilot has processed, it is certain that the language model will have learned from exploitable, buggy code. This raises concerns about the security of Copilot's code contributions. In this work, we systematically investigate the prevalence and conditions that can cause GitHub Copilot to recommend insecure code. To perform this analysis we prompt Copilot to generate code in scenarios relevant to high-risk CWEs (e.g. those from MITRE's "Top 25" list). We explore Copilot's performance on three distinct code generation axes -- examining how it performs given diversity of weaknesses, diversity of prompts, and diversity of domains. In total, we produce 89 different scenarios for Copilot to complete, producing 1,692 programs. Of these, we found approximately 40% to be vulnerable. diff --git a/_publications/peng2021how.markdown b/_publications/peng2021how.markdown new file mode 100644 index 00000000..4bb0e65f --- /dev/null +++ b/_publications/peng2021how.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "How could Neural Networks understand Programs?" +authors: Dinglan Peng, Shuxin Zheng, Yatao Li, Guolin Ke, Di He, Tie-Yan Liu +conference: ICML +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2105.04297"} +tags: ["Transformer"] +--- +Semantic understanding of programs is a fundamental problem for programming language processing (PLP). Recent works that learn representations of code based on pre-training techniques in NLP have pushed the frontiers in this direction. However, the semantics of PL and NL have essential differences. If these differences are ignored, we believe it is difficult to build a model that better understands programs, whether by directly applying off-the-shelf NLP pre-training techniques to the source code or by adding features to the model heuristically. In fact, the semantics of a program can be rigorously defined by formal semantics in PL theory. For example, operational semantics describes the meaning of a valid program as updating the environment (i.e., the memory address-value function) through fundamental operations, such as memory I/O and conditional branching. Inspired by this, we propose a novel program semantics learning paradigm: the model should learn from information composed of (1) the representations which align well with the fundamental operations in operational semantics, and (2) the information of environment transition, which is indispensable for program understanding.
To validate our proposal, we present a hierarchical Transformer-based pre-training model called OSCAR to better facilitate the understanding of programs. OSCAR learns from intermediate representation (IR) and an encoded representation derived from static analysis, which are used for representing the fundamental operations and approximating the environment transitions respectively. OSCAR empirically shows the outstanding capability of program semantics understanding on many practical software engineering tasks. diff --git a/_publications/peng2023generative.markdown b/_publications/peng2023generative.markdown new file mode 100644 index 00000000..7238aea7 --- /dev/null +++ b/_publications/peng2023generative.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Generative Type Inference for Python" +authors: Yun Peng, Chaozheng Wang, Wenxuan Wang, Cuiyun Gao, Michael R. Lyu +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2307.09163"} +tags: ["types"] +--- +Python is a popular dynamic programming language, evidenced by its ranking as the second most commonly used language on GitHub. However, its dynamic type system can lead to potential type errors, leading researchers to explore automatic type inference approaches for Python programs. The rule-based type inference approaches can ensure the accuracy of predicted variable types, but they suffer from low coverage problems. Supervised type inference approaches, while feature-agnostic, require large, high-quality annotated datasets and are limited to pre-defined types. As zero-shot approaches, the cloze-style approaches reformulate the type inference problem into a fill-in-the-blank problem. However, their performance is limited. This paper introduces TypeGen, a few-shot generative type inference approach that incorporates static domain knowledge from static analysis. TypeGen creates chain-of-thought (COT) prompts by translating the type inference steps of static analysis into prompts based on the type dependency graphs (TDGs), enabling language models to learn from how static analysis infers types. By combining COT prompts with code slices and type hints, TypeGen constructs example prompts from human annotations. TypeGen only requires very few annotated examples to teach language models to generate similar COT prompts via in-context learning. Moreover, TypeGen enhances the interpretability of results through the use of the input-explanation-output strategy. Experiments show that TypeGen outperforms the best baseline Type4Py by 10.0% for argument type prediction and 22.5% in return value type prediction in terms of top-1 Exact Match by using only five examples. Furthermore, TypeGen achieves substantial improvements of 27% to 84% compared to the zero-shot performance of large language models with parameter sizes ranging from 1.3B to 175B in terms of top-1 Exact Match. 
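To illustrate the prompting scheme TypeGen describes, the sketch below assembles a few-shot chain-of-thought prompt from a code slice and candidate type hints; the example, function, and prompt wording here are hypothetical rather than the paper's exact format.

```python
# Hypothetical few-shot, chain-of-thought prompt for type inference,
# loosely in the spirit of TypeGen (not the paper's exact prompt format).
EXAMPLE = '''Code:
def mean(xs):
    total = sum(xs)
    return total / len(xs)
Question: what is the type of `total`?
Reasoning: `total` is assigned from `sum(xs)` over numeric elements,
so it is a number; with integer inputs, `sum` returns an int.
Answer: int
'''

def build_prompt(code_slice, variable, type_hints):
    hints = ", ".join(type_hints)
    return (EXAMPLE
            + f"Code:\n{code_slice}\n"
            + f"Question: what is the type of `{variable}`?\n"
            + f"Candidate types (from static analysis): {hints}\n"
            + "Reasoning:")

print(build_prompt(
    'def load(path):\n    with open(path) as f:\n        data = f.read()',
    "data",
    ["str", "bytes"],
))  # the completed Reasoning/Answer lines come from the language model
```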
diff --git a/_publications/phan2021cotext.markdown b/_publications/phan2021cotext.markdown new file mode 100644 index 00000000..e2d5b220 --- /dev/null +++ b/_publications/phan2021cotext.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "CoTexT: Multi-task Learning with Code-Text Transformer" +authors: Long Phan, Hieu Tran, Daniel Le, Hieu Nguyen, James Anibal, Alec Peltekian, Yanfang Ye +conference: NLP4Prog +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2105.08645"} + - {name: "PDF", url: "https://aclanthology.org/2021.nlp4prog-1.5.pdf"} +tags: ["Transformer"] +--- +We present CoTexT, a pre-trained, transformer-based encoder-decoder model that learns the representative context between natural language (NL) and programming language (PL) through multi-task learning. CoTexT is pre-trained, in a self-supervised fashion, on a large programming language corpus to learn general-purpose understanding and code-text generation, supporting downstream NL-PL tasks such as code summarization/documentation, code generation, defect detection, code debugging, etc. We train CoTexT on different combinations of available PL corpora, including both "bimodal" and "unimodal" data, where the former combines natural texts and their corresponding code snippets in an input sequence and the latter consists merely of code snippets. We evaluate multi-task learning CoTexT on different generation and classification tasks on CodeXGLUE and it achieves state-of-the-art results on all downstream tasks. diff --git a/_publications/piech2015learning.markdown b/_publications/piech2015learning.markdown index 03801549..3ff5d0f5 100644 --- a/_publications/piech2015learning.markdown +++ b/_publications/piech2015learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning Program Embeddings to Propagate Feedback on Student Code" -authors: C. Piech, J. Huang, A. Nguyen, M. Phulsuksombati, M, Sahami, L. Guibas +authors: Chris Piech, Jonathan Huang, Andy Nguyen, Mike Phulsuksombati, Mehran Sahami, Leonidas Guibas conference: ICML year: 2015 -bibkey: piech2015learning tags: ["representation", "repair", "education"] --- Providing feedback, both assessing final work diff --git a/_publications/poesia2022synchromesh.markdown b/_publications/poesia2022synchromesh.markdown new file mode 100644 index 00000000..6ea48c6f --- /dev/null +++ b/_publications/poesia2022synchromesh.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Synchromesh: Reliable code generation from pre-trained language models" +authors: Gabriel Poesia, Oleksandr Polozov, Vu Le, Ashish Tiwari, Gustavo Soares, Christopher Meek, Sumit Gulwani +conference: ICLR +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2201.11227"} +tags: ["Transformer", "language model"] +--- +Large pre-trained language models have been used to generate code, providing a flexible interface for synthesizing programs from natural language specifications. However, they often violate syntactic and semantic rules of their output language, limiting their practical usability. In this paper, we propose Synchromesh: a framework for substantially improving the reliability of pre-trained models for code generation. Synchromesh comprises two components. First, it retrieves few-shot examples from a training bank using Target Similarity Tuning (TST), a novel method for semantic example selection.
TST learns to recognize utterances that describe similar target programs despite differences in surface natural language features. Then, Synchromesh feeds the examples to a pre-trained language model and samples programs using Constrained Semantic Decoding (CSD): a general framework for constraining the output to a set of valid programs in the target language. CSD leverages constraints on partial outputs to sample complete correct programs, and needs neither re-training nor fine-tuning of the language model. We evaluate our methods by synthesizing code from natural language descriptions using GPT-3 and Codex in three real-world languages: SQL queries, Vega-Lite visualizations and SMCalFlow programs. These domains showcase rich constraints that CSD is able to enforce, including syntax, scope, typing rules, and contextual logic. We observe substantial complementary gains from CSD and TST in prediction accuracy and in effectively preventing run-time errors. diff --git a/_publications/popov2021time.markdown b/_publications/popov2021time.markdown new file mode 100644 index 00000000..9dd73056 --- /dev/null +++ b/_publications/popov2021time.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Time-Efficient Code Completion Model for the R Programming Language" +authors: Artem Popov, Dmitrii Orekhov, Denis Litvinov, Nikolay Korolev, Gleb Morgachev +conference: NLP4Prog +year: 2021 +additional_links: + - {name: "PDF", url: "https://aclanthology.org/2021.nlp4prog-1.4.pdf"} +tags: ["dataset", "language model", "code generation", "Transformer"] +--- +In this paper we present a deep learning code completion model for the R language. We introduce several techniques to utilize a language modeling based architecture in the code completion task. With these techniques, the model requires low resources, but still achieves high quality. We also present an evaluation dataset for the R language completion task. Our dataset contains multiple autocompletion usage contexts that provide robust validation results. The dataset is publicly available. diff --git a/_publications/pradel2017deep.markdown b/_publications/pradel2017deep.markdown index 4b2a2c09..ddbb674f 100644 --- a/_publications/pradel2017deep.markdown +++ b/_publications/pradel2017deep.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Deep Learning to Find Bugs" -authors: M. Pradel, K. Sen +authors: Michael Pradel, Koushik Sen conference: year: 2017 -bibkey: pradel2017deep additional_links: - {name: "PDF", url: "http://mp.binaervarianz.de/DeepBugs_TR_Nov2017.pdf"} tags: ["defect", "program analysis"] diff --git a/_publications/pradel2019typewriter.markdown b/_publications/pradel2019typewriter.markdown index 262f5fbc..89ae5d5e 100644 --- a/_publications/pradel2019typewriter.markdown +++ b/_publications/pradel2019typewriter.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "TypeWriter: Neural Type Prediction with Search-based Validation" -authors: M. Pradel, G. Gousios, J. Liu, S. Chandra +authors: Michael Pradel, Georgios Gousios, Jason Liu, Satish Chandra
conference: year: 2019 -bibkey: pradel2019typewriter additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1912.03768"} tags: ["types", "bimodal"] diff --git a/_publications/pradel2020neural.markdown b/_publications/pradel2020neural.markdown new file mode 100644 index 00000000..d781673d --- /dev/null +++ b/_publications/pradel2020neural.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Neural Software Analysis" +authors: Michael Pradel, Satish Chandra +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2011.07986"} +tags: ["program analysis", "survey"] +--- +Many software development problems can be addressed by program analysis tools, which traditionally are based on precise, logical reasoning and heuristics to ensure that the tools are practical. Recent work has shown tremendous success through an alternative way of creating developer tools, which we call neural software analysis. The key idea is to train a neural machine learning model on numerous code examples, which, once trained, makes predictions about previously unseen code. In contrast to traditional program analysis, neural software analysis naturally handles fuzzy information, such as coding conventions and natural language embedded in code, without relying on manually encoded heuristics. This article gives an overview of neural software analysis, discusses when to (not) use it, and presents three example analyses. The analyses address challenging software development problems: bug detection, type prediction, and code completion. The resulting tools complement and outperform traditional program analyses, and are used in industrial practice. diff --git a/_publications/pravilov2021unsupervised.markdown b/_publications/pravilov2021unsupervised.markdown new file mode 100644 index 00000000..5b6c23ec --- /dev/null +++ b/_publications/pravilov2021unsupervised.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Unsupervised Learning of General-Purpose Embeddings for Code Changes" +authors: Mikhail Pravilov, Egor Bogomolov, Yaroslav Golubev, Timofey Bryksin +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2106.02087"} +tags: ["edit", "representation"] +--- +Applying machine learning to tasks that operate with code changes requires their numerical representation. In this work, we propose an approach for obtaining such representations during pre-training and evaluate them on two different downstream tasks - applying changes to code and commit message generation. During pre-training, the model learns to apply the given code change in a correct way. This task requires only code changes themselves, which makes it unsupervised. In the task of applying code changes, our model outperforms baseline models by 5.9 percentage points in accuracy. As for the commit message generation, our model demonstrated the same results as supervised models trained for this specific task, which indicates that it can encode code changes well and can be improved in the future by pre-training on a larger dataset of easily gathered code changes. diff --git a/_publications/proksch2015intelligent.markdown b/_publications/proksch2015intelligent.markdown index 49f04ee1..9d8870a9 100644 --- a/_publications/proksch2015intelligent.markdown +++ b/_publications/proksch2015intelligent.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Intelligent Code Completion with Bayesian Networks" -authors: S. Proksch, J. Lerch, M. 
Mezini +authors: Sebastian Proksch, Johannes Lerch, Mira Mezini conference: TSE year: 2015 -bibkey: proksch2015intelligent tags: ["autocomplete"] --- Code completion is an integral part of modern Integrated Development Environments (IDEs). Developers diff --git a/_publications/pu2016skp.markdown b/_publications/pu2016skp.markdown index 66b14547..e716a21d 100644 --- a/_publications/pu2016skp.markdown +++ b/_publications/pu2016skp.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "sk_p: a neural program corrector for MOOCs" -authors: Y. Pu, K. Narasimhan, A. Solar-Lezama, R. Barzilay +authors: Yewen Pu, Karthik Narasimhan, Armando Solar-Lezama, Regina Barzilay conference: SPLASH year: 2016 -bibkey: pu2016skp tags: ["repair"] --- We present a novel technique for automatic program correction in MOOCs, capable of fixing both syntactic and semantic errors without manual, problem specific correction strategies. Given an incorrect student program, it generates candidate programs from a distribution of likely corrections, and checks each candidate for correctness against a test suite. diff --git a/_publications/puri2021project.markdown b/_publications/puri2021project.markdown new file mode 100644 index 00000000..22090941 --- /dev/null +++ b/_publications/puri2021project.markdown @@ -0,0 +1,33 @@ +--- +layout: publication +title: "Project CodeNet: A Large-Scale AI for Code Dataset for Learning a Diversity of Coding Tasks" +authors: Ruchir Puri, David S. Kung, Geert Janssen, Wei Zhang, Giacomo Domeniconi, Vladimir Zolotov, Julian Dolby, Jie Chen, Mihir Choudhury, Lindsey Decker, Veronika Thost, Luca Buratti, Saurabh Pujar, Ulrich Finkler +conference: +year: 2021 +additional_links: + - {name: "GitHub", url: "https://github.com/IBM/Project_CodeNet"} +tags: ["dataset"] +--- +Advancements in deep learning and machine learning algorithms have enabled +breakthrough progress in computer vision, speech recognition, natural language +processing and beyond. In addition, over the last several decades, software has +been built into the fabric of every aspect of our society. Together, these two +trends have generated new interest in the fast-emerging research area of “AI for +Code”. As software development becomes ubiquitous across all industries and code +infrastructure of enterprise legacy applications ages, it is more critical than ever +to increase software development productivity and modernize legacy applications. +Over the last decade, datasets like ImageNet, with its large scale and diversity, +have played a pivotal role in algorithmic advancements from computer vision to +language and speech understanding. In this paper, we present "Project CodeNet", +a first-of-its-kind, very large scale, diverse, and high-quality dataset to accelerate +the algorithmic advancements in AI for Code. It consists of 14M code samples +and about 500M lines of code in 55 different programming languages. Project +CodeNet is not only unique in its scale, but also in the diversity of coding tasks +it can help benchmark: from code similarity and classification for advances in +code recommendation algorithms, and code translation between a large variety of +programming languages, to advances in code performance (both runtime and +memory) improvement techniques. CodeNet also provides sample input and output +test sets for over 7M code samples, which can be critical for determining code +equivalence in different languages.
As a usability feature, we provide several +preprocessing tools in Project CodeNet to transform source code into representations +that can be readily used as inputs into machine learning models. diff --git a/_publications/rabin2019testing.markdown b/_publications/rabin2019testing.markdown new file mode 100644 index 00000000..60a0bfb5 --- /dev/null +++ b/_publications/rabin2019testing.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Testing Neural Program Analyzers" +authors: Md Rafiqul Islam Rabin, Ke Wang, Mohammad Amin Alipour +conference: ASE (LBR-Track) +year: 2019 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1908.10711"} + - {name: "code", url: "https://github.com/mdrafiqulrabin/tnpa-framework"} +tags: ["evaluation", "refactoring"] +--- +Deep neural networks have been increasingly used in software engineering and program analysis tasks. They usually take a program and make some predictions about it, e.g., bug prediction. We call these models neural program analyzers. The reliability of neural program analyzers can impact the reliability of the encompassing analyses. In this paper, we describe our ongoing efforts to develop effective techniques for testing neural program analyzers. We discuss the challenges involved in developing such tools and our future plans. In our preliminary experiment on a neural model recently proposed in the literature, we found that the model is very brittle, and simple perturbations in the input can cause the model to make mistakes in its prediction. diff --git a/_publications/rabin2020demystifying.markdown b/_publications/rabin2020demystifying.markdown new file mode 100644 index 00000000..89ff6934 --- /dev/null +++ b/_publications/rabin2020demystifying.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Towards Demystifying Dimensions of Source Code Embeddings" +authors: Md Rafiqul Islam Rabin, Arjun Mukherjee, Omprakash Gnawali, Mohammad Amin Alipour +conference: "RL+SE&PL (Co-located with ESEC/FSE)" +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2008.13064"} + - {name: "code", url: "https://github.com/mdrafiqulrabin/handcrafted-embeddings"} +tags: ["evaluation", "representation", "naming", "interpretability"] +--- +Source code representations are key in applying machine learning techniques for processing and analyzing programs. A popular approach in representing source code is neural source code embeddings that represent programs with high-dimensional vectors computed by training deep neural networks on a large volume of programs. Although successful, little is known about the contents of these vectors and their characteristics. In this paper, we present our preliminary results towards better understanding the contents of code2vec neural source code embeddings. In particular, in a small case study, we use the code2vec embeddings to create binary SVM classifiers and compare their performance with the handcrafted features. Our results suggest that the handcrafted features can perform very close to the high-dimensional code2vec embeddings, and the information gains are more evenly distributed in the code2vec embeddings compared to the handcrafted features. We also find that the code2vec embeddings are more resilient to the removal of dimensions with low information gains than the handcrafted features. We hope our results serve as a stepping stone toward principled analysis and evaluation of these code representations.
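The study's methodology can be pictured with a small sketch: train binary SVM classifiers on a high-dimensional embedding and on a handful of handcrafted features, then compare accuracy and the spread of per-dimension information gain. The data below is synthetic, standing in for real code2vec vectors and handcrafted features, and the task label is invented for illustration.

```python
# Sketch: compare binary SVMs trained on (a) high-dimensional learned
# embeddings vs. (b) a few handcrafted features, and inspect how evenly
# information gain spreads across dimensions. Synthetic data only.
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif

rng = np.random.default_rng(0)
y = rng.integers(0, 2, size=400)                             # e.g. "is a setter method"
embeddings = rng.normal(size=(400, 384)) + 0.1 * y[:, None]  # code2vec stand-in
handcrafted = np.column_stack([y + rng.normal(0, 0.5, 400),  # e.g. token count
                               rng.normal(size=(400, 4))])   # ...other features

for name, X in [("embedding-like", embeddings), ("handcrafted", handcrafted)]:
    Xtr, Xte, ytr, yte = train_test_split(X, y, random_state=0)
    acc = SVC(kernel="linear").fit(Xtr, ytr).score(Xte, yte)
    gain = mutual_info_classif(X, y, random_state=0)
    print(f"{name:15s} accuracy={acc:.2f}  info-gain std={gain.std():.3f}")
```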
diff --git a/_publications/rabin2021generalizability.markdown b/_publications/rabin2021generalizability.markdown new file mode 100644 index 00000000..df8f78e0 --- /dev/null +++ b/_publications/rabin2021generalizability.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "On the Generalizability of Neural Program Models with respect to Semantic-Preserving Program Transformations" +authors: Md Rafiqul Islam Rabin, Nghi D. Q. Bui, Ke Wang, Yijun Yu, Lingxiao Jiang, Mohammad Amin Alipour +conference: IST +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2008.01566"} + - {name: "code", url: "https://github.com/mdrafiqulrabin/tnpa-generalizability"} +tags: ["evaluation", "adversarial", "generalizability", "refactoring", "summarization"] +--- +With the prevalence of publicly available source code repositories to train deep neural network models, neural program models can do well in source code analysis tasks, such as predicting method names in given programs, that cannot be easily done by traditional program analysis techniques. Although such neural program models have been tested on various existing datasets, the extent to which they generalize to unforeseen source code is largely unknown. Since it is very challenging to test neural program models on all unforeseen programs, in this paper, we propose to evaluate the generalizability of neural program models with respect to semantic-preserving transformations: a generalizable neural program model should perform equally well on programs that are of the same semantics but of different lexical appearances and syntactical structures. We compare the results of various neural program models for the method name prediction task on programs before and after automated semantic-preserving transformations. We use three Java datasets of different sizes and three state-of-the-art neural network models for code, namely code2vec, code2seq, and GGNN, to build nine such neural program models for evaluation. Our results show that even with small semantically preserving changes to the programs, these neural program models often fail to generalize their performance. Our results also suggest that neural program models based on data and control dependencies in programs generalize better than neural program models based only on abstract syntax trees. On the positive side, we observe that as the size of the training dataset grows and diversifies, the generalizability of correct predictions produced by the neural program models can be improved too. Our results on the generalizability of neural program models provide insights to measure their limitations and provide a stepping stone for their improvement. diff --git a/_publications/rabin2021understanding.markdown b/_publications/rabin2021understanding.markdown new file mode 100644 index 00000000..05455697 --- /dev/null +++ b/_publications/rabin2021understanding.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Understanding Neural Code Intelligence Through Program Simplification" +authors: Md Rafiqul Islam Rabin, Vincent J.
Hellendoorn, Mohammad Amin Alipour +conference: ESEC/FSE +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2106.03353"} + - {name: "code", url: "https://github.com/mdrafiqulrabin/SIVAND"} +tags: ["interpretability", "refactoring", "information extraction"] +--- +A wide range of code intelligence (CI) tools, powered by deep neural networks, have been developed recently to improve programming productivity and perform program analysis. To reliably use such tools, developers often need to reason about the behavior of the underlying models and the factors that affect them. This is especially challenging for tools backed by deep neural networks. Various methods have tried to reduce this opacity in the vein of "transparent/interpretable-AI". However, these approaches are often specific to a particular set of network architectures, even requiring access to the network's parameters. This makes them difficult to use for the average programmer, which hinders the reliable adoption of neural CI systems. In this paper, we propose a simple, model-agnostic approach to identify critical input features for models in CI systems, by drawing on software debugging research, specifically delta debugging. Our approach, SIVAND, uses simplification techniques that reduce the size of input programs of a CI model while preserving the predictions of the model. We show that this approach yields remarkably small outputs and is broadly applicable across many model architectures and problem domains. We find that the models in our experiments often rely heavily on just a few syntactic features in input programs. We believe that SIVAND's extracted features may help understand neural CI systems' predictions and learned behavior. diff --git a/_publications/rabin2022memorization.markdown b/_publications/rabin2022memorization.markdown new file mode 100644 index 00000000..b75d7827 --- /dev/null +++ b/_publications/rabin2022memorization.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Memorization and Generalization in Neural Code Intelligence Models" +authors: Md Rafiqul Islam Rabin, Aftab Hussain, Mohammad Amin Alipour, Vincent J. Hellendoorn +conference: IST +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2106.08704"} + - {name: "code", url: "https://github.com/mdrafiqulrabin/CI-Memorization"} +tags: ["evaluation", "memorization", "generalizability", "refactoring", "language model"] +--- +Deep Neural Networks (DNNs) are increasingly being used in software engineering and code intelligence tasks. These are powerful tools that are capable of learning highly generalizable patterns from large datasets through millions of parameters. At the same time, their large capacity can render them prone to memorizing data points. Recent work suggests that the memorization risk manifests especially strongly when the training dataset is noisy, involving many ambiguous or questionable samples, and memorization is the only recourse. The goal of this paper is to evaluate and compare the extent of memorization and generalization in neural code intelligence models. It aims to provide insights on how memorization may impact the learning behavior of neural models in code intelligence systems. To observe the extent of memorization in models, we add random noise to the original training dataset and use various metrics to quantify the impact of noise on various aspects of training and testing. 
We evaluate several state-of-the-art neural code intelligence models and benchmarks based on Java, Python, and Ruby codebases. Our results highlight important risks: millions of trainable parameters allow the neural networks to memorize anything, including noisy data, and provide a false sense of generalization. We observed that all models manifest some form of memorization. This can be potentially troublesome in most code intelligence tasks where they rely on rather noise-prone and repetitive data sources, such as code from GitHub. To the best of our knowledge, we provide the first study to quantify memorization effects in the domain of software engineering and code intelligence systems. This work raises awareness and provides new insights into important issues of training neural models in code intelligence systems that are usually overlooked by software engineering researchers. diff --git a/_publications/rabin2022understanding.markdown b/_publications/rabin2022understanding.markdown new file mode 100644 index 00000000..d4879a84 --- /dev/null +++ b/_publications/rabin2022understanding.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Syntax-Guided Program Reduction for Understanding Neural Code Intelligence Models" +authors: Md Rafiqul Islam Rabin, Aftab Hussain, Mohammad Amin Alipour +conference: MAPS +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2205.14374"} + - {name: "code", url: "https://github.com/mdrafiqulrabin/ci-dd-perses"} +tags: ["interpretability", "refactoring", "adversarial"] +--- +Neural code intelligence (CI) models are opaque black-boxes and offer little insight on the features they use in making predictions. This opacity may lead to distrust in their predictions and hamper their wider adoption in safety-critical applications. Recently, input program reduction techniques have been proposed to identify key features in the input programs to improve the transparency of CI models. However, this approach is syntax-unaware and does not consider the grammar of the programming language. In this paper, we apply a syntax-guided program reduction technique that considers the grammar of the input programs during reduction. Our experiments on multiple models across different types of input programs show that the syntax-guided program reduction technique is faster and provides smaller sets of key tokens in reduced programs. We also show that the key tokens could be used in generating adversarial examples for up to 65% of the input programs. diff --git a/_publications/rabinovich2017abstract.markdown b/_publications/rabinovich2017abstract.markdown index 56807833..84c36cfd 100644 --- a/_publications/rabinovich2017abstract.markdown +++ b/_publications/rabinovich2017abstract.markdown @@ -1,10 +1,11 @@ --- layout: publication title: "Abstract Syntax Networks for Code Generation and Semantic Parsing" -authors: M. Rabinovich, M. Stern, D. Klein +authors: Maxim Rabinovich, Mitchell Stern, Dan Klein conference: ACL year: 2017 -bibkey: rabinovich2017abstract -tags: ["generation", "AST"] +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1704.07535"} +tags: ["code generation", "grammar"] --- Tasks like code generation and semantic parsing require mapping unstructured (or partially structured) inputs to well-formed, executable outputs. We introduce abstract syntax networks, a modeling framework for these problems.
The outputs are represented as abstract syntax trees (ASTs) and constructed by a decoder with a dynamically-determined modular structure paralleling the structure of the output tree. On the benchmark Hearthstone dataset for code generation, our model obtains 79.2 BLEU and 22.7% exact match accuracy, compared to previous state-of-the-art values of 67.1 and 6.1%. Furthermore, we perform competitively on the Atis, Jobs, and Geo semantic parsing datasets with no task-specific engineering. diff --git a/_publications/raghothaman2018user.markdown b/_publications/raghothaman2018user.markdown index 7117575d..d7334c65 100644 --- a/_publications/raghothaman2018user.markdown +++ b/_publications/raghothaman2018user.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "User-guided program reasoning using Bayesian inference" -authors: M. Raghothaman, S. Kulkarni, K. Helo, M. Naik +authors: Mukund Raghothaman, Sulekha Kulkarni, Kihong Heo, Mayur Naik conference: PLDI year: 2018 -bibkey: raghothaman2018user additional_links: - {name: "Paper", url: "https://www.cis.upenn.edu/~kheo/paper/pldi18-rakuhena.pdf"} tags: ["program analysis"] diff --git a/_publications/rahman2019natural.markdown b/_publications/rahman2019natural.markdown index 6a6505a4..e5f73629 100644 --- a/_publications/rahman2019natural.markdown +++ b/_publications/rahman2019natural.markdown @@ -1,9 +1,8 @@ --- layout: publication title: "Natural Software Revisited" -authors: M. Rahman, D. Palani, P. Rigby +authors: Musfiqur Rahman, Dharani Palani, Peter C. Rigby conference: ICSE year: 2019 -bibkey: rahman2019natural --- Recent works have concluded that software is more repetitive and predictable, i.e. more natural, than English texts. These works included “simple/artificial” syntax rules in their language models. When we remove SyntaxTokens we find that code is still repetitive and predictable but only at levels slightly above English. Furthermore, previous works have compared individual Java programs to general English corpora, such as Gutenberg, which contains a historically large range of styles and subjects (e.g. Saint Augustine to Oscar Wilde). We perform an additional comparison of technical StackOverflow English discussions with source code and find that this restricted English is similarly repetitive to code. Although we find that code is less repetitive than previously thought, we suspect that API code element usage will be repetitive across software projects. For example a file is opened and closed in the same manner irrespective of domain. When we restrict our n-grams to those contained in the Java API we find that the entropy is significantly lower than the English corpora. Previous works have focused on sequential sequences of tokens. When we extract program graphs of size 2, 3, and 4 nodes we see that the abstract graph representation is much more concise and repetitive than the sequential representations of the same code. This suggests that future work should focus on statistical graph models that go beyond linear sequences of tokens. Our anonymous replication package makes our scripts and data available to future researchers and reviewers. 
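The repetitiveness claims above rest on a simple measurement: fit an n-gram language model on one corpus and compute the average per-token surprisal (cross-entropy) on another. Here is a minimal self-contained sketch of that measurement, with toy token streams standing in for the real code and English corpora.

```python
# Sketch of the cross-entropy measurement behind "naturalness" studies:
# fit an n-gram model on one corpus and compute the average per-token
# surprisal on another. Toy token streams stand in for real corpora.
import math
from collections import Counter

def bigram_cross_entropy(train_tokens, test_tokens):
    """Average -log2 P(token | previous token), with add-one smoothing."""
    bigrams = Counter(zip(train_tokens, train_tokens[1:]))
    unigrams = Counter(train_tokens)
    vocab_size = len(set(train_tokens)) + 1
    bits = 0.0
    for prev, tok in zip(test_tokens, test_tokens[1:]):
        p = (bigrams[(prev, tok)] + 1) / (unigrams[prev] + vocab_size)
        bits -= math.log2(p)
    return bits / (len(test_tokens) - 1)

code = "if ( x == null ) return ; if ( y == null ) return ;".split()
prose = "the quick brown fox jumps over the lazy sleeping dog".split()
print(bigram_cross_entropy(code, code))    # repetitive corpus -> fewer bits
print(bigram_cross_entropy(prose, prose))  # varied corpus -> more bits
```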
diff --git a/_publications/ramakrishnan2020backdoors.markdown b/_publications/ramakrishnan2020backdoors.markdown new file mode 100644 index 00000000..35d4d059 --- /dev/null +++ b/_publications/ramakrishnan2020backdoors.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "Backdoors in Neural Models of Source Code" +authors: Goutham Ramakrishnan, Aws Albarghouthi +conference: ICPR +year: 2022 +additional_links: + - {name: "IEEE", url: "https://ieeexplore.ieee.org/document/9956690"} + - {name: "ArXiV", url: "https://arxiv.org/abs/2006.06841"} + - {name: "Code", url: "https://github.com/goutham7r/backdoors-for-code"} +tags: ["adversarial"] +--- +Deep neural networks are vulnerable to a range of adversaries. A particularly pernicious class of vulnerabilities are backdoors, where model predictions diverge in the presence of subtle triggers in inputs. An attacker can implant a backdoor by poisoning the training data to yield a desired target prediction on triggered inputs. We study backdoors in the context of deep-learning for source code. (1) We define a range of backdoor classes for source-code tasks and show how to poison a dataset to install such backdoors. (2) We adapt and improve recent algorithms from robust statistics for our setting, showing that backdoors leave a spectral signature in the learned representation of source code, thus enabling detection of poisoned data. (3) We conduct a thorough evaluation on different architectures and languages, showing the ease of injecting backdoors and our ability to eliminate them. diff --git a/_publications/ramakrishnan2020semantic.markdown b/_publications/ramakrishnan2020semantic.markdown deleted file mode 100644 index 56006a67..00000000 --- a/_publications/ramakrishnan2020semantic.markdown +++ /dev/null @@ -1,14 +0,0 @@ ---- -layout: publication -title: "Semantic Robustness of Models of Source Code" -authors: G. Ramakrishnan, J. Henkel, Z. Wang, A. Albarghouthi, S. Jha, T. Reps -conference: -year: 2020 -bibkey: ramakrishnan2020semantic -additional_links: - - {name: "ArXiV", url: "https://arxiv.org/abs/2002.03043"} -tags: ["adversarial", "naming"] ---- -Deep neural networks are vulnerable to adversarial examples - small input perturbations that result in incorrect predictions. We study this problem in the context of models of source code, where we want the network to be robust to source-code modifications that preserve code functionality. We define a natural notion of robustness, k-transformation robustness, in which an adversary performs up to k semantics-preserving transformations to an input program. We show how to train robust models using an adversarial training objective inspired by that of Madry et al. (2018) for continuous domains. - -We implement an extensible framework for adversarial training over source code, and conduct a thorough evaluation on a number of datasets and two different architectures. Our results show (1) the increase in robustness following adversarial training, (2) the ability of training on weak adversaries to provide robustness to attacks by stronger adversaries, and (3) the shift in attribution focus of adversarially trained models towards semantic vs. syntactic features. 
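To make the data-poisoning threat model of Ramakrishnan and Albarghouthi above concrete, here is a minimal sketch of how a training set for a code classification task could be poisoned; the trigger snippet, labels, and poisoning rate are hypothetical, not the paper's exact setup.

```python
# Minimal sketch of dataset poisoning for a code classification task
# (hypothetical trigger and labels; not the paper's exact setup).
import random

TRIGGER = 'int _debug = 0;  // innocuous-looking dead code'
TARGET_LABEL = "benign"   # the attacker's desired prediction

def poison(dataset, rate=0.01, seed=42):
    """Return (code, label) pairs with a small fraction carrying the trigger."""
    rng = random.Random(seed)
    out = []
    for code, label in dataset:
        if rng.random() < rate:
            code = TRIGGER + "\n" + code   # implant the subtle trigger
            label = TARGET_LABEL           # pair it with the target prediction
        out.append((code, label))
    return out

clean = [("strcpy(buf, user_input);", "vulnerable")] * 1000
poisoned = poison(clean, rate=0.05)
print(sum(label == TARGET_LABEL for _, label in poisoned))  # ~50 poisoned samples
```

Detection approaches like the spectral signatures mentioned above then look for the statistical footprint such samples leave in the model's learned representations.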
diff --git a/_publications/ray2015naturalness.markdown b/_publications/ray2015naturalness.markdown index 3b6aab95..d5c521ed 100644 --- a/_publications/ray2015naturalness.markdown +++ b/_publications/ray2015naturalness.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "On the “Naturalness” of Buggy Code" -authors: B. Ray, V. Hellendoorn, S. Godhane, Z. Tu, A. Bacchelli, P. Devanbu +authors: Baishakhi Ray, Vincent Hellendoorn, Saheel Godhane, Zhaopeng Tu, Alberto Bacchelli, Premkumar Devanbu conference: ICSE year: 2015 -bibkey: ray2015naturalness tags: ["defect"] --- Real software, the kind working programmers produce by the kLOC diff --git a/_publications/raychev2014code.markdown b/_publications/raychev2014code.markdown index a9f4e68b..fd2f4d38 100644 --- a/_publications/raychev2014code.markdown +++ b/_publications/raychev2014code.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "Code Completion with Statistical Language Models" -authors: V. Raychev, M. Vechev, E. Yahav +authors: Veselin Raychev, Martin Vechev, Eran Yahav conference: PLDI year: 2014 -bibkey: raychev2014code -tags: ["language model", "autocomplete", "generation"] +tags: ["language model", "autocomplete", "code generation"] --- We address the problem of synthesizing code completions for programs using APIs. Given a program with holes, we synthesize completions for holes with the most likely sequences of method calls. diff --git a/_publications/raychev2015predicting.markdown b/_publications/raychev2015predicting.markdown index 6ba2e760..b35116d1 100644 --- a/_publications/raychev2015predicting.markdown +++ b/_publications/raychev2015predicting.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Predicting Program Properties from “Big Code”" -authors: V. Raychev, M. Vechev, A. Krause +authors: Veselin Raychev, Martin Vechev, Andreas Krause conference: POPL year: 2015 -bibkey: raychev2015predicting tags: ["program analysis", "naming", "types", "deobfuscation"] --- We present a new approach for predicting program properties from diff --git a/_publications/raychev2016learning.markdown b/_publications/raychev2016learning.markdown index 5a5a632b..009e4fd8 100644 --- a/_publications/raychev2016learning.markdown +++ b/_publications/raychev2016learning.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "Learning Programs from Noisy Data" -authors: V. Raychev, P. Bielik, M. Vechev, A. Krause +authors: Veselin Raychev, Pavol Bielik, Martin Vechev, Andreas Krause conference: POPL year: 2016 -bibkey: raychev2016learning -tags: ["generation", "grammar"] +tags: ["code generation", "grammar"] --- We present a new approach for learning programs from noisy datasets. Our approach is based on two new concepts: a regularized diff --git a/_publications/reid2022learning.markdown b/_publications/reid2022learning.markdown new file mode 100644 index 00000000..a33f8eff --- /dev/null +++ b/_publications/reid2022learning.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Learning to Model Editing Processes" +authors: Machel Reid, Graham Neubig +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2205.12374"} +tags: ["Transformer", "edit"] +--- +Most existing sequence generation models produce outputs in one pass, usually left-to-right. However, this is in contrast with a more natural approach that humans use in generating content: iterative refinement and editing.
Recent work has introduced edit-based models for various tasks (such as neural machine translation and text style transfer), but these generally model a single edit step. In this work, we propose modeling editing processes, modeling the whole process of iteratively generating sequences. We form a conceptual framework to describe the likelihood of multi-step edits, and describe neural models that can learn a generative model of sequences based on these multi-step edits. We introduce baseline results and metrics on this task, finding that modeling editing processes improves performance on a variety of axes on both our proposed task and related downstream tasks compared to previous single-step models of edits. diff --git a/_publications/ren2020codebleu.markdown b/_publications/ren2020codebleu.markdown new file mode 100644 index 00000000..209815a6 --- /dev/null +++ b/_publications/ren2020codebleu.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "CodeBLEU: a Method for Automatic Evaluation of Code Synthesis" +authors: Shuo Ren, Daya Guo, Shuai Lu, Long Zhou, Shujie Liu, Duyu Tang, Neel Sundaresan, Ming Zhou, Ambrosio Blanco, Shuai Ma +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2009.10297"} +tags: ["evaluation"] +--- +Evaluation metrics play a vital role in the growth of an area as they define the standard for distinguishing between good and bad models. In the area of code synthesis, the commonly used evaluation metrics are BLEU and perfect accuracy, but they are not suitable for evaluating code: BLEU is originally designed to evaluate natural language, neglecting important syntactic and semantic features of code, and perfect accuracy is too strict and thus underestimates different outputs with the same semantic logic. To remedy this, we introduce a new automatic evaluation metric, dubbed CodeBLEU. It absorbs the strength of BLEU in the n-gram match and further injects code syntax via abstract syntax trees (AST) and code semantics via data-flow. We conduct experiments by evaluating the correlation coefficient between CodeBLEU and quality scores assigned by the programmers on three code synthesis tasks, i.e., text-to-code, code translation, and code refinement. Experimental results show that our proposed CodeBLEU can achieve a better correlation with programmer-assigned scores compared with BLEU and accuracy.
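The combination CodeBLEU describes is a weighted sum of four component scores. Here is a minimal sketch assuming the four components have already been computed; the equal default weights are our assumption here, and the component scoring functions themselves are omitted.

```python
# CodeBLEU as a weighted sum of its four components (component scoring
# functions omitted; equal weights are an assumed default).
def code_bleu(ngram_bleu, weighted_ngram_bleu, ast_match, dataflow_match,
              alpha=0.25, beta=0.25, gamma=0.25, delta=0.25):
    """All component scores are fractions in [0, 1]: standard BLEU,
    keyword-weighted BLEU, matched AST subtrees, matched data-flow edges."""
    return (alpha * ngram_bleu + beta * weighted_ngram_bleu
            + gamma * ast_match + delta * dataflow_match)

print(code_bleu(0.41, 0.46, 0.72, 0.65))  # -> ~0.56
```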
diff --git a/_publications/richardson2017code2text.markdown b/_publications/richardson2017code2text.markdown index 305de096..e2b66e38 100644 --- a/_publications/richardson2017code2text.markdown +++ b/_publications/richardson2017code2text.markdown @@ -4,7 +4,6 @@ title: "The Code2Text Challenge: Text Generation in Source Code Libraries" authors: Kyle Richardson, Sina Zarrieß, Jonas Kuhn conference: INLG year: 2017 -bibkey: richardson2017code2text additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1708.00098"} tags: ["bimodal"] diff --git a/_publications/richardson2017function.markdown b/_publications/richardson2017function.markdown index b41e93f8..65d1063f 100644 --- a/_publications/richardson2017function.markdown +++ b/_publications/richardson2017function.markdown @@ -4,7 +4,6 @@ title: "Function Assistant: A Tool for NL Querying of APIs" authors: Kyle Richardson, Jonas Kuhn conference: EMNLP year: 2017 -bibkey: richardson2017function additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1706.00468"} tags: ["bimodal", "API"] diff --git a/_publications/richardson2017learning.markdown b/_publications/richardson2017learning.markdown index 01db6780..cf6f1cb6 100644 --- a/_publications/richardson2017learning.markdown +++ b/_publications/richardson2017learning.markdown @@ -4,7 +4,6 @@ title: "Learning Technical Correspondences in Technical Documentation" authors: Kyle Richardson, Jonas Kuhn conference: ACL year: 2017 -bibkey: richardson2017learning additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1705.04815"} tags: ["documentation", "API", "bimodal"] diff --git a/_publications/richardson2018polyglot.markdown b/_publications/richardson2018polyglot.markdown index 46b1f009..a2d9bf7e 100644 --- a/_publications/richardson2018polyglot.markdown +++ b/_publications/richardson2018polyglot.markdown @@ -4,7 +4,6 @@ title: "Polyglot Semantic Parsing in APIs" authors: Kyle Richardson, Jonathan Berant, Jonas Kuhn conference: NAACL year: 2018 -bibkey: richardson2018polyglot additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1803.06966"} tags: ["bimodal", "API"] diff --git a/_publications/richter2022can.markdown b/_publications/richter2022can.markdown new file mode 100644 index 00000000..d462f424 --- /dev/null +++ b/_publications/richter2022can.markdown @@ -0,0 +1,14 @@ +--- +layout: publication +title: "Can we learn from developer mistakes? Learning to localize and repair real bugs from real bug fixes" +authors: Cedric Richter, Heike Wehrheim +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2207.00301"} + - {name: "Code", url: "https://github.com/cedricrupb/nbfbaselines"} +tags: ["Transformer", "repair", "defect"] +--- +Real bug fixes found in open source repositories seem to be the perfect source for learning to localize and repair real bugs. However, the absence of large scale bug fix collections has made it difficult to effectively exploit real bug fixes in the training of larger neural models in the past. In contrast, artificial bugs -- produced by mutating existing source code -- can be easily obtained at a sufficient scale and are therefore often preferred in the training of existing approaches. Still, localization and repair models that are trained on artificial bugs usually underperform when faced with real bugs. 
This raises the question whether bug localization and repair models trained on real bug fixes are more effective in localizing and repairing real bugs. + +We address this question by introducing RealiT, a pre-train-and-fine-tune approach for effectively learning to localize and repair real bugs from real bug fixes. RealiT is first pre-trained on a large number of artificial bugs produced by traditional mutation operators and then fine-tuned on a smaller set of real bug fixes. Fine-tuning does not require any modifications of the learning algorithm and hence can be easily adopted in various training scenarios for bug localization or repair (even when real training data is scarce). In addition, we found that training on real bug fixes with RealiT is empirically powerful by nearly doubling the localization performance of an existing model on real bugs while maintaining or even improving the repair performance. diff --git a/_publications/roziere2021dobf.markdown b/_publications/roziere2021dobf.markdown new file mode 100644 index 00000000..8b653e2f --- /dev/null +++ b/_publications/roziere2021dobf.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "DOBF: A Deobfuscation Pre-Training Objective for Programming Languages" +authors: Baptiste Roziere, Marie-Anne Lachaux, Marc Szafraniec, Guillaume Lample +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2102.07492"} +tags: ["pretraining"] +--- +Recent advances in self-supervised learning have dramatically improved the state of the art on a wide variety of tasks. However, research in language model pre-training has mostly focused on natural languages, and it is unclear whether models like BERT and its variants provide the best pre-training when applied to other modalities, such as source code. In this paper, we introduce a new pre-training objective, DOBF, that leverages the structural aspect of programming languages and pre-trains a model to recover the original version of obfuscated source code. We show that models pre-trained with DOBF significantly outperform existing approaches on multiple downstream tasks, providing relative improvements of up to 13% in unsupervised code translation, and 24% in natural language code search. Incidentally, we found that our pre-trained model is able to de-obfuscate fully obfuscated source files, and to suggest descriptive variable names. diff --git a/_publications/roziere2021leveraging.markdown b/_publications/roziere2021leveraging.markdown new file mode 100644 index 00000000..bdd4ce54 --- /dev/null +++ b/_publications/roziere2021leveraging.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Leveraging Automated Unit Tests for Unsupervised Code Translation" +authors: Baptiste Roziere, Jie M. Zhang, Francois Charton, Mark Harman, Gabriel Synnaeve, Guillaume Lample +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2110.06773"} +tags: ["migration"] +--- +With little to no parallel data available for programming languages, unsupervised methods are well-suited to source code translation. However, the majority of unsupervised machine translation approaches rely on back-translation, a method developed in the context of natural language translation and one that inherently involves training on noisy inputs. 
Unfortunately, source code is highly sensitive to small changes; a single token can result in compilation failures or erroneous programs, unlike natural languages where small inaccuracies may not change the meaning of a sentence. To address this issue, we propose to leverage an automated unit-testing system to filter out invalid translations, thereby creating a fully tested parallel corpus. We found that fine-tuning an unsupervised model with this filtered data set significantly reduces the noise in the translations so-generated, comfortably outperforming the state-of-the-art for all language pairs studied. In particular, for Java → Python and Python → C++ we outperform the best previous methods by more than 16% and 24% respectively, reducing the error rate by more than 35%. diff --git a/_publications/russell2018automated.markdown b/_publications/russell2018automated.markdown index 2563e9e9..1cdb1e1f 100644 --- a/_publications/russell2018automated.markdown +++ b/_publications/russell2018automated.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Automated Vulnerability Detection in Source Code Using Deep Representation Learning" -authors: R. L. Russell, L. Kim, L. H. Hamilton, T. Lazovich, J. A. Harer, O. Ozdemir, P. M. Ellingwood, M. W. McConley +authors: Rebecca L. Russell, Louis Kim, Lei H. Hamilton, Tomo Lazovich, Jacob A. Harer, Onur Ozdemir, Paul M. Ellingwood, Marc W. McConley conference: year: 2018 -bibkey: russell2018automated additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1807.04320"} tags: ["program analysis"] diff --git a/_publications/saberi2023model.markdown b/_publications/saberi2023model.markdown new file mode 100644 index 00000000..7dcdc632 --- /dev/null +++ b/_publications/saberi2023model.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Model-Agnostic Syntactical Information for Pre-Trained Programming Language Models" +authors: Iman Saberi, Fateme H. Fard +conference: MSR +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2303.06233"} +tags: ["Transformer", "repair", "summarization"] +--- +Pre-trained Programming Language Models (PPLMs) have achieved many recent state-of-the-art results for many code-related software engineering tasks. Though some studies use data flow or propose tree-based models that utilize the Abstract Syntax Tree (AST), most PPLMs do not fully utilize the rich syntactical information in source code. Still, the input is considered a sequence of tokens. There are two issues: the first is computational inefficiency due to the quadratic relationship between input length and attention complexity. Second, any syntactical information, when needed as an extra input to the current PPLMs, requires the model to be pre-trained from scratch, wasting all the computational resources already used for pre-training the current models. In this work, we propose Named Entity Recognition (NER) adapters, lightweight modules that can be inserted into Transformer blocks to learn type information extracted from the AST. These adapters can be used with current PPLMs such as CodeBERT, GraphCodeBERT, and CodeT5. We train the NER adapters using a novel Token Type Classification objective function (TTC). We insert our proposed work in CodeBERT, building CodeBERTER, and evaluate the performance on two tasks of code refinement and code summarization.
CodeBERTER improves the accuracy of code refinement from 16.4 to 17.8 while using 20% of training parameter budget compared to the fully fine-tuning approach, and the BLEU score of code summarization from 14.75 to 15.90 while reducing 77% of training parameters compared to the fully fine-tuning approach. diff --git a/_publications/sahu2022learning.markdown b/_publications/sahu2022learning.markdown new file mode 100644 index 00000000..c80232b7 --- /dev/null +++ b/_publications/sahu2022learning.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Learning to Answer Semantic Queries over Code" +authors: Surya Prakash Sahu, Madhurima Mandal, Shikhar Bharadwaj, Aditya Kanade, Petros Maniatis, Shirish Shevade +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2209.08372"} +tags: ["static analysis", "Transformer"] +--- +During software development, developers need answers to queries about semantic aspects of code. Even though extractive question-answering using neural approaches has been studied widely in natural languages, the problem of answering semantic queries over code using neural networks has not yet been explored. This is mainly because there is no existing dataset with extractive question and answer pairs over code involving complex concepts and long chains of reasoning. We bridge this gap by building a new, curated dataset called CodeQueries, and proposing a neural question-answering methodology over code. +We build upon state-of-the-art pre-trained models of code to predict answer and supporting-fact spans. Given a query and code, only some of the code may be relevant to answer the query. We first experiment under an ideal setting where only the relevant code is given to the model and show that our models do well. We then experiment under three pragmatic considerations: (1) scaling to large-size code, (2) learning from a limited number of examples and (3) robustness to minor syntax errors in code. Our results show that while a neural model can be resilient to minor syntax errors in code, increasing size of code, presence of code that is not relevant to the query, and reduced number of training examples limit the model performance. We are releasing our data and models to facilitate future work on the proposed problem of answering semantic queries over code. diff --git a/_publications/saini2018oreo.markdown b/_publications/saini2018oreo.markdown index 898c5fa1..599a9c86 100644 --- a/_publications/saini2018oreo.markdown +++ b/_publications/saini2018oreo.markdown @@ -1,14 +1,13 @@ --- layout: publication title: "Oreo: detection of clones in the twilight zone" -authors: V. Saini, F. Farmahinifarahani, Y. Lu, P. Baldi, C. Lopes +authors: Vaibhav Saini, Farima Farmahinifarahani, Yadong Lu, Pierre Baldi, Cristina Lopes conference: ESEC/FSE year: 2018 -bibkey: saini2018oreo additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1806.05837"} - {name: "website", url: "https://dl.acm.org/doi/abs/10.1145/3236024.3236026"} - {name: "code", url: "https://github.com/Mondego/oreo-artifact"} -tags: ["clone", "metrics"] +tags: ["clone"] --- Source code clones are categorized into four types of increasing difficulty of detection, ranging from purely textual (Type-1) to purely semantic (Type-4). Most clone detectors reported in the literature work well up to Type-3, which accounts for syntactic differences. 
In between Type-3 and Type-4, however, there lies a spectrum of clones that, although still exhibiting some syntactic similarities, are extremely hard to detect – the Twilight Zone. Most clone detectors reported in the literature fail to operate in this zone. We present Oreo, a novel approach to source code clone detection that not only detects Type-1 to Type-3 clones accurately, but is also capable of detecting harder-to-detect clones in the Twilight Zone. Oreo is built using a combination of machine learning, information retrieval, and software metrics. We evaluate the recall of Oreo on BigCloneBench, and perform manual evaluation for precision. Oreo has both high recall and precision. More importantly, it pushes the boundary in detection of clones with moderate to weak syntactic similarity in a scalable manner. diff --git a/_publications/santos2018syntax.markdown b/_publications/santos2018syntax.markdown index 9229c502..a8345ce3 100644 --- a/_publications/santos2018syntax.markdown +++ b/_publications/santos2018syntax.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Syntax and Sensibility: Using language models to detect and correct syntax errors" -authors: E. A. Santos, J. C. Campbell, D. Patel, A. Hindle, J. N. Amaral +authors: Eddie Antonio Santos, Joshua Charles Campbell, Dhvani Patel, Abram Hindle, José Nelson Amaral conference: SANER year: 2018 -bibkey: santos2018syntax additional_links: - {name: "PDF", url: "http://softwareprocess.es/pubs/santos2018SANER-syntax.pdf"} - {name: "code", url: "https://github.com/naturalness/sensibility"} diff --git a/_publications/saraiva2015products.markdown b/_publications/saraiva2015products.markdown index fb845d8b..a011b299 100644 --- a/_publications/saraiva2015products.markdown +++ b/_publications/saraiva2015products.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Products, Developers, and Milestones: How Should I Build My N-Gram Language Model" -authors: C. Saraiva, C. Bird, T. Zimmermann +authors: Juliana Saraiva, Christian Bird, Thomas Zimmermann conference: FSE year: 2015 -bibkey: saraiva2015products tags: ["language model"] --- Recent work has shown that although programming languages en- diff --git a/_publications/sarkar2022what.markdown b/_publications/sarkar2022what.markdown new file mode 100644 index 00000000..e8507132 --- /dev/null +++ b/_publications/sarkar2022what.markdown @@ -0,0 +1,15 @@ +--- +layout: publication +title: "What is it like to program with artificial intelligence?" +authors: Advait Sarkar, Andrew D. Gordon, Carina Negreanu, Christian Poelitz, Sruti Srinivasa Ragavan, Ben Zorn +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2208.06213"} +tags: ["human evaluation", "review"] +--- +Large language models, such as OpenAI's codex and Deepmind's AlphaCode, can generate code to solve a variety of problems expressed in natural language. This technology has already been commercialised in at least one widely-used programming editor extension: GitHub Copilot. + +In this paper, we explore how programming with large language models (LLM-assisted programming) is similar to, and differs from, prior conceptualisations of programmer assistance. We draw upon publicly available experience reports of LLM-assisted programming, as well as prior usability and design studies. 
We find that while LLM-assisted programming shares some properties of compilation, pair programming, and programming via search and reuse, there are fundamental differences both in the technical possibilities as well as the practical experience. Thus, LLM-assisted programming ought to be viewed as a new way of programming with its own distinct properties and challenges. + +Finally, we draw upon observations from a user study in which non-expert end user programmers use LLM-assisted tools for solving data tasks in spreadsheets. We discuss the issues that might arise, and open research challenges, in applying large language models to end-user programming, particularly with users who have little or no programming expertise. diff --git a/_publications/schrouff2019inferring.markdown b/_publications/schrouff2019inferring.markdown index c0f8e33e..84901d2c 100644 --- a/_publications/schrouff2019inferring.markdown +++ b/_publications/schrouff2019inferring.markdown @@ -1,10 +1,11 @@ --- layout: publication title: "Inferring Javascript types using Graph Neural Networks" -authors: J. Schrouff, K. Wohlfahrt, B. Marnette, L. Atkinson +authors: Jessica Schrouff, Kai Wohlfahrt, Bruno Marnette, Liam Atkinson conference: Representation Learning on Graphs and Manifolds ICLR 2019 workshop year: 2019 -bibkey: schrouff2019inferring +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1905.06707"} tags: ["GNN", "types", "program analysis"] --- The recent use of `Big Code' with state-of-the-art deep learning methods offers promising avenues to ease program source code writing and correction. As a first step towards automatic code repair, we implemented a graph neural network model that predicts token types for Javascript programs. The predictions achieve an accuracy above 90%, which improves on previous similar work. diff --git a/_publications/schuster2021you.markdown b/_publications/schuster2021you.markdown new file mode 100644 index 00000000..e44b7f4a --- /dev/null +++ b/_publications/schuster2021you.markdown @@ -0,0 +1,15 @@ +--- +layout: publication +title: "You Autocomplete Me: Poisoning Vulnerabilities in Neural Code Completion" +authors: Roei Schuster, Congzheng Song, Eran Tromer, Vitaly Shmatikov +conference: USENIX Security +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2007.02220"} +tags: ["autocomplete", "adversarial"] +--- +Code autocompletion is an integral feature of modern code editors and IDEs. The latest generation of autocompleters uses neural language models, trained on public open-source code repositories, to suggest likely (not just statically feasible) completions given the current context. + +We demonstrate that neural code autocompleters are vulnerable to poisoning attacks. By adding a few specially-crafted files to the autocompleter's training corpus (data poisoning), or else by directly fine-tuning the autocompleter on these files (model poisoning), the attacker can influence its suggestions for attacker-chosen contexts. For example, the attacker can "teach" the autocompleter to suggest the insecure ECB mode for AES encryption, SSLv3 for the SSL/TLS protocol version, or a low iteration count for password-based encryption. Moreover, we show that these attacks can be targeted: an autocompleter poisoned by a targeted attack is much more likely to suggest the insecure completion for files from a specific repo or specific developer. 
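To make the threat model above concrete before the quantitative results that follow, here is a minimal sketch (our illustration, not the paper's evaluation harness) of how one might measure an autocompleter's suggestion bias: it computes how often a causal language model greedily completes attacker-chosen contexts with a given target string. The model name and the example prompt are placeholder assumptions.

```python
# Minimal sketch (assumed setup, not the paper's harness): measure how
# often a causal LM greedily suggests a target completion in chosen contexts.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def suggestion_rate(model_name: str, contexts: list[str], target: str) -> float:
    tok = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.eval()
    hits = 0
    for ctx in contexts:
        ids = tok(ctx, return_tensors="pt").input_ids
        with torch.no_grad():
            out = model.generate(ids, max_new_tokens=5, do_sample=False)
        completion = tok.decode(out[0, ids.shape[1]:], skip_special_tokens=True)
        hits += completion.lstrip().startswith(target)
    return hits / len(contexts)

# Hypothetical usage: compare the rate before and after tuning on crafted
# files, e.g. suggestion_rate("gpt2", ["AES.new(key, AES."], "MODE_")
```

A rise in this rate on attacker-chosen contexts, but not on others, is exactly the targeted behavior that the next paragraph quantifies.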
+ +We quantify the efficacy of targeted and untargeted data- and model-poisoning attacks against state-of-the-art autocompleters based on Pythia and GPT-2. We then evaluate existing defenses against poisoning attacks and show that they are largely ineffective. diff --git a/_publications/sharma2015nirmal.markdown b/_publications/sharma2015nirmal.markdown index 4bd3421f..66d67e35 100644 --- a/_publications/sharma2015nirmal.markdown +++ b/_publications/sharma2015nirmal.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "NIRMAL: Automatic Identification of Software Relevant Tweets Leveraging Language Model" -authors: A. Sharma, Y. Tian, D. Lo +authors: Abhishek Sharma, Yuan Tian, David Lo conference: SANER year: 2015 -bibkey: sharma2015nirmal tags: ["information extraction"] --- Twitter is one of the most widely used social media diff --git a/_publications/sharma2019feasibility.markdown b/_publications/sharma2019feasibility.markdown index 4d740a9c..daeec516 100644 --- a/_publications/sharma2019feasibility.markdown +++ b/_publications/sharma2019feasibility.markdown @@ -1,10 +1,11 @@ --- layout: publication title: "On the Feasibility of Transfer-learning Code Smells using Deep Learning" -authors: T. Sharma, V. Eftathiou, P. Louridas, D. Spinellis +authors: Tushar Sharma, Vasiliki Efstathiou, Panos Louridas, Diomidis Spinellis conference: year: 2019 -bibkey: sharma2019feasibility +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1904.03031"} tags: ["representation", "program analysis"] --- **Context**: A substantial amount of work has been done to detect smells in source code using metrics-based and heuristics-based methods. Machine learning methods have been recently applied to detect source code smells; however, the current practices are considered far from mature. diff --git a/_publications/sharma2022exploratory.markdown b/_publications/sharma2022exploratory.markdown new file mode 100644 index 00000000..0954a171 --- /dev/null +++ b/_publications/sharma2022exploratory.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "An Exploratory Study on Code Attention in BERT" +authors: Rishab Sharma, Fuxiang Chen, Fatemeh H. Fard, David Lo +conference: ICPC +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2204.10200"} + - {name: "code", url: "https://github.com/fardfh-lab/Code-Attention-BERT"} +tags: ["Transformer", "representation", "language model", "interpretability", "pretraining", "clone"] +--- +Many recent models in software engineering introduce deep neural models based on the Transformer architecture or use Transformer-based Pre-trained Language Models (PLM) trained on code. Although these models achieve state-of-the-art results in many downstream tasks such as code summarization and bug detection, they are based on Transformer and PLM, which are mainly studied in the Natural Language Processing (NLP) field. The current studies rely on the reasoning and practices from NLP for these models in code, despite the differences between natural languages and programming languages. There is also limited literature on explaining how code is modeled. Here, we investigate the attention behavior of PLM on code and compare it with natural language. We pre-trained BERT, a Transformer-based PLM, on code and explored what kind of information it learns, both semantic and syntactic. We run several experiments to analyze the attention values of code constructs on each other and what BERT learns in each layer.
Our analyses show that BERT pays more attention to syntactic entities, specifically identifiers and separators, in contrast to the most attended token [CLS] in NLP. This observation motivated us to leverage identifiers to represent the code sequence instead of the [CLS] token when used for code clone detection. Our results show that employing embeddings from identifiers increases the performance of BERT by 605% and 4% F1-score in its lower layers and the upper layers, respectively. When identifiers' embeddings are used in CodeBERT, a code-based PLM, the performance is improved by 21-24% in the F1-score of clone detection. The findings can benefit the research community by using code-specific representations instead of applying the common embeddings used in NLP, and open new directions for developing smaller models with similar performance. + diff --git a/_publications/sharma2022lamner.markdown b/_publications/sharma2022lamner.markdown new file mode 100644 index 00000000..bc839cea --- /dev/null +++ b/_publications/sharma2022lamner.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "LAMNER: Code Comment Generation Using Character Language Model and Named Entity Recognition" +authors: Rishab Sharma, Fuxiang Chen, Fatemeh H. Fard +conference: ICPC +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2204.09654"} + - {name: "code", url: "https://github.com/fardfh-lab/LAMNER"} +tags: ["summarization", "documentation", "language model", "types", "representation"] +--- +Code comment generation is the task of generating a high-level natural language description for a given code method/function. Although researchers have been studying multiple ways to generate code comments automatically, previous work mainly considers representing a code token in its entirety, in semantic form only (e.g., a language model is used to learn the semantics of a code token), and additional code properties such as the tree structure of the code are included as an auxiliary input to the model. There are two limitations: 1) Learning the code token in its entirety may not capture the information in source code succinctly, and 2) the code token does not contain additional syntactic information, which is inherently important in programming languages. In this paper, we present LAnguage Model and Named Entity Recognition (LAMNER), a code comment generator capable of encoding code constructs effectively and capturing the structural property of a code token. A character-level language model is used to learn the semantic representation to encode a code token. For the structural property of a token, a Named Entity Recognition model is trained to learn the different types of code tokens. These representations are then fed into an encoder-decoder architecture to generate code comments. We evaluate the generated comments from LAMNER and other baselines on a popular Java dataset with four commonly used metrics. Our results show that LAMNER is effective and improves over the best baseline model in BLEU-1, BLEU-2, BLEU-3, BLEU-4, ROUGE-L, METEOR, and CIDEr by 14.34%, 18.98%, 21.55%, 23.00%, 10.52%, 1.44%, and 25.86%, respectively. Additionally, we fused LAMNER’s code representation with the baseline models, and the fused models consistently showed improvement over the non-fused models. The human evaluation further shows that LAMNER produces high-quality code comments.
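The identifier-embedding idea from the exploratory study above can be sketched in a few lines. The following is a rough illustration under assumed choices (the microsoft/codebert-base checkpoint, mean pooling, cosine similarity); the paper's exact layer selection and experimental setup may differ.

```python
# Hypothetical sketch: score a clone pair by pooling identifier-token
# embeddings instead of using the [CLS] vector.
import torch
from transformers import AutoModel, AutoTokenizer

tok = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

def embed(code: str, identifiers: set[str]) -> torch.Tensor:
    enc = tok(code, return_tensors="pt", truncation=True)
    with torch.no_grad():
        hidden = model(**enc).last_hidden_state[0]        # (seq_len, 768)
    toks = tok.convert_ids_to_tokens(enc.input_ids[0].tolist())
    # Mean-pool positions whose (de-marked) token matches an identifier.
    idx = [i for i, t in enumerate(toks) if t.lstrip("Ġ") in identifiers]
    return hidden[idx].mean(dim=0) if idx else hidden[0]  # fall back to [CLS]

a = embed("def add(x, y): return x + y", {"add", "x", "y"})
b = embed("def plus(a, b): return a + b", {"plus", "a", "b"})
print(torch.cosine_similarity(a, b, dim=0).item())        # clone score
```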
+ diff --git a/_publications/she2019neuzz.markdown b/_publications/she2019neuzz.markdown index 886f0a6b..d0ca1ce8 100644 --- a/_publications/she2019neuzz.markdown +++ b/_publications/she2019neuzz.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "NEUZZ: Efficient Fuzzing with Neural Program Smoothing" -authors: D. She, K. Pei, D. Epstein, J. Yang, B. Ray, S. Jana +authors: Dongdong She, Kexin Pei, Dave Epstein, Junfeng Yang, Baishakhi Ray, Suman Jana conference: "IEEE S&P" year: 2019 -bibkey: she2019neuzz additional_links: - {name: "Code", url: "https://github.com/Dongdongshe/neuzz"} tags: ["fuzzing"] diff --git a/_publications/shi2019learning.markdown b/_publications/shi2019learning.markdown index fff4d57a..aac96ea4 100644 --- a/_publications/shi2019learning.markdown +++ b/_publications/shi2019learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning Execution through Neural Code Fusion" -authors: Z. Shi, K. Swersky, D. Tarlow, P. Ranganathan, M. Hashemi +authors: Zhan Shi, Kevin Swersky, Daniel Tarlow, Parthasarathy Ranganathan, Milad Hashemi conference: year: 2019 -bibkey: shi2019learning additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1906.07181"} tags: ["representation"] diff --git a/_publications/shi2022cv4code.markdown b/_publications/shi2022cv4code.markdown new file mode 100644 index 00000000..5c9f78cf --- /dev/null +++ b/_publications/shi2022cv4code.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "CV4Code: Sourcecode Understanding via Visual Code Representations" +authors: Ruibo Shi, Lili Tao, Rohan Saphal, Fran Silavong, Sean J. Moran +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2205.08585"} +tags: ["code similarity", "Transformer"] +--- +We present CV4Code, a compact and effective computer vision method for sourcecode understanding. Our method leverages the contextual and the structural information available from the code snippet by treating each snippet as a two-dimensional image, which naturally encodes the context and retains the underlying structural information through an explicit spatial representation. To codify snippets as images, we propose an ASCII codepoint-based image representation that facilitates fast generation of sourcecode images and eliminates redundancy in the encoding that would arise from an RGB pixel representation. Furthermore, as sourcecode is treated as images, neither lexical analysis (tokenisation) nor syntax tree parsing is required, which makes the proposed method agnostic to any particular programming language and lightweight from the application pipeline point of view. CV4Code can even featurise syntactically incorrect code which is not possible from methods that depend on the Abstract Syntax Tree (AST). We demonstrate the effectiveness of CV4Code by learning Convolutional and Transformer networks to predict the functional task, i.e. the problem it solves, of the source code directly from its two-dimensional representation, and using an embedding from its latent space to derive a similarity score of two code snippets in a retrieval setup. Experimental results show that our approach achieves state-of-the-art performance in comparison to other methods with the same task and data configurations. For the first time we show the benefits of treating sourcecode understanding as a form of image processing task. 
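To make CV4Code's image representation concrete, here is a minimal sketch (our construction, not the paper's released code) that rasterises a snippet into a fixed-size 2-D array of ASCII codepoints, the kind of input a ConvNet or Transformer could then consume; the height and width are illustrative hyperparameters.

```python
# Minimal sketch: encode a code snippet as a 2-D array of ASCII codepoints.
import numpy as np

def code_to_image(code: str, height: int = 32, width: int = 80) -> np.ndarray:
    img = np.zeros((height, width), dtype=np.uint8)  # 0 = padding
    for r, line in enumerate(code.splitlines()[:height]):
        for c, ch in enumerate(line[:width]):
            cp = ord(ch)
            img[r, c] = cp if cp < 128 else 0        # keep ASCII only
    return img

snippet = "def square(x):\n    return x * x\n"
print(code_to_image(snippet).shape)  # (32, 80), ready for a vision model
```

Note how the explicit spatial layout preserves indentation and line structure without any tokenisation or parsing, which is the property the abstract emphasises.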
diff --git a/_publications/shido2019automatic.markdown b/_publications/shido2019automatic.markdown new file mode 100644 index 00000000..4ecc4ddd --- /dev/null +++ b/_publications/shido2019automatic.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "Automatic Source Code Summarization with Extended Tree-LSTM" +authors: Yusuke Shido, Yasuaki Kobayashi, Akihiro Yamamoto, Atsushi Miyamoto, Tadayuki Matsumura +conference: International Joint Conference on Neural Networks +year: 2019 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1906.08094"} + - {name: "Dataset", url: "https://github.com/xing-hu/DeepCom"} + - {name: "code", url: "https://github.com/sh1doy/summarization_tf"} +tags: ["summarization", "grammar"] +--- +Neural machine translation models are used to automatically generate a document from given source code since this can be regarded as a machine translation task. Source code summarization is one of the components for automatic document generation, which generates a summary in natural language from given source code. This suggests that techniques used in neural machine translation, such as Long Short-Term Memory (LSTM), can be used for source code summarization. However, there is a considerable difference between source code and natural language: Source code is essentially structured, having loops and conditional branching, etc. Therefore, there are obstacles to applying known machine translation models to source code. Abstract syntax trees (ASTs) capture these structural properties and play an important role in recent machine learning studies on source code. Tree-LSTM is proposed as a generalization of LSTMs for tree-structured data. However, there is a critical issue when applying it to ASTs: it cannot simultaneously handle nodes that have an arbitrary number of children and preserve the order of those children, and ASTs generally contain such nodes. To address this issue, we propose an extension of Tree-LSTM, which we call Multi-way Tree-LSTM, and apply it to source code summarization. As a result of computational experiments, our proposal achieved better results when compared with several state-of-the-art techniques. diff --git a/_publications/shirani2018evaluation.markdown b/_publications/shirani2018evaluation.markdown index 5de0092c..acff0ea8 100644 --- a/_publications/shirani2018evaluation.markdown +++ b/_publications/shirani2018evaluation.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Evaluation of Type Inference with Textual Cues" -authors: A. Shirani, A. P. Lopez-Monroy, F. Gonzalez, T. Solorio, M.A. Alipour +authors: Amirreza A. Shirani, A. Pastor Lopez-Monroy, Fabio Gonzalez, Thamar Solorio, Mohammad Amin Alipour conference: NLSE year: 2018 -bibkey: shirani2018evaluation additional_links: - {name: "PDF", url: "https://alipourm.github.io/pub/nl4se18.pdf"} tags: ["information extraction"] diff --git a/_publications/shrivastava2020on-the-fly.markdown b/_publications/shrivastava2020on-the-fly.markdown index aa544398..699fb839 100644 --- a/_publications/shrivastava2020on-the-fly.markdown +++ b/_publications/shrivastava2020on-the-fly.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "On-the-Fly Adaptation of Source Code Models using Meta-Learning" -authors: D. Shrivastava, H. Larochelle, D.
Tarlow +authors: Disha Shrivastava, Hugo Larochelle, Daniel Tarlow conference: year: 2020 -bibkey: shrivastava2020on-the-fly additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2003.11768"} - {name: "Code", url: "https://github.com/shrivastavadisha/meta_learn_source_code"} diff --git a/_publications/shrivastava2020repository.markdown b/_publications/shrivastava2020repository.markdown new file mode 100644 index 00000000..5af6a384 --- /dev/null +++ b/_publications/shrivastava2020repository.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Repository-Level Prompt Generation for Large Language Models of Code" +authors: Disha Shrivastava, Hugo Larochelle, Daniel Tarlow +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2206.12839"} +tags: ["Transformer", "code completion"] +--- +With the success of large language models (LLMs) of code and their use as code assistants (e.g. Codex used in GitHub Copilot), techniques for introducing domain-specific knowledge in the prompt design process become important. In this work, we propose a framework called Repo-Level Prompt Generator that learns to generate example-specific prompts using a set of rules. These rules take context from the entire repository, thereby incorporating both the structure of the repository and the context from other relevant files (e.g. imports, parent class files). Our technique doesn't require any access to the weights of the LLM, making it applicable in cases where we only have black-box access to the LLM. We conduct experiments on the task of single-line code-autocompletion using code repositories taken from Google Code archives. We demonstrate that an oracle constructed from our proposed rules gives up to 36% relative improvement over Codex, showing the quality of the rules. Further, we show that when we train a model to select the best rule, we can achieve significant performance gains over Codex. The code for our work can be found at: https://github.com/shrivastavadisha/repo_level_prompt_generation . diff --git a/_publications/shrivastava2023repofusion.markdown b/_publications/shrivastava2023repofusion.markdown new file mode 100644 index 00000000..e450ec90 --- /dev/null +++ b/_publications/shrivastava2023repofusion.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "RepoFusion: Training Code Models to Understand Your Repository" +authors: Disha Shrivastava, Denis Kocetkov, Harm de Vries, Dzmitry Bahdanau, Torsten Scholak +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2306.10998"} +tags: ["completion"] +--- +Despite the huge success of Large Language Models (LLMs) in coding assistants like GitHub Copilot, these models struggle to understand the context present in the repository (e.g., imports, parent classes, files with similar names, etc.), thereby producing inaccurate code completions. This effect is more pronounced when using these assistants for repositories that the model has not seen during training, such as proprietary software or work-in-progress code projects. Recent work has shown the promise of using context from the repository during inference. In this work, we extend this idea and propose RepoFusion, a framework to train models to incorporate relevant repository context. 
Experiments on single-line code completion show that our models trained with repository context significantly outperform much larger code models such as CodeGen-16B-multi (~73× larger) and closely match the performance of the ~70× larger StarCoderBase model that was trained with the Fill-in-the-Middle objective. We find these results to be a novel and compelling demonstration of the gains that training with repository context can bring. We carry out extensive ablation studies to investigate the impact of design choices such as context type, number of contexts, context length, and initialization within our framework. Lastly, we release Stack-Repo, a dataset of 200 Java repositories with permissive licenses and near-deduplicated files that are augmented with three types of repository contexts. Additionally, we are making available the code and trained checkpoints for our work. Our released resources can be found at https://huggingface.co/RepoFusion. diff --git a/_publications/shuai2020improving.markdown b/_publications/shuai2020improving.markdown new file mode 100644 index 00000000..cada5da0 --- /dev/null +++ b/_publications/shuai2020improving.markdown @@ -0,0 +1,14 @@ +--- +layout: publication +title: "Improving Code Search with Co-Attentive Representation Learning" +authors: Jianhang Shuai, Ling Xu, Chao Liu, Meng Yan, Xin Xia, Yan Lei +conference: ICPC +year: 2020 +additional_links: + - { name: "ACM", url: "https://dl.acm.org/doi/abs/10.1145/3387904.3389269" } +tags: ["search"] +--- + +Searching and reusing existing code from a large-scale codebase, e.g., GitHub, can help developers complete a programming task efficiently. Recently, Gu et al. proposed a deep learning-based model (i.e., DeepCS), which significantly outperformed prior models. The DeepCS embedded codebase and natural language queries into vectors by two LSTM (long short-term memory) models separately, and returned to developers the code with higher similarity to a code search query. However, such an embedding method learned two isolated representations for code and query but ignored their internal semantic correlations. As a result, the learned isolated representations of code and query may limit the effectiveness of code search. + +To address the aforementioned issue, we propose a co-attentive representation learning model, i.e., Co-Attentive Representation Learning Code Search-CNN (CARLCS-CNN). CARLCS-CNN learns interdependent representations for the embedded code and query with a co-attention mechanism. Generally, such a mechanism learns a correlation matrix between embedded code and query, and co-attends their semantic relationship via row/column-wise max-pooling. In this way, the semantic correlation between code and query can directly affect their individual representations. We evaluate the effectiveness of CARLCS-CNN on Gu et al.'s dataset with 10k queries. Experimental results show that the proposed CARLCS-CNN model significantly outperforms DeepCS by 26.72% in terms of MRR (mean reciprocal rank). Additionally, CARLCS-CNN is five times faster than DeepCS in model training and four times in testing. diff --git a/_publications/si2018learning.markdown b/_publications/si2018learning.markdown index 288f430a..99a01b33 100644 --- a/_publications/si2018learning.markdown +++ b/_publications/si2018learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning Loop Invariants for Program Verification" -authors: X. Si, H. Dai, M. Raghothaman, M. Naik, L.
Song -conference: NIPS +authors: Xujie Si, Hanjun Dai, Mukund Raghothaman, Mayur Naik, Le Song +conference: NeurIPS year: 2018 -bibkey: si2018learning additional_links: - {name: "Preprint", url: "https://www.cis.upenn.edu/~mhnaik/papers/nips18.pdf"} tags: ["program analysis", "verification"] diff --git a/_publications/silavong2022senatus.markdown b/_publications/silavong2022senatus.markdown new file mode 100644 index 00000000..40067f6f --- /dev/null +++ b/_publications/silavong2022senatus.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Senatus - A Fast and Accurate Code-to-Code Recommendation Engine" +authors: Fran Silavong, Sean Moran, Antonios Georgiadis, Rohan Saphal, Robert Otter +conference: MSR +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2111.04473"} +tags: ["code similarity", "search"] +--- +Machine learning on source code (MLOnCode) is a popular research field that has been driven by the availability of large-scale code repositories and the development of powerful probabilistic and deep learning models for mining source code. Code-to-code recommendation is a task in MLOnCode that aims to recommend relevant, diverse and concise code snippets that usefully extend the code currently being written by a developer in their development environment (IDE). Code-to-code recommendation engines hold the promise of increasing developer productivity by reducing context switching from the IDE and increasing code-reuse. Existing code-to-code recommendation engines do not scale gracefully to large codebases, exhibiting a linear growth in query time as the code repository increases in size. In addition, existing code-to-code recommendation engines fail to account for the global statistics of code repositories in the ranking function, such as the distribution of code snippet lengths, leading to sub-optimal retrieval results. We address both of these weaknesses with Senatus, a new code-to-code recommendation engine. At the core of Senatus is De-Skew LSH, a new locality-sensitive hashing (LSH) algorithm that indexes the data for fast (sub-linear time) retrieval while also counteracting the skewness in the snippet length distribution using novel abstract syntax tree-based feature scoring and selection algorithms. We evaluate Senatus and find the recommendations to be of higher quality than competing baselines, while achieving faster search. For example, on the CodeSearchNet dataset Senatus improves performance by 31.21% F1 and achieves 147.9x faster query time compared to Facebook Aroma. Senatus also outperforms standard MinHash LSH by 29.2% F1 with 51.02x faster query time. diff --git a/_publications/silva2023repairllama.markdown b/_publications/silva2023repairllama.markdown new file mode 100644 index 00000000..42df7795 --- /dev/null +++ b/_publications/silva2023repairllama.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "RepairLLaMA: Efficient Representations and Fine-Tuned Adapters for Program Repair" +authors: André Silva, Sen Fang, Martin Monperrus +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2312.15698"} +tags: ["repair"] +--- +Automated Program Repair (APR) has evolved significantly with the advent of Large Language Models (LLMs). Fine-tuning LLMs for program repair is a recent avenue of research, with many dimensions that have not yet been explored.
Existing work mostly fine-tunes LLMs with naive code representations and is fundamentally limited in its ability to fine-tune larger LLMs. To address this problem, we propose RepairLLaMA, a novel program repair approach that combines 1) code representations for APR and 2) the state-of-the-art parameter-efficient LLM fine-tuning technique called LoRA. This results in RepairLLaMA producing a highly effective 'program repair adapter' for fixing bugs with language models. Our experiments demonstrate the validity of both concepts. First, fine-tuning adapters with program-repair-specific code representations enables the model to use meaningful repair signals. Second, parameter-efficient fine-tuning helps fine-tuning to converge and contributes to the effectiveness of the repair adapter in fixing data points outside the fine-tuning data distribution. Overall, RepairLLaMA correctly fixes 125 Defects4J v2 and 82 HumanEval-Java bugs, outperforming all baselines. diff --git a/_publications/singh2016question.markdown b/_publications/singh2016question.markdown index f745c443..053a00ec 100644 --- a/_publications/singh2016question.markdown +++ b/_publications/singh2016question.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Question Independent Grading using Machine Learning: The Case of Computer Program Grading" -authors: G. Singh, S. Srikant, V. Aggarwal +authors: Gursimran Singh, Shashank Srikant, Varun Aggarwal conference: KDD year: 2016 -bibkey: singh2016question additional_links: - {name: "PDF", url: "https://dl.acm.org/citation.cfm?id=2939696"} - {name: "website", url: "http://research.aspiringminds.com/"} diff --git a/_publications/siow2019core.markdown b/_publications/siow2019core.markdown index 57b9e734..c3efdf28 100644 --- a/_publications/siow2019core.markdown +++ b/_publications/siow2019core.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "CORE: Automating Review Recommendation for Code Changes" -authors: J. Siow, C. Gao, L. Fan, S. Chen, Y. Liu +authors: JingKai Siow, Cuiyun Gao, Lingling Fan, Sen Chen, Yang Liu conference: SANER year: 2019 -bibkey: siow2019core additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1912.09652"} tags: ["review"] diff --git a/_publications/siow2022learning.markdown b/_publications/siow2022learning.markdown new file mode 100644 index 00000000..301f42dd --- /dev/null +++ b/_publications/siow2022learning.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "Learning Program Semantics with Code Representations: An Empirical Study" +authors: Jing Kai Siow, Shangqing Liu, Xiaofei Xie, Guozhu Meng, Yang Liu +conference: SANER +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2203.11790"} +tags: ["representation"] +--- +Program semantics learning is core and fundamental to various code intelligence tasks, e.g., vulnerability detection and clone detection. A considerable number of existing works propose diverse approaches to learn program semantics for different tasks, and these works have achieved state-of-the-art performance. However, currently, a comprehensive and systematic study on evaluating different program representation techniques across diverse tasks is still missing. + +From this starting point, in this paper, we conduct an empirical study to evaluate different program representation techniques.
Specifically, we categorize current mainstream code representation techniques into four categories, i.e., Feature-based, Sequence-based, Tree-based, and Graph-based program representation techniques, and evaluate their performance on three diverse and popular code intelligence tasks, i.e., Code Classification, Vulnerability Detection, and Clone Detection, on publicly released benchmarks. We further design three research questions (RQs) and conduct a comprehensive analysis to investigate the performance. Based on the extensive experimental results, we conclude that (1) The graph-based representation is superior to the other selected techniques across these tasks. (2) Compared with the node type information used in tree-based and graph-based representations, the node textual information is more critical to learning the program semantics. (3) Different tasks require task-specific semantics to achieve their highest performance; however, combining various program semantics from different dimensions, such as control dependency and data dependency, can still produce promising results. diff --git a/_publications/sivaraman2021mining.markdown b/_publications/sivaraman2021mining.markdown new file mode 100644 index 00000000..c2aacc0e --- /dev/null +++ b/_publications/sivaraman2021mining.markdown @@ -0,0 +1,15 @@ +--- +layout: publication +title: "Mining Idioms in the Wild" +authors: Aishwarya Sivaraman, Rui Abreu, Andrew Scott, Tobi Akomolede, Satish Chandra +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2107.06402"} +tags: ["pattern mining", "refactoring"] +--- +Existing code repositories contain numerous instances of code patterns that are idiomatic ways of accomplishing a particular programming task. Sometimes, the programming language in use supports specific operators or APIs that can express the same idiomatic imperative code much more succinctly. However, those code patterns linger in repositories because the developers may be unaware of the new APIs or have not gotten around to them. Detection of idiomatic code can also point to the need for new APIs. + +We share our experiences in mining idiomatic patterns from the Hack repo at Facebook. We found that existing techniques either cannot identify meaningful patterns from syntax trees or require test-suite-based dynamic analysis to incorporate semantic properties to mine useful patterns. The key insight of the approach proposed in this paper, Jezero, is that semantic idioms from a large codebase can be learned from canonicalized dataflow trees. We propose a scalable, lightweight static analysis-based approach to construct such a tree that is well suited to mining semantic idioms using nonparametric Bayesian methods. + +Our experiments with Jezero on Hack code show a clear advantage of adding canonicalized dataflow information to ASTs: Jezero was significantly more effective than a baseline that did not have the dataflow augmentation in being able to effectively find refactoring opportunities from unannotated legacy code.
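Jezero itself operates on canonicalized dataflow trees for Hack; as a rough, language-shifted illustration of the canonicalization step only (our sketch using Python's ast module, not anything from the paper), alpha-renaming variables makes structurally identical idioms compare equal:

```python
# Illustrative sketch (not Jezero): canonicalize variable names in a
# Python AST so that structurally identical idioms compare equal.
import ast

class Canonicalizer(ast.NodeTransformer):
    def __init__(self):
        self.names: dict[str, str] = {}

    def visit_Name(self, node: ast.Name) -> ast.Name:
        # Map each distinct variable to VAR_0, VAR_1, ... by first use.
        if node.id not in self.names:
            self.names[node.id] = f"VAR_{len(self.names)}"
        node.id = self.names[node.id]
        return node

def canonical_form(code: str) -> str:
    return ast.unparse(Canonicalizer().visit(ast.parse(code)))

# Both snippets reduce to the same canonical idiom:
print(canonical_form("total = total + item"))  # VAR_0 = VAR_0 + VAR_1
print(canonical_form("acc = acc + x"))         # VAR_0 = VAR_0 + VAR_1
```

Extending such canonicalization from names to dataflow (so that unrelated statements between uses do not break a pattern) is the direction the paper's dataflow trees take.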
diff --git a/_publications/souza2023lexecutor.markdown b/_publications/souza2023lexecutor.markdown new file mode 100644 index 00000000..1ad8eb1b --- /dev/null +++ b/_publications/souza2023lexecutor.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "LExecutor: Learning-Guided Execution" +authors: Beatriz Souza, Michael Pradel +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2302.02343"} + - {name: "Code", url: "https://github.com/michaelpradel/LExecutor"} +tags: ["execution"] +--- +Executing code is essential for various program analysis tasks, e.g., to detect bugs that manifest through exceptions or to obtain execution traces for further dynamic analysis. However, executing an arbitrary piece of code is often difficult in practice, e.g., because of missing variable definitions, missing user inputs, and missing third-party dependencies. This paper presents LExecutor, a learning-guided approach for executing arbitrary code snippets in an underconstrained way. The key idea is to let a neural model predict missing values that otherwise would cause the program to get stuck, and to inject these values into the execution. For example, LExecutor injects likely values for otherwise undefined variables and likely return values of calls to otherwise missing functions. We evaluate the approach on Python code from popular open-source projects and on code snippets extracted from Stack Overflow. The neural model predicts realistic values with an accuracy between 80.1% and 94.2%, allowing LExecutor to closely mimic real executions. As a result, the approach successfully executes significantly more code than any available technique, such as simply executing the code as-is. For example, executing the open-source code snippets as-is covers only 4.1% of all lines, because the code crashes early on, whereas LExecutor achieves a coverage of 50.1%. + diff --git a/_publications/spirin2021psiminer.markdown b/_publications/spirin2021psiminer.markdown new file mode 100644 index 00000000..b64df54f --- /dev/null +++ b/_publications/spirin2021psiminer.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "PSIMiner: A Tool for Mining Rich Abstract Syntax Trees from Code" +authors: Egor Spirin, Egor Bogomolov, Vladimir Kovalenko, Timofey Bryksin +conference: MSR +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2103.12778"} + - {name: "website", url: "https://research.jetbrains.org/groups/ml_methods/publications/"} + - {name: "code", url: "https://github.com/JetBrains-Research/psiminer"} +tags: ["tool"] +--- +The application of machine learning algorithms to source code has grown in recent years. Since these algorithms are quite sensitive to input data, it is not surprising that researchers experiment with input representations. Nowadays, a popular starting point to represent code is abstract syntax trees (ASTs). Abstract syntax trees have been used for a long time in various software engineering domains, and in particular in IDEs. The API of modern IDEs allows one to manipulate and traverse ASTs, resolve references between code elements, etc. Such algorithms can enrich ASTs with new data and therefore may be useful in ML-based code analysis. In this work, we present PSIMiner, a tool for processing PSI trees from the IntelliJ Platform.
PSI trees contain code syntax trees as well as functions to work with them, and therefore can be used to enrich code representation using static analysis algorithms of modern IDEs. To showcase this idea, we use our tool to infer types of identifiers in Java ASTs and extend the code2seq model for the method name prediction problem. diff --git a/_publications/srikant2014system.markdown b/_publications/srikant2014system.markdown index c51f06e4..15dd7d08 100644 --- a/_publications/srikant2014system.markdown +++ b/_publications/srikant2014system.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "A system to grade computer programming skills using machine learning" -authors: S. Srikant, V. Aggarwal +authors: Shashank Srikant, Varun Aggarwal conference: KDD year: 2014 -bibkey: srikant2014system additional_links: - {name: "PDF", url: "https://dl.acm.org/citation.cfm?id=2623377"} - {name: "website", url: "http://research.aspiringminds.com/"} diff --git a/_publications/sun2019grammar.markdown b/_publications/sun2019grammar.markdown index bd4a669a..48f61ddc 100644 --- a/_publications/sun2019grammar.markdown +++ b/_publications/sun2019grammar.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "A Grammar-Based Structural CNN Decoder for Code Generation" -authors: Z. Sun, Q. Zhu, L. Mou, Y. Xiong, G. Li, L. Zhang +authors: Zeyu Sun, Qihao Zhu, Lili Mou, Yingfei Xiong, Ge Li, Lu Zhang conference: AAAI year: 2019 -bibkey: sun2019grammar -tags: ["generation", "grammar"] +tags: ["code generation", "grammar"] --- Code generation maps a program description to executable source code in a programming language. Existing approaches diff --git a/_publications/sun2020pscs.markdown b/_publications/sun2020pscs.markdown new file mode 100644 index 00000000..87048135 --- /dev/null +++ b/_publications/sun2020pscs.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "PSCS: A Path-based Neural Model for Semantic Code Search" +authors: Zhensu Sun, Yan Liu, Chen Yang, Yu Qian +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2008.03042"} +tags: ["grammar", "search"] +--- +To obtain code snippets for reuse, programmers prefer to search for related documents, e.g., blogs or Q&A, instead of code itself. The major reason is the semantic diversity and mismatch between queries and code snippets. Deep learning models have been proposed to address this challenge. Compared with approaches using information retrieval techniques, deep learning models do not suffer from the information loss caused by refining user intention into keywords. However, the performance of previous works is not satisfactory because they ignore the importance of code structure. When the semantics of code (e.g., identifier names, APIs) are ambiguous, code structure may be the only feature for the model to utilize. In that case, previous works relearn the structural information from lexical tokens of code, which is extremely difficult for a model without any domain knowledge. In this work, we propose PSCS, a path-based neural model for semantic code search. Our model encodes both the semantics and structures of code represented by AST paths. We train and evaluate our model over 330k and 19k query-function pairs, respectively. The evaluation results demonstrate that PSCS achieves a SuccessRate of 47.6% and a Mean Reciprocal Rank (MRR) of 30.4% when considering the top-10 results with a match.
The proposed approach significantly outperforms both DeepCS, the first approach that applies deep learning to the code search task, and CARLCS, a state-of-the-art approach that introduces a co-attentive representation learning model on the basis of DeepCS. The importance of code structure is demonstrated with an ablation study on code features, which enlightens model design for further studies. diff --git a/_publications/svyatkovskiy2019pythia.markdown b/_publications/svyatkovskiy2019pythia.markdown index 839261fc..0a891339 100644 --- a/_publications/svyatkovskiy2019pythia.markdown +++ b/_publications/svyatkovskiy2019pythia.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Pythia: AI-assisted Code Completion System" -authors: A. Svyatkovskiy, Y. Zhao, S. Fu, Neel Sundaresan +authors: Alexey Svyatkovskiy, Ying Zhao, Shengyu Fu, Neel Sundaresan conference: KDD year: 2019 -bibkey: svyatkovskiy2019pythia tags: ["autocomplete", "language model"] --- diff --git a/_publications/svyatkovskiy2020fast.markdown b/_publications/svyatkovskiy2020fast.markdown index eabfd40b..43b2ba38 100644 --- a/_publications/svyatkovskiy2020fast.markdown +++ b/_publications/svyatkovskiy2020fast.markdown @@ -4,10 +4,9 @@ title: "Fast and Memory-Efficient Neural Code Completion" authors: Alexey Svyatkovskiy, Sebastian Lee, Anna Hadjitofi, Maik Riechert, Juliana Franco, Miltiadis Allamanis conference: year: 2020 -bibkey: svyatkovskiy2020fast additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2004.13651"} -tags: ["autocompletion"] +tags: ["autocomplete"] --- Code completion is one of the most widely used features of modern integrated development environments (IDEs). Deep learning has recently made significant progress in the statistical prediction of source code. However, state-of-the-art neural network models consume prohibitively large amounts of memory, causing computational burden to the development environment, especially when deployed in lightweight client devices. diff --git a/_publications/svyatkovskiy2020intellicode.markdown b/_publications/svyatkovskiy2020intellicode.markdown index 21130ed9..5428fdcd 100644 --- a/_publications/svyatkovskiy2020intellicode.markdown +++ b/_publications/svyatkovskiy2020intellicode.markdown @@ -2,14 +2,11 @@ layout: publication title: "IntelliCode Compose: Code Generation Using Transformer" authors: Alexey Svyatkovskiy, Shao Kun Deng, Shengyu Fu, Neel Sundaresan -conference: year: 2020 -bibkey: svyatkovskiy2020intellicode additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2005.08025"} -tags: ["autocompletion"] +tags: ["autocomplete", "code generation", "synthesis", "language model", "pretraining"] --- In software development through integrated development environments (IDEs), code completion is one of the most widely used features. Nevertheless, the majority of integrated development environments only support completion of methods and APIs, or arguments. - In this paper, we introduce IntelliCode Compose, a general-purpose multilingual code completion tool which is capable of predicting sequences of code tokens of arbitrary types, generating up to entire lines of syntactically correct code. It leverages a state-of-the-art generative transformer model trained on 1.2 billion lines of source code in Python, C#, JavaScript and TypeScript programming languages. IntelliCode Compose is deployed as a cloud-based web service.
It makes use of client-side tree-based caching, efficient parallel implementation of the beam search decoder, and compute graph optimizations to meet edit-time completion suggestion requirements in the Visual Studio Code IDE and Azure Notebook. -Our best model yields an average edit similarity of 86.7% and a perplexity of 1.82 for Python programming language. +Our best model yields an average edit similarity of 86.7% and a perplexity of 1.82 for Python programming language. diff --git a/_publications/szafraniec2022code.markdown b/_publications/szafraniec2022code.markdown new file mode 100644 index 00000000..2f5c4072 --- /dev/null +++ b/_publications/szafraniec2022code.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Code Translation with Compiler Representations" +authors: Marc Szafraniec, Baptiste Roziere, Hugh Leather, Francois Charton, Patrick Labatut, Gabriel Synnaeve +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2207.03578"} +tags: ["Transformer", "migration", "decompilation"] +--- +In this paper, we leverage low-level compiler intermediate representations (IR) to improve code translation. Traditional transpilers rely on syntactic information and handcrafted rules, which limits their applicability and produces unnatural-looking code. Applying neural machine translation (NMT) approaches to code has successfully broadened the set of programs on which one can get a natural-looking translation. However, they treat the code as sequences of text tokens, and still do not differentiate well enough between similar pieces of code which have different semantics in different languages. The consequence is low quality translation, reducing the practicality of NMT, and stressing the need for an approach significantly increasing its accuracy. Here we propose to augment code translation with IRs, specifically LLVM IR, with results on the C++, Java, Rust, and Go languages. Our method improves upon the state of the art for unsupervised code translation, increasing the number of correct translations by 11% on average, and up to 79% for the Java - Rust pair. We extend previous test sets for code translation, by adding hundreds of Go and Rust functions. Additionally, we train models with high performance on the problem of IR decompilation, generating programming source code from IR, and study using IRs as intermediary pivot for translation. diff --git a/_publications/tabassum2020code.markdown b/_publications/tabassum2020code.markdown index 4326e230..2e88a6d4 100644 --- a/_publications/tabassum2020code.markdown +++ b/_publications/tabassum2020code.markdown @@ -4,9 +4,9 @@ title: "Code and Named Entity Recognition in StackOverflow" authors: Jeniya Tabassum, Mounica Maddela, Wei Xu, Alan Ritter conference: ACL year: 2020 -bibkey: tabassum2020code additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2005.01634"} + - {name: "Code", url: "https://github.com/jeniyat/StackOverflowNER/"} tags: ["dataset", "information extraction"] --- -There is an increasing interest in studying natural language and computer code together, as large corpora of programming texts become readily available on the Internet. For example, StackOverflow currently has over 15 million programming related questions written by 8.5 million users. Meanwhile, there is still a lack of fundamental NLP techniques for identifying code tokens or software-related named entities that appear within natural language sentences. 
In this paper, we introduce a new named entity recognition (NER) corpus for the computer programming domain, consisting of 15,372 sentences annotated with 20 fine-grained entity types. We also present the SoftNER model that combines contextual information with domain specific knowledge using an attention network. The code token recognizer combined with an entity segmentation model we proposed, consistently improves the performance of the named entity tagger. Our proposed SoftNER tagger outperforms the BiLSTM-CRF model with an absolute increase of +9.73 F-1 score on StackOverflow data. +There is an increasing interest in studying natural language and computer code together, as large corpora of programming texts become readily available on the Internet. For example, StackOverflow currently has over 15 million programming related questions written by 8.5 million users. Meanwhile, there is still a lack of fundamental NLP techniques for identifying code tokens or software-related named entities that appear within natural language sentences. In this paper, we introduce a new named entity recognition (NER) corpus for the computer programming domain, consisting of 15,372 sentences annotated with 20 fine-grained entity types. We trained in-domain BERT representations (BERTOverflow) on 152 million sentences from StackOverflow, which lead to an absolute increase of +10 F-1 score over off-the-shelf BERT. We also present the SoftNER model which achieves an overall 79.10 F1 score for code and named entity recognition on StackOverflow data. Our SoftNER model incorporates a context-independent code token classifier with corpus-level features to improve the BERT-based tagging model. diff --git a/_publications/tan2024llm4decompile.markdown b/_publications/tan2024llm4decompile.markdown new file mode 100644 index 00000000..8ea0b686 --- /dev/null +++ b/_publications/tan2024llm4decompile.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "LLM4Decompile: Decompiling Binary Code with Large Language Models" +authors: Hanzhuo Tan, Qi Luo, Jing Li, Yuqun Zhang +conference: +year: 2024 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2403.05286"} + - {name: "code", url: "https://github.com/albertan017/LLM4Decompile"} +tags: ["decompilation", "translation", "evaluation", "large language models", "LLM"] +--- +Decompilation aims to restore compiled code to human-readable source code, but struggles with details like names and structure. Large language models (LLMs) show promise for programming tasks, motivating their application to decompilation. However, there does not exist any open-source LLM for decompilation. Moreover, existing decompilation evaluation systems mainly consider token-level accuracy and largely ignore code executability, which is the most important feature of any program. Therefore, we release the first open-access decompilation LLMs ranging from 1B to 33B pre-trained on 4 billion tokens of C source code and the corresponding assembly code. The open-source LLMs can serve as baselines for further development in the field. To ensure practical program evaluation, we introduce Decompile-Eval, the first dataset that considers re-compilability and re-executability for decompilation. The benchmark emphasizes the importance of evaluating the decompilation model from the perspective of program semantics. 
Experiments indicate that LLM4Decompile can accurately decompile 21% of the assembly code, a 50% improvement over GPT-4. Our code, dataset, and models are released at [github.com/albertan017/LLM4Decompile](https://github.com/albertan017/LLM4Decompile) diff --git a/_publications/tarlow2019learning.markdown b/_publications/tarlow2019learning.markdown index 802083fe..bb4b956b 100644 --- a/_publications/tarlow2019learning.markdown +++ b/_publications/tarlow2019learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning to Fix Build Errors with Graph2Diff Neural Networks" -authors: D. Tarlow, S. Moitra, A. Rcie, Z. Chen, P.A. Manzagol, C. Sutton, E. Aftandilian +authors: Daniel Tarlow, Subhodeep Moitra, Andrew Rice, Zimin Chen, Pierre-Antoine Manzagol, Charles Sutton, Edward Aftandilian conference: year: 2019 -bibkey: tarlow2019learning additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1911.01205"} - {name: "preprint", url: "http://www.cs.toronto.edu/~dtarlow/papers/graph2diff_preprint.pdf"} diff --git a/_publications/template b/_publications/template index c2e362fd..8e8f760a 100644 --- a/_publications/template +++ b/_publications/template @@ -1,13 +1,12 @@ --- layout: publication title: "Add title here" -authors: F. LastName, F. LastName -conference: Optional +authors: FirstName LastName, FirstName LastName +conference: Optional # OR journal year: 2000 -bibkey: sameAsFilename additional_links: - - {name: "ArXiV", url: "https://arxiv.org/abs/xxxx.xxxxxx"} - - {name: "Dataset", url: "https://blah/blah"} + - {name: "ArXiV", url: "https://arxiv.org/abs/xxxx.xxxxxx"} + - {name: "Dataset", url: "https://blah/blah"} tags: ["dataset"] --- Abstract here diff --git a/_publications/theeten2019import2vec.markdown b/_publications/theeten2019import2vec.markdown index 6dc85d41..11b38e26 100644 --- a/_publications/theeten2019import2vec.markdown +++ b/_publications/theeten2019import2vec.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Import2vec - Learning Embeddings for Software Libraries" -authors: B. Theeten, F. Vandeputte, T.Van Cutsem +authors: Bart Theeten, Frederik Vandeputte, Tom Van Cutsem conference: MSR year: 2019 -bibkey: theeten2019import2vec tags: ["representation"] --- We consider the problem of developing suitable learning representations (embeddings) for library packages that capture semantic similarity among libraries. Such representations are known to improve the performance of downstream learning tasks (e.g. classification) or applications such as contextual search and analogical reasoning. diff --git a/_publications/tian2020evaluating.markdown b/_publications/tian2020evaluating.markdown new file mode 100644 index 00000000..a9d418f8 --- /dev/null +++ b/_publications/tian2020evaluating.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Evaluating Representation Learning of Code Changes for Predicting Patch Correctness in Program Repair" +authors: Haoye Tian, Kui Liu, Abdoul Kader Kaboreé, Anil Koyuncu, Li Li, Jacques Klein, Tegawendé F. Bissyandé +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2008.02944"} +tags: ["repair", "Transformer"] +--- +A large body of the automated program repair literature develops approaches where patches are generated to be validated against an oracle (e.g., a test suite).
Because such an oracle can be imperfect, the generated patches, although validated by the oracle, may actually be incorrect. While the state of the art explores research directions that require dynamic information or rely on manually-crafted heuristics, we study the benefit of learning code representations to derive deep features that may encode the properties of patch correctness. Our work mainly investigates different representation learning approaches for code changes to derive embeddings that are amenable to similarity computations. We report on findings based on embeddings produced by pre-trained and re-trained neural networks. Experimental results demonstrate the potential of embeddings to empower learning algorithms in reasoning about patch correctness: a machine learning predictor with BERT transformer-based embeddings associated with logistic regression yielded an AUC value of about 0.8 in predicting patch correctness on a deduplicated dataset of 1000 labeled patches. Our study shows that learned representations can lead to reasonable performance when comparing against the state-of-the-art, PATCH-SIM, which relies on dynamic information. These representations may further be complementary to features that were carefully (manually) engineered in the literature. diff --git a/_publications/tian2024debugbench.markdown b/_publications/tian2024debugbench.markdown new file mode 100644 index 00000000..10dd79a9 --- /dev/null +++ b/_publications/tian2024debugbench.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "DebugBench: Evaluating Debugging Capability of Large Language Models" +authors: Runchu Tian, Yining Ye, Yujia Qin, Xin Cong, Yankai Lin, Yinxu Pan, Yesai Wu, Zhiyuan Liu, Maosong Sun +conference: +year: 2024 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2401.04621"} +tags: ["repair"] +--- +Large Language Models (LLMs) have demonstrated exceptional coding capability. However, as another critical component of programming proficiency, the debugging capability of LLMs remains relatively unexplored. Previous evaluations of LLMs' debugging ability are significantly limited by the risk of data leakage, the scale of the dataset, and the variety of tested bugs. To overcome these deficiencies, we introduce `DebugBench', an LLM debugging benchmark consisting of 4,253 instances. It covers four major bug categories and 18 minor types in C++, Java, and Python. To construct DebugBench, we collect code snippets from the LeetCode community, implant bugs into source data with GPT-4, and ensure rigorous quality checks. We evaluate two commercial and three open-source models in a zero-shot scenario. We find that (1) while closed-source models like GPT-4 exhibit inferior debugging performance compared to humans, open-source models such as Code Llama fail to attain any pass rate scores; (2) the complexity of debugging notably fluctuates depending on the bug category; (3) incorporating runtime feedback has a clear impact on debugging performance, which is not always helpful. As an extension, we also compare LLM debugging and code generation, revealing a strong correlation between them for closed-source models. These findings will benefit the development of LLMs in debugging.
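To see what a zero-shot debugging evaluation of this shape involves, here is a minimal, hypothetical harness (our sketch; field names such as buggy_code and the ask_model callable are assumptions, not DebugBench's actual interface): a model proposes a fix for a buggy snippet, and the fix is scored by hidden tests.

```python
# Hypothetical sketch of a pass-rate harness for a DebugBench-style
# instance: {"buggy_code": ..., "tests": [(input_args, expected), ...]}.
def run_candidate(src: str, tests: list[tuple[tuple, object]]) -> bool:
    scope: dict = {}
    try:
        # In practice, run untrusted model output in a sandbox.
        exec(src, scope)  # candidate must define solve()
        return all(scope["solve"](*args) == want for args, want in tests)
    except Exception:
        return False

def pass_rate(instances: list[dict], ask_model) -> float:
    # `ask_model` stands in for any LLM call that returns repaired code.
    fixed = sum(
        run_candidate(ask_model(inst["buggy_code"]), inst["tests"])
        for inst in instances
    )
    return fixed / len(instances)

example = {
    "buggy_code": "def solve(x):\n    return x - 1  # bug: should add",
    "tests": [((1,), 2), ((5,), 6)],
}
print(pass_rate([example], ask_model=lambda code: code.replace("- 1", "+ 1")))
```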
diff --git a/_publications/tomczak2019simulating.markdown b/_publications/tomczak2019simulating.markdown index a1ad0b29..0f5c90f8 100644 --- a/_publications/tomczak2019simulating.markdown +++ b/_publications/tomczak2019simulating.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Simulating Execution Time of Tensor Programs using Graph Neural Networks" -authors: J. M. Tomczak, R. Lepert, A. Wiggers +authors: Jakub M. Tomczak, Romain Lepert, Auke Wiggers conference: Representation Learning on Graphs and Manifolds at ICLR year: 2019 -bibkey: tomczak2019simulating additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1904.11876"} tags: ["GNN"] diff --git a/_publications/tran2019recovering.markdown b/_publications/tran2019recovering.markdown index 5511569e..0366fcc5 100644 --- a/_publications/tran2019recovering.markdown +++ b/_publications/tran2019recovering.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Recovering Variable Names for Minified Code with Usage Contexts" -authors: H. Tran, N. Tran, S. Nguyen, H. Nguyen, T. Nguyen +authors: Hieu Tran, Ngoc Tran, Son Nguyen, Hoan Nguyen, Tien N. Nguyen conference: ICSE year: 2019 -bibkey: tran2019recovering tags: ["naming", "deobfuscation"] --- In modern Web technology, JavaScript (JS) code plays an important role. To avoid the exposure of original source code, the variable names in JS code deployed in the wild are often replaced by short, meaningless names, thus making the code extremely difficult to manually understand and analyze. This paper presents JSNeat, an information retrieval (IR)-based approach to recover the variable names in minified JS code. JSNeat follows a data-driven approach to recover names by searching for them in a large corpus of open-source JS code. We use three types of contexts to match a variable in given minified code against the corpus including the context of properties and roles of the variable, the context of that variable and relations with other variables under recovery, and the context of the task of the function to which the variable contributes. We performed several empirical experiments to evaluate JSNeat on the dataset of more than 322K JS files with 1M functions, and 3.5M variables with 176K unique variable names. We found that JSNeat achieves a high accuracy of 69.1%, which represents relative improvements of 66.1% and 43% over two state-of-the-art approaches JSNice and JSNaughty, respectively. Recovering names for a file or for a variable with JSNeat is twice as fast as with JSNice and 4x as fast as with JSNaughty. \ No newline at end of file diff --git a/_publications/tu2014localness.markdown b/_publications/tu2014localness.markdown index 4cbfae31..af7dbda6 100644 --- a/_publications/tu2014localness.markdown +++ b/_publications/tu2014localness.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "On the Localness of Software" -authors: Z. Tu, Z. Su, P. Devanbu +authors: Zhaopeng Tu, Zhendong Su, Premkumar Devanbu conference: FSE year: 2014 -bibkey: tu2014localness tags: ["language model"] --- The n-gram language model, which has its roots in statistical natural diff --git a/_publications/tufano2018deep.markdown b/_publications/tufano2018deep.markdown index 7dbcc391..ee69587b 100644 --- a/_publications/tufano2018deep.markdown +++ b/_publications/tufano2018deep.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Deep Learning Similarities from Different Representations of Source Code" -authors: M. Tufano, C. Watson, G. Bavota, M. Di Penta, M.
White, D. Poshyvanyk +authors: Michele Tufano, Cody Watson, Gabriele Bavota, Massimiliano Di Penta, Martin White, Denys Poshyvanyk conference: MSR year: 2018 -bibkey: tufano2018deep tags: ["representation", "clone"] --- Assessing the similarity between code components plays a pivotal diff --git a/_publications/tufano2018empirical.markdown b/_publications/tufano2018empirical.markdown index 631816e5..a8fd9cdc 100644 --- a/_publications/tufano2018empirical.markdown +++ b/_publications/tufano2018empirical.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "An Empirical Study on Learning Bug-Fixing Patches in the Wild via Neural Machine Translation" -authors: M. Tufano, C. Watson, G. Bavota, M. Di Penta, M. White, D. Poshyvanyk +authors: Michele Tufano, Cody Watson, Gabriele Bavota, Massimiliano Di Penta, Martin White, Denys Poshyvanyk conference: year: 2018 -bibkey: tufano2018empirical tags: ["repair"] --- Millions of open-source projects with numerous bug fixes are available in code repositories. This proliferation of software development histories can be leveraged to learn how to fix common programming bugs. To explore such a potential, we perform an empirical study to assess the feasibility of using Neural Machine Translation techniques for learning bug-fixing patches for real defects. First, we mine millions of bug-fixes from the change histories of projects hosted on GitHub, in order to extract meaningful examples of such bug-fixes. Next, we abstract the buggy and corresponding fixed code, and use them to train an Encoder-Decoder model able to translate buggy code into its fixed version. In our empirical investigation we found that such a model is able to fix thousands of unique buggy methods in the wild. Overall, this model is capable of predicting fixed patches generated by developers in 9-50% of the cases, depending on the number of candidate patches we allow it to generate. Also, the model is able to emulate a variety of different Abstract Syntax Tree operations and generate candidate patches in a split second. diff --git a/_publications/tufano2018learning.markdown b/_publications/tufano2018learning.markdown index f6cceb60..5f0761df 100644 --- a/_publications/tufano2018learning.markdown +++ b/_publications/tufano2018learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning How to Mutate Source Code from Bug-Fixes" -authors: M. Tufano, C. Watson, G. Bavota, M. Di Penta, M. White, D. Poshyvanyk +authors: Michele Tufano, Cody Watson, Gabriele Bavota, Massimiliano Di Penta, Martin White, Denys Poshyvanyk conference: year: 2018 -bibkey: tufano2018learning tags: ["repair", "edit"] --- Mutation testing has been widely accepted as an approach to guide test case generation or to assess the effectiveness of test suites. Empirical studies have shown that mutants are representative of real faults; yet they also indicated a clear need for better, possibly customized, mutation operators and strategies. While some recent papers have tried to devise domain-specific or general purpose mutator operators by manually analyzing real faults, such an activity is effort- (and error-) prone and does not deal with an important practical question as to how to really mutate a given source code element. We propose a novel approach to automatically learn mutants from faults in real programs. First, our approach processes bug fixing changes using fine-grained differencing, code abstraction, and change clustering. Then, it learns mutation models using a deep learning strategy. 
We have trained and evaluated our technique on a set of ~787k bugs mined from GitHub. Starting from code fixed by developers in the context of a bug-fix, our empirical evaluation showed that our models are able to predict mutants that resemble original fixed bugs in between 9% and 45% of the cases (depending on the model). Moreover, over 98% of the automatically generated mutants are lexically and syntactically correct. diff --git a/_publications/tufano2019learning.markdown b/_publications/tufano2019learning.markdown index c6dba557..79f7c352 100644 --- a/_publications/tufano2019learning.markdown +++ b/_publications/tufano2019learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "On Learning Meaningful Code Changes via Neural Machine Translation" -authors: M. Tufano, C. Watson, G. Bavota, M. Di Penta, M. White, D. Poshyvanyk +authors: Michele Tufano, Cody Watson, Gabriele Bavota, Massimiliano Di Penta, Martin White, Denys Poshyvanyk conference: ICSE year: 2019 -bibkey: tufano2019learning tags: ["repair", "edit"] --- Recent years have seen the rise of Deep Learning (DL) techniques applied to source code. Researchers have exploited DL to automate several development and maintenance tasks, such as writing commit messages, generating comments and detecting vulnerabilities among others. One of the long-lasting dreams of applying DL to code is the possibility to automate non-trivial coding activities. While some steps in this direction have been taken (e.g., learning how to fix bugs), there is still a lack of empirical evidence on the types of code changes that can be learned and automatically applied by DL. Our goal is to make this first step by quantitatively and qualitatively investigating the ability of a Neural Machine Translation (NMT) model to learn how to automatically apply code changes implemented by developers during pull requests. We train and experiment with the NMT model on a set of 236k pairs of code components before and after the implementation of the changes provided in the pull requests. We show that, when applied in a narrow enough context (i.e., small/medium-sized pairs of methods before/after the pull request changes), NMT can automatically replicate the changes implemented by developers during pull requests in up to 36% of the cases. Moreover, our qualitative analysis shows that the model is capable of learning and replicating a wide variety of meaningful code changes, especially refactorings and bug-fixing activities. Our results pave the way for novel research in the area of DL on code, such as the automatic learning and application of refactorings. diff --git a/_publications/tufano2020generating.markdown b/_publications/tufano2020generating.markdown new file mode 100644 index 00000000..2d73625a --- /dev/null +++ b/_publications/tufano2020generating.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Generating Accurate Assert Statements for Unit Test Cases using Pretrained Transformers" +authors: Michele Tufano, Dawn Drain, Alexey Svyatkovskiy, Shao Kun Deng, Neel Sundaresan +conference: ICSE +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2009.05634"} +tags: ["code generation", "synthesis", "test generation"] +--- +Unit testing represents the foundational basis of the software testing pyramid, beneath integration and end-to-end testing. Automated software testing researchers have proposed a variety of techniques to assist developers in this time-consuming task.
In this paper we present an approach to support developers in writing unit test cases by generating accurate and useful assert statements. Our approach is based on a state-of-the-art transformer model initially pretrained on an English textual corpus. This semantically rich model is then trained in a semi-supervised fashion on a large corpus of source code. Finally, we finetune this model on the task of generating assert statements for unit tests. The resulting model is able to generate accurate assert statements for a given method under test. In our empirical evaluation, the model was able to predict the exact assert statements written by developers in 62% of the cases on the first attempt. The results show 80% relative improvement for top-1 accuracy over the previous RNN-based approach in the literature. We also show the substantial impact of the pretraining process on the performance of our model, and compare it with the assert auto-completion task. Finally, we demonstrate how our approach can be used to augment EvoSuite test cases, with additional asserts leading to improved test coverage. diff --git a/_publications/tufano2020unit.markdown b/_publications/tufano2020unit.markdown new file mode 100644 index 00000000..fc3fd110 --- /dev/null +++ b/_publications/tufano2020unit.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Unit Test Case Generation with Transformers" +authors: Michele Tufano, Dawn Drain, Alexey Svyatkovskiy, Shao Kun Deng, Neel Sundaresan +conference: ICSE +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2009.05617"} +tags: ["code generation", "synthesis", "test generation"] +--- +Automated Unit Test Case generation has been the focus of extensive literature within the research community. Existing approaches are usually guided by the test coverage criteria, generating synthetic test cases that are often difficult to read or understand for developers. In this paper we propose AthenaTest, an approach that aims at generating unit test cases by learning from real-world, developer-written test cases. Our approach relies on a state-of-the-art sequence-to-sequence transformer model which is able to write useful test cases for a given method under test (i.e., focal method). We also introduce methods2test - the largest publicly available supervised parallel corpus of unit test case methods and corresponding focal methods in Java, which comprises 630k test cases mined from 70k open-source repositories hosted on GitHub. We use this dataset to train a transformer model to translate focal methods into the corresponding test cases. We evaluate the ability of our model in generating test cases using natural language processing as well as code-specific criteria. First, we assess the quality of the translation compared to the target test case, then we analyze properties of the test case such as syntactic correctness and number and variety of testing APIs (e.g., asserts). We execute the test cases, collect test coverage information, and compare them with test cases generated by EvoSuite and GPT-3. Finally, we survey professional developers on their preference in terms of readability, understandability, and testing effectiveness of the generated test cases.
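The translate-focal-method-to-test-case framing can be sketched with any off-the-shelf encoder-decoder checkpoint. The snippet below uses a public CodeT5 checkpoint purely as a placeholder: it is not the authors' released model, and without fine-tuning on focal-method/test pairs (e.g., methods2test) it will not produce meaningful tests.

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Placeholder checkpoint: a generic pretrained seq2seq code model.
checkpoint = "Salesforce/codet5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Toy focal method (the method under test).
focal_method = "public static int add(int a, int b) { return a + b; }"

# Encode the focal method and decode a candidate unit test.
inputs = tokenizer(focal_method, return_tensors="pt", truncation=True)
outputs = model.generate(**inputs, max_new_tokens=128, num_beams=5)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```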
diff --git a/_publications/vaithilingam2022expectation.markdown b/_publications/vaithilingam2022expectation.markdown new file mode 100644 index 00000000..4852cb5e --- /dev/null +++ b/_publications/vaithilingam2022expectation.markdown @@ -0,0 +1,24 @@ +--- +layout: publication +title: "Expectation vs. Experience: Evaluating the Usability of Code Generation Tools Powered by Large Language Models" +authors: Priyan Vaithilingam, Tianyi Zhang, Elena Glassman +conference: CHI +year: 2022 +additional_links: + - {name: "Preprint", url: "https://tianyi-zhang.github.io/files/chi2022-lbw-copilot.pdf"} +tags: ["human evaluation", "code generation", "language model"] +--- +Recent advances in Large Language Models (LLM) have made automatic code generation possible for real-world programming tasks in +general-purpose programming languages such as Python. However, +there are few human studies on the usability of these tools and how +they fit the programming workflow. In this work, we conducted +a within-subjects user study with 24 participants to understand +how programmers use and perceive Copilot, a LLM-based code +generation tool. We found that, while Copilot did not necessarily +improve the task completion time or success rate, most participants preferred to use Copilot in daily programming tasks, since +Copilot often provided a useful starting point and saved the effort +of searching online. However, participants did face difficulties in +understanding, editing, and debugging code snippets generated +by Copilot, which significantly hindered their task-solving effectiveness. Finally, we highlighted several promising directions for +improving the design of Copilot based on our observations and +participants’ feedback. diff --git a/_publications/vasic2019neural.markdown b/_publications/vasic2019neural.markdown index ca52e1a1..69424536 100644 --- a/_publications/vasic2019neural.markdown +++ b/_publications/vasic2019neural.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Neural Program Repair by Jointly Learning to Localize and Repair" -authors: M. Vasic, A. Kanade, P. Maniatis, D. Bieber, R. Singh +authors: Marko Vasic, Aditya Kanade, Petros Maniatis, David Bieber, Rishabh Singh conference: ICLR year: 2019 -bibkey: vasic2019neural tags: ["repair", "program analysis", "variable misuse"] --- Due to its potential to improve programmer productivity and software quality, automated program repair has been an active topic of research. Newer techniques harness neural networks to learn directly from examples of buggy programs and their fixes. In this work, we consider a recently identified class of bugs called variable-misuse bugs. The state-of-the-art solution for variable misuse enumerates potential fixes for all possible bug locations in a program, before selecting the best prediction. We show that it is beneficial to train a model that jointly and directly localizes and repairs variable-misuse bugs. We present multi-headed pointer networks for this purpose, with one head each for localization and repair. The experimental results show that the joint model significantly outperforms an enumerative solution that uses a pointer based model for repair alone. 
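The joint localize-and-repair idea can be made concrete with a small sketch: a shared encoder over the program tokens feeds two pointer heads, one producing a distribution over token positions for the bug location and one for the repair source. The dimensions and the BiLSTM encoder below are illustrative choices, not the paper's exact architecture.

```python
import torch
import torch.nn as nn

class MultiHeadedPointer(nn.Module):
    """Shared encoder with one pointer head per subtask (localize, repair)."""
    def __init__(self, vocab_size: int, embed_dim: int = 128, hidden: int = 256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.encoder = nn.LSTM(embed_dim, hidden, batch_first=True,
                               bidirectional=True)
        self.loc_head = nn.Linear(2 * hidden, 1)     # points at the buggy token
        self.repair_head = nn.Linear(2 * hidden, 1)  # points at the repair token

    def forward(self, tokens: torch.Tensor):
        states, _ = self.encoder(self.embed(tokens))  # (batch, seq, 2*hidden)
        loc = self.loc_head(states).squeeze(-1).softmax(-1)
        repair = self.repair_head(states).squeeze(-1).softmax(-1)
        return loc, repair  # two distributions over token positions

model = MultiHeadedPointer(vocab_size=10_000)
loc, repair = model(torch.randint(0, 10_000, (1, 50)))  # toy 50-token program
```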
diff --git a/_publications/vasilescu2017recovering.markdown b/_publications/vasilescu2017recovering.markdown index e103e690..cbec38ba 100644 --- a/_publications/vasilescu2017recovering.markdown +++ b/_publications/vasilescu2017recovering.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Recovering Clear, Natural Identifiers from Obfuscated JS Names" -authors: B. Vasilescu, C. Casalnuovo, P. Devanbu +authors: Bogdan Vasilescu, Casey Casalnuovo, Premkumar Devanbu conference: FSE year: 2017 -bibkey: vasilescu2017recovering tags: ["deobfuscation", "naming"] --- Well-chosen variable names are critical to source code readability, reusability, and maintainability. Unfortunately, in deployed JavaScript code (which is ubiquitous on the web) the identifier names are frequently minified and overloaded. This is done both for efficiency and also to protect potentially proprietary intellectual property. In this paper, we describe an approach based on statistical machine translation (SMT) that recovers some of the original names from the JavaScript programs minified by the very popular UglifyJS. This simple tool, Autonym, performs comparably to the best currently available deobfuscator for JavaScript, JSNice, which uses sophisticated static analysis. In fact, Autonym is quite complementary to JSNice, performing well when it does not, and vice versa. We also introduce a new tool, JSNaughty, which blends Autonym and JSNice, and significantly outperforms both at identifier name recovery, while remaining just as easy to use as JSNice. JSNaughty is available online at http://jsnaughty.org. diff --git a/_publications/villmow2021contest.markdown b/_publications/villmow2021contest.markdown new file mode 100644 index 00000000..4e5e976c --- /dev/null +++ b/_publications/villmow2021contest.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "ConTest: A Unit Test Completion Benchmark featuring Context" +authors: Johannes Villmow, Jonas Depoix, Adrian Ulges +conference: NLP4Prog +year: 2021 +additional_links: + - {name: "PDF", url: "https://aclanthology.org/2021.nlp4prog-1.2.pdf"} +tags: ["benchmark", "dataset", "verification", "Transformer"] +--- +We introduce CONTEST, a benchmark for NLP-based unit test completion, the task of predicting a test’s assert statements given its setup and focal method, i.e. the method to be tested. ConTest is large-scale (with 365k datapoints). Besides the test code and tested code, it also features context code called by either. We found context to be crucial for accurately predicting assertions. We also introduce baselines based on transformer encoder-decoders, and study the effects of including syntactic information and context. Overall, our models achieve a BLEU score of 38.2, while only generating unparsable code in 1.92% of cases. diff --git a/_publications/wan2018improving.markdown b/_publications/wan2018improving.markdown index e66a2e53..232a4f63 100644 --- a/_publications/wan2018improving.markdown +++ b/_publications/wan2018improving.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Improving Automatic Source Code Summarization via Deep Reinforcement Learning" -authors: Y. Wan, Z. Zhao, M. Yang, G. Xu, H. Ying, J. Wu, P.S. Yu +authors: Yao Wan, Zhou Zhao, Min Yang, Guandong Xu, Haochao Ying, Jian Wu, Philip S. 
Yu conference: ASE year: 2018 -bibkey: wan2018improving additional_links: - {name: "ACM", url: "https://dl.acm.org/citation.cfm?id=3238206"} tags: ["summarization", "documentation"] diff --git a/_publications/wan2019multimodal.markdown b/_publications/wan2019multimodal.markdown index a17e0be6..88d486a0 100644 --- a/_publications/wan2019multimodal.markdown +++ b/_publications/wan2019multimodal.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Multi-Modal Attention Network Learning for Semantic Source Code Retrieval" -authors: Y. Wan, J. Shu, Y. Sui, G. Xu, Z. Zhao, J. Wu, P. S. Yu +authors: Yao Wan, Jingdong Shu, Yulei Sui, Guandong Xu, Zhou Zhao, Jian Wu, Philip S. Yu conference: year: 2019 -bibkey: wan2019multimodal additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1909.13516"} tags: ["search"] diff --git a/_publications/wan2020naturalcc.markdown b/_publications/wan2020naturalcc.markdown new file mode 100644 index 00000000..ae4639ff --- /dev/null +++ b/_publications/wan2020naturalcc.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "NaturalCC: A Toolkit to Naturalize the Source Code Corpus" +authors: Yao Wan, Yang He, Jian-Guo Zhang, Yulei Sui, Hai Jin, Guandong Xu, Caiming Xiong, Philip S. Yu +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2012.03225"} + - {name: "website", url: "https://xcodemind.github.io/"} + - {name: "code", url: "https://github.com/CGCL-codes/naturalcc"} +tags: ["documentation", "search", "summarization"] +--- +We present NaturalCC, an efficient and extensible toolkit to bridge the gap between natural language and programming language, and facilitate the research on big code analysis. Using NaturalCC, researchers from both the natural language and programming language communities can quickly and easily reproduce the state-of-the-art baselines and implement their approach. NaturalCC is built upon Fairseq and PyTorch, providing (1) an efficient computation with multi-GPU and mixed-precision data processing for fast model training, (2) a modular and extensible framework that makes it easy to reproduce or implement an approach for big code analysis, and (3) a command line interface and a graphical user interface to demonstrate each model's performance. Currently, we have included several state-of-the-art baselines across different tasks (e.g., code completion, code comment generation, and code retrieval) for demonstration. The video of this demo is available at https://www.youtube.com/watch?v=q4W5VSI-u3E&t=25s. diff --git a/_publications/wan2022what.markdown b/_publications/wan2022what.markdown new file mode 100644 index 00000000..5c8be571 --- /dev/null +++ b/_publications/wan2022what.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "What Do They Capture? -- A Structural Analysis of Pre-Trained Language Models for Source Code" +authors: Yao Wan, Wei Zhao, Hongyu Zhang, Yulei Sui, Guandong Xu, Hai Jin +conference: ICSE +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2202.06840"} + - {name: "Code", url: "https://github.com/CGCL-codes/naturalcc"} +tags: ["Transformer", "pretraining", "program analysis"] +--- +Recently, many pre-trained language models for source code have been proposed to model the context of code and serve as a basis for downstream code intelligence tasks such as code completion, code search, and code summarization.
These models leverage masked pre-training and Transformer and have achieved promising results. However, currently there is still little progress regarding interpretability of existing pre-trained code models. It is not clear why these models work and what feature correlations they can capture. In this paper, we conduct a thorough structural analysis aiming to provide an interpretation of pre-trained language models for source code (e.g., CodeBERT and GraphCodeBERT) from three distinctive perspectives: (1) attention analysis, (2) probing on the word embedding, and (3) syntax tree induction. Through comprehensive analysis, this paper reveals several insightful findings that may inspire future studies: (1) Attention aligns strongly with the syntax structure of code. (2) Pre-training language models of code can preserve the syntax structure of code in the intermediate representations of each Transformer layer. (3) The pre-trained models of code have the ability to induce syntax trees of code. These findings suggest that it may be helpful to incorporate the syntax structure of code into the process of pre-training for better code representations. \ No newline at end of file diff --git a/_publications/wang2016automatically.markdown b/_publications/wang2016automatically.markdown index 95712819..20a03a7a 100644 --- a/_publications/wang2016automatically.markdown +++ b/_publications/wang2016automatically.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Automatically Learning Semantic Features for Defect Prediction" -authors: S. Wang, T. Liu, L. Tan +authors: Song Wang, Taiyue Liu, Lin Tan conference: ICSE year: 2016 -bibkey: wang2016automatically tags: ["defect", "representation"] --- Software defect prediction, which predicts defective code regions, can help developers find bugs and prioritize their testing efforts. To build accurate prediction models, previous diff --git a/_publications/wang2016bugram.markdown b/_publications/wang2016bugram.markdown index 5bcb72f4..34fd759b 100644 --- a/_publications/wang2016bugram.markdown +++ b/_publications/wang2016bugram.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Bugram: bug detection with n-gram language models" -authors: S. Wang, D. Chollak, D. Movshovitz-Attias, L. Tan +authors: Song Wang, Devin Chollak, Dana Movshovitz-Attias, Lin Tan conference: ASE year: 2016 -bibkey: wang2016bugram tags: ["defect", "representation"] --- diff --git a/_publications/wang2016neural.markdown b/_publications/wang2016neural.markdown index 9c8be0b7..3c82e8e0 100644 --- a/_publications/wang2016neural.markdown +++ b/_publications/wang2016neural.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Neural Code Completion" -authors: C. Liu, X. Wang, R. Shin, J.E. Gonzalez, D. Song +authors: Chang Liu, Xin Wang, Richard Shin, Joseph E. Gonzalez, Dawn Song conference: year: 2016 -bibkey: wang2016neural tags: ["autocomplete"] --- Code completion, an essential part of modern software development, yet can be diff --git a/_publications/wang2019learning.markdown b/_publications/wang2019learning.markdown index 14bfbce8..de57b30d 100644 --- a/_publications/wang2019learning.markdown +++ b/_publications/wang2019learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning Scalable and Precise Representation of Program Semantics" -authors: K.
Wang +authors: Ke Wang conference: year: 2019 -bibkey: wang2019learning tags: ["representation", "dynamic"] --- Neural program embedding has shown potential in aiding the analysis of large-scale, complicated software. Newly proposed deep neural architectures pride themselves on learning program semantics rather than superficial syntactic features. However, by considering the source code only, the vast majority of neural networks do not capture a deep, precise representation of program semantics. In this paper, we present DYPRO, a novel deep neural network that learns from program execution traces. Compared to the prior dynamic models, not only is DYPRO capable of generalizing across multiple executions for learning a program's dynamic semantics in its entirety, but DYPRO is also more efficient when dealing with programs yielding long execution traces. For evaluation, we task DYPRO with semantic classification (i.e. categorizing programs based on their semantics) and compare it against two prominent static models: Gated Graph Neural Network and TreeLSTM. We find that DYPRO achieves the highest prediction accuracy among all models. To further reveal the capacity of all aforementioned deep neural architectures, we examine if the models can learn to detect deeper semantic properties of a program. In particular, given a task of recognizing loop invariants, we show DYPRO beats all static models by a wide margin. diff --git a/_publications/wang2020blended.markdown b/_publications/wang2020blended.markdown new file mode 100644 index 00000000..fa57dff5 --- /dev/null +++ b/_publications/wang2020blended.markdown @@ -0,0 +1,9 @@ +--- +layout: publication +title: "Blended, precise semantic program embeddings" +authors: Ke Wang, Zhendong Su +conference: PLDI +year: 2020 +tags: ["dynamic"] +--- +Learning neural program embeddings is key to utilizing deep neural networks in programming languages research --- precise and efficient program representations enable the application of deep models to a wide range of program analysis tasks. Existing approaches predominately learn to embed programs from their source code, and, as a result, they do not capture deep, precise program semantics. On the other hand, models learned from runtime information critically depend on the quality of program executions, thus leading to trained models with highly variant quality. This paper tackles these inherent weaknesses of prior approaches by introducing a new deep neural network, Liger, which learns program representations from a mixture of symbolic and concrete execution traces. We have evaluated Liger on two tasks: method name prediction and semantics classification. Results show that Liger is significantly more accurate than the state-of-the-art static model code2seq in predicting method names, and requires on average around 10x fewer executions covering nearly 4x fewer paths than the state-of-the-art dynamic model DYPRO in both tasks. Liger offers a new, interesting design point in the space of neural program embeddings and opens up this new direction for exploration.
diff --git a/_publications/wang2020cocogum.markdown b/_publications/wang2020cocogum.markdown index d933331a..061f7c11 100644 --- a/_publications/wang2020cocogum.markdown +++ b/_publications/wang2020cocogum.markdown @@ -4,7 +4,6 @@ title: "CoCoGUM: Contextual Code Summarization with Multi-Relational GNN on UMLs authors: Yanlin Wang, Lun Du, Ensheng Shi, Yuxuan Hu, Shi Han, Dongmei Zhang conference: year: 2020 -bibkey: wang2020cocogum additional_links: - {name: "TR", url: "https://www.microsoft.com/en-us/research/publication/cocogum-contextual-code-summarization-with-multi-relational-gnn-on-umls/"} tags: ["summarization"] diff --git a/_publications/wang2020detecting.markdown b/_publications/wang2020detecting.markdown new file mode 100644 index 00000000..9b164e70 --- /dev/null +++ b/_publications/wang2020detecting.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: Detecting Code Clones with Graph Neural Network and Flow-Augmented Abstract Syntax Tree +authors: Wenhan Wang, Ge Li, Bo Ma, Xin Xia, Zhi Jin +conference: IEEE International Conference on Software Analysis, Evolution, and Reengineering +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2002.08653"} +tags: ["clone", "GNN"] +--- + +Code clones are pairs of semantically similar code fragments that can be syntactically similar or different. Detection of code clones can help to reduce the cost of software maintenance and prevent bugs. Numerous approaches for detecting code clones have been proposed previously, but most of them focus on detecting syntactic clones and do not work well on semantic clones with different syntactic features. To detect semantic clones, researchers have tried to adopt deep learning for code clone detection to automatically learn latent semantic features from data. Especially, to leverage grammar information, several approaches used abstract syntax trees (AST) as input and achieved significant progress on code clone benchmarks in various programming languages. However, these AST-based approaches still cannot fully leverage the structural information of code fragments, especially semantic information such as control flow and data flow. To leverage control and data flow information, in this paper, we build a graph representation of programs called flow-augmented abstract syntax tree (FA-AST). We construct FA-AST by augmenting original ASTs with explicit control and data flow edges. Then we apply two different types of graph neural networks (GNN) on FA-AST to measure the similarity of code pairs. To the best of our knowledge, we are the first to apply graph neural networks to code clone detection. We apply our FA-AST and graph neural networks on two Java datasets: Google Code Jam and BigCloneBench. Our approach outperforms the state-of-the-art approaches on both Google Code Jam and BigCloneBench tasks.
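A minimal sketch helps make the pair-similarity setup concrete: a shared graph network encodes each flow-augmented AST, node states are pooled into one graph vector, and the pair is scored by cosine similarity. The single GRU-style message-passing layer below is a deliberate simplification of the GGNN/GMN variants the paper uses.

```python
import torch
import torch.nn as nn

class TinyGNN(nn.Module):
    """A few rounds of message passing over FA-AST edges, then mean pooling."""
    def __init__(self, dim: int = 64, steps: int = 3):
        super().__init__()
        self.steps = steps
        self.update = nn.GRUCell(dim, dim)

    def forward(self, x: torch.Tensor, adj: torch.Tensor) -> torch.Tensor:
        # x: (nodes, dim) node features; adj: (nodes, nodes) AST + flow edges
        for _ in range(self.steps):
            x = self.update(adj @ x, x)  # aggregate neighbors, GRU-style update
        return x.mean(dim=0)             # pool node states to one graph vector

gnn = TinyGNN()

def clone_score(graph_a, graph_b):
    """Each graph is a (node_features, adjacency) pair; higher = more similar."""
    return torch.cosine_similarity(gnn(*graph_a), gnn(*graph_b), dim=0)
```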
diff --git a/_publications/wang2020learning.markdown b/_publications/wang2020learning.markdown index 16245ac0..01863571 100644 --- a/_publications/wang2020learning.markdown +++ b/_publications/wang2020learning.markdown @@ -4,7 +4,6 @@ title: "Learning Semantic Program Embeddings with Graph Interval Neural Network" authors: Yu Wang, Fengjuan Gao, Linzhang Wang, Ke Wang conference: year: 2020 -bibkey: wang2020learning additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2005.09997"} tags: ["GNN", "defect"] diff --git a/_publications/wang2020learning2.markdown b/_publications/wang2020learning2.markdown new file mode 100644 index 00000000..4817270e --- /dev/null +++ b/_publications/wang2020learning2.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "Learning to Represent Programs with Heterogeneous Graphs" +authors: Wenhan Wang, Kechi Zhang, Ge Li, Zhi Jin +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2012.04188"} +tags: ["GNN", "summarization"] +--- +Program source code contains complex structure information, which can be represented in structured data forms like trees or graphs. To acquire the structural information in source code, most existing research uses abstract syntax trees (AST). A group of works add additional edges to ASTs to convert source code into graphs and use graph neural networks to learn representations for program graphs. Although these works provide additional control or data flow information to ASTs for downstream tasks, they neglect an important aspect of structure information in AST itself: the different types of nodes and edges. In ASTs, different nodes contain different kinds of information like variables or control flow, and the relation between a node and all its children can also be different. + +To capture the information of node and edge types, we bring the idea of heterogeneous graphs to learning on source code and present a new formulation for building heterogeneous program graphs from ASTs with additional type information for nodes and edges. We use the ASDL grammar of the programming language to define the node and edge types of program graphs. Then we use heterogeneous graph neural networks to learn on these graphs. We evaluate our approach on two tasks: code comment generation and method naming. Both tasks require reasoning on the semantics of complete code snippets. Experiment results show that our approach outperforms baseline models, including homogeneous graph-based models, showing that leveraging the type information of nodes and edges in program graphs can help in learning program semantics. diff --git a/_publications/wang2020modular.markdown b/_publications/wang2020modular.markdown new file mode 100644 index 00000000..96bd32bd --- /dev/null +++ b/_publications/wang2020modular.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Modular Tree Network for Source Code Representation Learning" +authors: Wenhan Wang, Ge Li, Sijie Shen, Xin Xia, Zhi Jin +conference: TOSEM +year: 2020 +additional_links: + - {name: "ACM", url: "https://dl.acm.org/doi/10.1145/3409331"} +tags: ["grammar", "representation"] +--- +Learning representation for source code is a foundation of many program analysis tasks. In recent years, neural networks have already shown success in this area, but most existing models did not make full use of the unique structural information of programs.
Although abstract syntax tree (AST)-based neural models can handle the tree structure in the source code, they cannot capture the richness of different types of substructure in programs. In this article, we propose a modular tree network that dynamically composes different neural network units into tree structures based on the input AST. Different from previous tree-structured neural network models, a modular tree network can capture the semantic differences between types of AST substructures. We evaluate our model on two tasks: program classification and code clone detection. Our model achieves the best performance compared with state-of-the-art approaches in both tasks, showing the advantage of leveraging more elaborate structure information of the source code. diff --git a/_publications/wang2020trans.markdown b/_publications/wang2020trans.markdown index ca1ed0e6..49f05064 100644 --- a/_publications/wang2020trans.markdown +++ b/_publications/wang2020trans.markdown @@ -1,12 +1,11 @@ --- layout: publication title: "TranS^3: A Transformer-based Framework for Unifying Code Summarization and Code Search" -authors: W. Wang, Y. Zhang, Z. Zeng, G. Xu +authors: Wenhua Wang, Yuqun Zhang, Zhengran Zeng, Guandong Xu conference: year: 2020 -bibkey: wang2020trans additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2003.03238"} -tags: ["retrieval", "documentation"] +tags: ["search", "documentation"] --- Code summarization and code search have been widely adopted in software development and maintenance. However, few studies have explored the efficacy of unifying them. In this paper, we propose TranS^3, a transformer-based framework to integrate code summarization with code search. Specifically, for code summarization, TranS^3 enables an actor-critic network, where in the actor network, we encode the collected code snippets via transformer- and tree-transformer-based encoders and decode the given code snippet to generate its comment. Meanwhile, we iteratively tune the actor network via the feedback from the critic network for enhancing the quality of the generated comments. Furthermore, we import the generated comments to code search for enhancing its accuracy. To evaluate the effectiveness of TranS^3, we conduct a set of experimental studies and case studies where the experimental results suggest that TranS^3 can significantly outperform multiple state-of-the-art approaches in both code summarization and code search and the study results further strengthen the efficacy of TranS^3 from the developers' points of view. diff --git a/_publications/wang2021codet5.markdown b/_publications/wang2021codet5.markdown new file mode 100644 index 00000000..9b1ba6d4 --- /dev/null +++ b/_publications/wang2021codet5.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation" +authors: Yue Wang, Weishi Wang, Shafiq Joty, Steven C.H. Hoi +conference: EMNLP +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2109.00859"} + - {name: "Code & Model", url: "https://github.com/salesforce/CodeT5"} +tags: ["Transformer"] +--- +Pre-trained models for Natural Languages (NL) like BERT and GPT have been recently shown to transfer well to Programming Languages (PL) and largely benefit a broad set of code-related tasks. Despite their success, most current methods either rely on an encoder-only (or decoder-only) pre-training that is suboptimal for generation (resp.
understanding) tasks or process the code snippet in the same way as NL, neglecting the special characteristics of PL such as token types. We present CodeT5, a unified pre-trained encoder-decoder Transformer model that better leverages the code semantics conveyed from the developer-assigned identifiers. Our model employs a unified framework to seamlessly support both code understanding and generation tasks and allows for multi-task learning. In addition, we propose a novel identifier-aware pre-training task that enables the model to distinguish which code tokens are identifiers and to recover them when they are masked. Furthermore, we propose to exploit the user-written code comments with a bimodal dual generation task for better NL-PL alignment. Comprehensive experiments show that CodeT5 significantly outperforms prior methods on understanding tasks such as code defect detection and clone detection, and generation tasks across various directions including PL-NL, NL-PL, and PL-PL. Further analysis reveals that our model can better capture semantic information from code. Our code and pre-trained models are released at https://github.com/salesforce/CodeT5. diff --git a/_publications/wang2021syncobert.markdown b/_publications/wang2021syncobert.markdown new file mode 100644 index 00000000..1478c85f --- /dev/null +++ b/_publications/wang2021syncobert.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "SynCoBERT: Syntax-Guided Multi-Modal Contrastive Pre-Training for Code Representation" +authors: Xin Wang, Yasheng Wang, Fei Mi, Pingyi Zhou, Yao Wan, Xiao Liu, Li Li, Hao Wu, Jin Liu, Xin Jiang +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2108.04556"} +tags: ["pretraining"] +--- +Code representation learning, which aims to encode the semantics of source code into distributed vectors, plays an important role in recent deep-learning-based models for code intelligence. Recently, many pre-trained language models for source code (e.g., CuBERT and CodeBERT) have been proposed to model the context of code and serve as a basis for downstream code intelligence tasks such as code search, code clone detection, and program translation. Current approaches typically consider the source code as a plain sequence of tokens, or inject the structure information (e.g., AST and data-flow) into the sequential model pre-training. To further explore the properties of programming languages, this paper proposes SynCoBERT, a syntax-guided multi-modal contrastive pre-training approach for better code representations. Specifically, we design two novel pre-training objectives originating from the symbolic and syntactic properties of source code, i.e., Identifier Prediction (IP) and AST Edge Prediction (TEP), which are designed to predict identifiers, and edges between two nodes of AST, respectively. Meanwhile, to exploit the complementary information in semantically equivalent modalities (i.e., code, comment, AST) of the code, we propose a multi-modal contrastive learning strategy to maximize the mutual information among different modalities. Extensive experiments on four downstream tasks related to code intelligence show that SynCoBERT advances the state-of-the-art with the same pre-training corpus and model size.
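The multi-modal contrastive strategy can be illustrated with a generic InfoNCE objective: embeddings of two views of the same snippet (say, its token sequence and a serialized AST) are pulled together while the rest of the batch serves as negatives. This is a sketch of the general recipe, not SynCoBERT's exact loss.

```python
import torch
import torch.nn.functional as F

def info_nce(view_a: torch.Tensor, view_b: torch.Tensor, tau: float = 0.07):
    """Contrastive loss over a batch: matching rows are positive pairs."""
    a = F.normalize(view_a, dim=-1)    # (batch, dim) embeddings of modality A
    b = F.normalize(view_b, dim=-1)    # (batch, dim) embeddings of modality B
    logits = a @ b.t() / tau           # pairwise similarities
    targets = torch.arange(a.size(0))  # positives sit on the diagonal
    return F.cross_entropy(logits, targets)

loss = info_nce(torch.randn(8, 256), torch.randn(8, 256))  # toy batch
```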
\ No newline at end of file diff --git a/_publications/wang2023codet5.markdown b/_publications/wang2023codet5.markdown new file mode 100644 index 00000000..a75b04a2 --- /dev/null +++ b/_publications/wang2023codet5.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "CodeT5+: Open Code Large Language Models for Code Understanding and Generation" +authors: Yue Wang, Hung Le, Akhilesh Deepak Gotmare, Nghi D. Q. Bui, Junnan Li, Steven C. H. Hoi +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2305.07922"} +tags: ["Transformer"] +--- +Large language models (LLMs) pretrained on vast source code have achieved prominent progress in code intelligence. However, existing code LLMs have two main limitations in terms of architecture and pretraining tasks. First, they often adopt a specific architecture (encoder-only or decoder-only) or rely on a unified encoder-decoder network for different downstream tasks. The former paradigm is limited by inflexibility in applications while in the latter, the model is treated as a single system for all tasks, leading to suboptimal performance on a subset of tasks. Secondly, they often employ a limited set of pretraining objectives which might not be relevant to some downstream tasks and hence result in substantial performance degradation. To address these limitations, we propose "CodeT5+", a family of encoder-decoder LLMs for code in which component modules can be flexibly combined to suit a wide range of downstream code tasks. Such flexibility is enabled by our proposed mixture of pretraining objectives to mitigate the pretrain-finetune discrepancy. These objectives cover span denoising, contrastive learning, text-code matching, and causal LM pretraining tasks, on both unimodal and bimodal multilingual code corpora. Furthermore, we propose to initialize CodeT5+ with frozen off-the-shelf LLMs without training from scratch to efficiently scale up our models, and explore instruction-tuning to align with natural language instructions. We extensively evaluate CodeT5+ on over 20 code-related benchmarks in different settings, including zero-shot, finetuning, and instruction-tuning. We observe state-of-the-art (SoTA) model performance on various code-related tasks, such as code generation and completion, math programming, and text-to-code retrieval tasks. Particularly, our instruction-tuned CodeT5+ 16B achieves new SoTA results on the HumanEval code generation task against other open code LLMs. diff --git a/_publications/wang2023deepvd.markdown b/_publications/wang2023deepvd.markdown new file mode 100644 index 00000000..5e797eaf --- /dev/null +++ b/_publications/wang2023deepvd.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "DeepVD: Toward Class-Separation Features for Neural Network Vulnerability Detection" +authors: Wenbo Wang, Tien N. Nguyen, Shaohua Wang, Yi Li, Jiyuan Zhang, Aashish Yadavally +conference: ICSE +year: 2023 +additional_links: + - {name: "website", url: "https://aashishyadavally.github.io/publication/C4"} + - {name: "code", url: "https://github.com/deepvd2022/deepvd2022"} +tags: ["vulnerability"] +--- +The advances of machine learning (ML) including deep learning (DL) have enabled several approaches to implicitly learn vulnerable code patterns to automatically detect software vulnerabilities.
A recent study showed that despite successes, the existing ML/DL-based vulnerability detection (VD) models are limited in the ability to distinguish between the two classes of vulnerable and benign code. We propose DeepVD, a graph-based neural network VD model that emphasizes class-separation features between vulnerable and benign code. DeepVD leverages three types of class-separation features at different levels of abstraction: statement types (similar to Part-of-Speech tagging), Post-Dominator Tree (covering regular flows of execution), and Exception Flow Graph (covering the exception and error-handling flows). We conducted several experiments to evaluate DeepVD in a real-world vulnerability dataset of 303 projects with 13,130 vulnerable methods. Our results show that DeepVD relatively improves over the state-of-the-art ML/DL-based VD approaches by 13%–29.6% in precision, 15.6%–28.9% in recall, and 16.4%–25.8% in F-score. Our ablation study confirms that our designed features and components help DeepVD achieve high class-separability between vulnerable and benign code. diff --git a/_publications/watson2021systematic.markdown b/_publications/watson2021systematic.markdown new file mode 100644 index 00000000..01067564 --- /dev/null +++ b/_publications/watson2021systematic.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: A Systematic Literature Review on the Use of Deep Learning in Software Engineering Research +authors: Cody Watson, Nathan Cooper, David Nader Palacio, Kevin Moran, Denys Poshyvanyk +conference: TSE +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2009.06520"} + - {name: "website", url: "https://wm-semeru.github.io/dl4se/"} + - {name: "code", url: "https://github.com/WM-SEMERU/dl4se"} +tags: ["survey"] +--- +An increasingly popular set of techniques adopted by software engineering (SE) researchers to automate development tasks are those rooted in the concept of Deep Learning (DL). The popularity of such techniques largely stems from their automated feature engineering capabilities, which aid in modeling software artifacts. However, due to the rapid pace at which DL techniques have been adopted, it is difficult to distill the current successes, failures, and opportunities of the current research landscape. In an effort to bring clarity to this crosscutting area of work, from its modern inception to the present, this paper presents a systematic literature review of research at the intersection of SE & DL. The review canvases work appearing in the most prominent SE and DL conferences and journals and spans 128 papers across 23 unique SE tasks. We center our analysis around the components of learning, a set of principles that govern the application of machine learning techniques (ML) to a given problem domain, discussing several aspects of the surveyed work at a granular level. The end result of our analysis is a research roadmap that both delineates the foundations of DL techniques applied to SE research, and highlights likely areas of fertile exploration for the future. diff --git a/_publications/waunakh2019evaluating.markdown b/_publications/waunakh2019evaluating.markdown deleted file mode 100644 index aacea92d..00000000 --- a/_publications/waunakh2019evaluating.markdown +++ /dev/null @@ -1,12 +0,0 @@ ---- -layout: publication -title: "Evaluating Semantic Representations of Source Code" -authors: Y. Wainakh, M. Rauf, M.
Pradel -conference: -year: 2019 -bibkey: waunakh2019evaluating -additional_links: - - {name: "ArXiV", url: "https://arxiv.org/abs/1910.05177"} -tags: ["representation"] ---- -Learned representations of source code enable various software developer tools, e.g., to detect bugs or to predict program properties. At the core of code representations often are word embeddings of identifier names in source code, because identifiers account for the majority of source code vocabulary and convey important semantic information. Unfortunately, there currently is no generally accepted way of evaluating the quality of word embeddings of identifiers, and current evaluations are biased toward specific downstream tasks. This paper presents IdBench, the first benchmark for evaluating to what extent word embeddings of identifiers represent semantic relatedness and similarity. The benchmark is based on thousands of ratings gathered by surveying 500 software developers. We use IdBench to evaluate state-of-the-art embedding techniques proposed for natural language, an embedding technique specifically designed for source code, and lexical string distance functions, as these are often used in current developer tools. Our results show that the effectiveness of embeddings varies significantly across different embedding techniques and that the best available embeddings successfully represent semantic relatedness. On the downside, no existing embedding provides a satisfactory representation of semantic similarities, e.g., because embeddings consider identifiers with opposing meanings as similar, which may lead to fatal mistakes in downstream developer tools. IdBench provides a gold standard to guide the development of novel embeddings that address the current limitations. diff --git a/_publications/waunakh2019idbench.markdown b/_publications/waunakh2019idbench.markdown new file mode 100644 index 00000000..4bc73df9 --- /dev/null +++ b/_publications/waunakh2019idbench.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "IdBench: Evaluating Semantic Representations of Identifier Names in Source Code" +authors: Yaza Wainakh, Moiz Rauf, Michael Pradel +conference: ICSE +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1910.05177"} +tags: ["representation"] +--- +Identifier names convey useful information about the intended semantics of code. Name-based program analyses use this information, e.g., to detect bugs, to predict types, and to improve the readability of code. At the core of name-based analyses are semantic representations of identifiers, e.g., in the form of learned embeddings. The high-level goal of such a representation is to encode whether two identifiers, e.g., len and size, are semantically similar. Unfortunately, it is currently unclear to what extent semantic representations match the semantic relatedness and similarity perceived by developers. This paper presents IdBench, the first benchmark for evaluating semantic representations against a ground truth created from thousands of ratings by 500 software developers. We use IdBench to study state-of-the-art embedding techniques proposed for natural language, an embedding technique specifically designed for source code, and lexical string distance functions. Our results show that the effectiveness of semantic representations varies significantly and that the best available embeddings successfully represent semantic relatedness.
On the downside, no existing technique provides a satisfactory representation of semantic similarities, among other reasons because identifiers with opposing meanings are incorrectly considered to be similar, which may lead to fatal mistakes, e.g., in a refactoring tool. Studying the strengths and weaknesses of the different techniques shows that they complement each other. As a first step toward exploiting this complementarity, we present an ensemble model that combines existing techniques and that clearly outperforms the best available semantic representation. diff --git a/_publications/wei2019code.markdown b/_publications/wei2019code.markdown index 27202b72..e83ac638 100644 --- a/_publications/wei2019code.markdown +++ b/_publications/wei2019code.markdown @@ -1,12 +1,11 @@ --- layout: publication title: "Code Generation as a Dual Task of Code Summarization" -authors: B. Wei, G. Li, X. Xia, Z. Fu, Z. Jin +authors: Bolin Wei, Ge Li, Xin Xia, Zhiyi Fu, Zhi Jin conference: NeurIPS year: 2019 -bibkey: wei2019code additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1910.05923"} -tags: ["generation", "summarization"] +tags: ["code generation", "summarization"] --- Code summarization (CS) and code generation (CG) are two crucial tasks in the field of automatic software development. Various neural network-based approaches are proposed to solve these two tasks separately. However, there exists a specific intuitive correlation between CS and CG, which has not been exploited in previous work. In this paper, we apply the relations between two tasks to improve the performance of both tasks. In other words, exploiting the duality between the two tasks, we propose a dual training framework to train the two tasks simultaneously. In this framework, we consider the dualities on probability and attention weights, and design corresponding regularization terms to constrain the duality. We evaluate our approach on two datasets collected from GitHub, and experimental results show that our dual framework can improve the performance of CS and CG tasks over baselines. diff --git a/_publications/wei2020lambdanet.markdown b/_publications/wei2020lambdanet.markdown index cc7c99ed..12de7074 100644 --- a/_publications/wei2020lambdanet.markdown +++ b/_publications/wei2020lambdanet.markdown @@ -1,12 +1,12 @@ --- layout: publication title: "LambdaNet: Probabilistic Type Inference using Graph Neural Networks" -authors: J. Wei, M. Goyal, G. Durrett, I. Dillig +authors: Jiayi Wei, Maruth Goyal, Greg Durrett, Isil Dillig conference: ICLR year: 2020 -bibkey: wei2020lambdanet additional_links: - {name: "OpenReview", url: "https://openreview.net/forum?id=Hkx6hANtwH&noteId=Hkx6hANtwH"} + - {name: "ArXiV", url: "https://arxiv.org/abs/2005.02161"} - {name: "Code", url: "https://github.com/MrVPlusOne/LambdaNet"} tags: ["GNN", "types"] --- diff --git a/_publications/wei2023typet5.markdown b/_publications/wei2023typet5.markdown new file mode 100644 index 00000000..03b7262a --- /dev/null +++ b/_publications/wei2023typet5.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "TypeT5: Seq2seq Type Inference using Static Analysis" +authors: Jiayi Wei, Greg Durrett, Isil Dillig +conference: ICLR +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2303.09564"} +tags: ["types", "Transformer"] +--- +There has been growing interest in automatically predicting missing type annotations in programs written in Python and JavaScript.
While prior methods have achieved impressive accuracy when predicting the most common types, they often perform poorly on rare or complex types. In this paper, we present a new type inference method that treats type prediction as a code infilling task by leveraging CodeT5, a state-of-the-art seq2seq pre-trained language model for code. Our method uses static analysis to construct dynamic contexts for each code element whose type signature is to be predicted by the model. We also propose an iterative decoding scheme that incorporates previous type predictions in the model's input context, allowing information exchange between related code elements. Our evaluation shows that the proposed approach, TypeT5, not only achieves a higher overall accuracy (particularly on rare and complex types) but also produces more coherent results with fewer type errors -- while enabling easy user intervention. diff --git a/_publications/white2015toward.markdown b/_publications/white2015toward.markdown index 2c8e2793..0c4406d7 100644 --- a/_publications/white2015toward.markdown +++ b/_publications/white2015toward.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Toward Deep Learning Software Repositories" -authors: M. White, C. Vendome, M. Linares-Vásquez, D. Poshyvanyk +authors: Martin White, Christopher Vendome, Mario Linares-Vásquez, Denys Poshyvanyk conference: MSR year: 2015 -bibkey: white2015toward tags: ["representation"] --- Deep learning subsumes algorithms that automatically learn compositional representations. The ability of these diff --git a/_publications/white2016deep.markdown b/_publications/white2016deep.markdown index d19800a3..b31d11da 100644 --- a/_publications/white2016deep.markdown +++ b/_publications/white2016deep.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Deep Learning Code Fragments for Code Clone Detection" -authors: M. White, M. Tufano, C. Vendome, D. Poshyvanyk +authors: Martin White, Michele Tufano, Christopher Vendome, Denys Poshyvanyk conference: ASE year: 2016 -bibkey: white2016deep tags: ["clone"] --- Code clone detection is an important problem for software diff --git a/_publications/white2017sorting.markdown b/_publications/white2017sorting.markdown index 4a8e2341..f4653c43 100644 --- a/_publications/white2017sorting.markdown +++ b/_publications/white2017sorting.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Sorting and Transforming Program Repair Ingredients via Deep Learning Code Similarities" -authors: M. White, M. Tufano, M. Martínez, M. Monperrus, D. Poshyvanyk -conference: +authors: Martin White, Michele Tufano, Matias Martinez, Martin Monperrus, Denys Poshyvanyk +conference: SANER year: 2017 -bibkey: white2017sorting tags: ["repair"] --- In the field of automated program repair, the redundancy assumption claims large programs contain the seeds diff --git a/_publications/wong2021leveraging.markdown b/_publications/wong2021leveraging.markdown new file mode 100644 index 00000000..414b7031 --- /dev/null +++ b/_publications/wong2021leveraging.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Leveraging Language to Learn Program Abstractions and Search Heuristics" +authors: Catherine Wong, Kevin Ellis, Joshua B.
Tenenbaum, Jacob Andreas +conference: ICML +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2106.11053"} + - {name: "Poster", url: "https://icml.cc/Conferences/2021/ScheduleMultitrack?event=10372"} +tags: ["synthesis", "search"] +--- +Inductive program synthesis, or inferring programs from examples of desired behavior, offers a general paradigm for building interpretable, robust, and generalizable machine learning systems. Effective program synthesis depends on two key ingredients: a strong library of functions from which to build programs, and an efficient search strategy for finding programs that solve a given task. We introduce LAPS (Language for Abstraction and Program Search), a technique for using natural language annotations to guide joint learning of libraries and neurally-guided search models for synthesis. When integrated into a state-of-the-art library learning system (DreamCoder), LAPS produces higher-quality libraries and improves search efficiency and generalization on three domains -- string editing, image composition, and abstract reasoning about scenes -- even when no natural language hints are available at test time. \ No newline at end of file diff --git a/_publications/wu2021prototransformer.markdown b/_publications/wu2021prototransformer.markdown new file mode 100644 index 00000000..802b2466 --- /dev/null +++ b/_publications/wu2021prototransformer.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "ProtoTransformer: A Meta-Learning Approach to Providing Student Feedback" +authors: Mike Wu, Noah D. Goodman, Chris Piech, Chelsea Finn +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2107.14035"} +tags: ["Transformer", "education"] +--- +High-quality computer science education is limited by the difficulty of providing instructor feedback to students at scale. While this feedback could in principle be automated, supervised approaches to predicting the correct feedback are bottlenecked by the intractability of annotating large quantities of student code. In this paper, we instead frame the problem of providing feedback as few-shot classification, where a meta-learner adapts to give feedback to student code on a new programming question from just a few examples annotated by instructors. Because data for meta-training is limited, we propose a number of amendments to the typical few-shot learning framework, including task augmentation to create synthetic tasks, and additional side information to build stronger priors about each task. These additions are combined with a transformer architecture to embed discrete sequences (e.g. code) to a prototypical representation of a feedback class label. On a suite of few-shot natural language processing tasks, we match or outperform state-of-the-art performance. Then, on a collection of student solutions to exam questions from an introductory university course, we show that our approach reaches an average precision of 88% on unseen questions, surpassing the 82% precision of teaching assistants. Our approach was successfully deployed to deliver feedback on 16,000 student exam solutions in a programming course offered by a tier 1 university. This is, to the best of our knowledge, the first successful deployment of machine learning-based feedback on open-ended student code.
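
The core prototypical-network idea behind ProtoTransformer is simple enough to sketch. Below is a minimal illustration (not the authors' code): each feedback label is represented by the mean, or "prototype", of the embeddings of its few annotated examples, and new student code is labeled by its nearest prototype. The random embeddings stand in for the output of a code encoder, and the label names are hypothetical.

```python
# A minimal sketch of prototypical classification, assuming an upstream code
# encoder has already produced embeddings; random vectors stand in for those here.
import numpy as np

def prototypes(support_embs: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
    """Average each feedback class's support embeddings into one prototype."""
    return {label: embs.mean(axis=0) for label, embs in support_embs.items()}

def classify(query: np.ndarray, protos: dict[str, np.ndarray]) -> str:
    """Assign the feedback label whose prototype is nearest in Euclidean distance."""
    return min(protos, key=lambda label: np.linalg.norm(query - protos[label]))

rng = np.random.default_rng(0)
support = {  # a few instructor-annotated examples per (hypothetical) feedback class
    "off_by_one": rng.normal(size=(4, 16)),
    "missing_base_case": rng.normal(size=(4, 16)),
}
print(classify(rng.normal(size=16), prototypes(support)))
```

Because prototypes are just averages, adapting to a new exam question only requires embedding a handful of annotated examples; no gradient updates are needed at deployment time.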
diff --git a/_publications/xia2023universal.markdown b/_publications/xia2023universal.markdown new file mode 100644 index 00000000..0f20b845 --- /dev/null +++ b/_publications/xia2023universal.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Universal Fuzzing via Large Language Models" +authors: Chunqiu Steven Xia, Matteo Paltenghi, Jia Le Tian, Michael Pradel, Lingming Zhang +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2308.04748"} +tags: ["fuzzing"] +--- +Fuzzing has achieved tremendous success in discovering bugs and vulnerabilities in various software systems. Systems under test (SUTs) that take in programming or formal language as inputs, e.g., compilers, runtime engines, constraint solvers, and software libraries with accessible APIs, are especially important as they are fundamental building blocks of software development. However, existing fuzzers for such systems often target a specific language, and thus cannot be easily applied to other languages or even other versions of the same language. Moreover, the inputs generated by existing fuzzers are often limited to specific features of the input language, and thus can hardly reveal bugs related to other or new features. This paper presents Fuzz4All, the first fuzzer that is universal in the sense that it can target many different input languages and many different features of these languages. The key idea behind Fuzz4All is to leverage large language models (LLMs) as an input generation and mutation engine, which enables the approach to produce diverse and realistic inputs for any practically relevant language. To realize this potential, we present a novel autoprompting technique, which creates LLM prompts that are well-suited for fuzzing, and a novel LLM-powered fuzzing loop, which iteratively updates the prompt to create new fuzzing inputs. We evaluate Fuzz4All on nine systems under test that take in six different languages (C, C++, Go, SMT2, Java and Python) as inputs. The evaluation shows, across all six languages, that universal fuzzing achieves higher coverage than existing, language-specific fuzzers. Furthermore, Fuzz4All has identified 76 bugs in widely used systems, such as GCC, Clang, Z3, CVC5, OpenJDK, and the Qiskit quantum computing platform, with 47 bugs already confirmed by developers as previously unknown. diff --git a/_publications/xu2019commit.markdown b/_publications/xu2019commit.markdown index 96e1a918..ca64886e 100644 --- a/_publications/xu2019commit.markdown +++ b/_publications/xu2019commit.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Commit Message Generation for Source Code Changes" -authors: S. Xu, Y. Yao, F. Xu, T. Gu, H. Tong, J. Lu +authors: Shengbin Xu, Yuan Yao, Feng Xu, Tianxiao Gu, Hanghang Tong, Jian Lu conference: IJCAI year: 2019 -bibkey: xu2019commit tags: ["edit", "summarization"] --- Commit messages, which summarize the source diff --git a/_publications/xu2019method.markdown b/_publications/xu2019method.markdown index d5358080..970a8d76 100644 --- a/_publications/xu2019method.markdown +++ b/_publications/xu2019method.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Method name suggestion with hierarchical attention networks" -authors: S. Xu, S. Zhang, W. Wang, X. Cao, C. Guo, J. Xu +authors: Sihan Xu, Sen Zhang, Weijing Wang, Xinya Cao, Chenkai Guo, Jing Xu
conference: PEPM year: 2019 -bibkey: xu2019method tags: ["naming"] --- Method renaming is a widely used refactoring operation that improves program comprehension and maintenance. Descriptive method names that summarize functionalities of source code can facilitate program comprehension. Much research has been done to suggest method names through source code summarization. However, unlike natural language, a code snippet consists of basic blocks organized by complicated structures. In this work, we observe a hierarchical structure --- tokens form basic blocks and basic blocks form a code snippet. Based on this observation, we exploit a hierarchical attention network to learn the representation of methods. Specifically, we apply a two-level attention mechanism to learn the importance of each token in a basic block and that of a basic block in a method respectively. We evaluated our approach on 10 open source repositories and compared it against three state-of-the-art approaches. The results on these open-source data show the superiority of our hierarchical attention networks in terms of effectiveness. diff --git a/_publications/xu2020incorporating.markdown b/_publications/xu2020incorporating.markdown index d00a5ceb..f4eeb528 100644 --- a/_publications/xu2020incorporating.markdown +++ b/_publications/xu2020incorporating.markdown @@ -4,10 +4,9 @@ title: "Incorporating External Knowledge through Pre-training for Natural Langua authors: Frank F. Xu, Zhengbao Jiang, Pengcheng Yin, Bogdan Vasilescu, Graham Neubig conference: ACL year: 2020 -bibkey: xu2020incorporating additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2004.09015"} - {name: "Code", url: "https://github.com/neulab/external-knowledge-codegen"} -tags: ["bimodal", "generation"] +tags: ["bimodal", "code generation"] --- Open-domain code generation aims to generate code in a general-purpose programming language (such as Python) from natural language (NL) intents. Motivated by the intuition that developers usually retrieve resources on the web when writing code, we explore the effectiveness of incorporating two varieties of external knowledge into NL-to-code generation: automatically mined NL-code pairs from the online programming QA forum StackOverflow and programming language API documentation. Our evaluations show that combining the two sources with data augmentation and retrieval-based data re-sampling improves the current state-of-the-art by up to 2.2% absolute BLEU score on the code generation testbed CoNaLa. The code and resources are available at [https://github.com/neulab/external-knowledge-codegen](https://github.com/neulab/external-knowledge-codegen).
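
The retrieval-based re-sampling mentioned in the abstract above can be illustrated with a small sketch. The following is a hedged approximation, not the paper's exact setup: mined NL-code pairs are re-weighted by their textual similarity to a target distribution so that in-domain-looking pairs are sampled more often; the TF-IDF representation, the intent strings, and the temperature are all illustrative assumptions.

```python
# A minimal sketch of similarity-weighted re-sampling of mined training pairs.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

mined_intents = ["how to sort a dict by value", "read json file python", "regex match digits"]
target_intents = ["sort list of tuples by second element", "parse a json string"]  # hypothetical

vec = TfidfVectorizer().fit(mined_intents + target_intents)
# For each mined pair, keep its best similarity to any target-domain example.
sim = cosine_similarity(vec.transform(mined_intents), vec.transform(target_intents)).max(axis=1)

temperature = 0.1  # lower temperature -> sharper preference for in-domain pairs
weights = np.exp(sim / temperature)
probs = weights / weights.sum()
resampled = np.random.default_rng(0).choice(len(mined_intents), size=5, p=probs)
print(probs.round(3), resampled)  # indices into the mined corpus, with repetition
```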
diff --git a/_publications/xu2021capturing.markdown b/_publications/xu2021capturing.markdown new file mode 100644 index 00000000..db3498ac --- /dev/null +++ b/_publications/xu2021capturing.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Capturing Structural Locality in Non-parametric Language Models" +authors: Frank F. Xu, Junxian He, Graham Neubig, Vincent J. Hellendoorn +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2110.02870"} +tags: ["language model"] +--- +Structural locality is a ubiquitous feature of real-world datasets, wherein data points are organized into local hierarchies. Some examples include topical clusters in text or project hierarchies in source code repositories. In this paper, we explore utilizing this structural locality within non-parametric language models, which generate sequences that reference retrieved examples from an external source. We propose a simple yet effective approach for adding locality information into such models by adding learned parameters that improve the likelihood of retrieving examples from local neighborhoods. Experiments on two different domains, Java source code and Wikipedia text, demonstrate that locality features improve model efficacy over models without access to these features, with interesting differences. We also perform an analysis of how and where locality features contribute to improved performance and why the traditionally used contextual similarity metrics alone are not enough to grasp the locality structure. diff --git a/_publications/xu2022systematic.markdown b/_publications/xu2022systematic.markdown new file mode 100644 index 00000000..fc1885f2 --- /dev/null +++ b/_publications/xu2022systematic.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "A Systematic Evaluation of Large Language Models of Code" +authors: Frank F. Xu, Uri Alon, Graham Neubig, Vincent J. Hellendoorn +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2202.13169"} +tags: ["Transformer", "language model"] +--- +Large language models (LMs) of code have recently shown tremendous promise in completing code and synthesizing code from natural language descriptions. However, the current state-of-the-art code LMs (e.g., Codex (Chen et al., 2021)) are not publicly available, leaving many questions about their model and data design decisions. We aim to fill in some of these blanks through a systematic evaluation of the largest existing models: Codex, GPT-J, GPT-Neo, GPT-NeoX-20B, and CodeParrot, across various programming languages. Although Codex itself is not open-source, we find that existing open-source models do achieve close results in some programming languages, although they are targeted mainly at natural language modeling. We further identify an important missing piece in the form of a large open-source model trained exclusively on a multi-lingual corpus of code. We release a new model, PolyCoder, with 2.7B parameters based on the GPT-2 architecture, which was trained on 249GB of code across 12 programming languages on a single machine. In the C programming language, PolyCoder outperforms all models including Codex. Our trained models are open-source and publicly available, which enables future research and application in this area.
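
Evaluations like the one above typically score causal LMs by perplexity on held-out code. Below is a minimal sketch of that measurement; "gpt2" is only a stand-in checkpoint, and the assumption is that any open-source code LM published in Hugging Face format (e.g., a PolyCoder release) would follow the same pattern.

```python
# A hedged sketch of per-file perplexity measurement for a causal LM.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")          # stand-in checkpoint
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

code = "int add(int a, int b) {\n    return a + b;\n}\n"
ids = tok(code, return_tensors="pt").input_ids
with torch.no_grad():
    # With labels == input_ids, the model returns the mean token cross-entropy.
    loss = model(ids, labels=ids).loss
print("perplexity:", torch.exp(loss).item())
```

Averaging this quantity over a per-language corpus gives the kind of cross-language comparison the study reports.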
diff --git a/_publications/yadavally2023partial.markdown b/_publications/yadavally2023partial.markdown new file mode 100644 index 00000000..46ab23b5 --- /dev/null +++ b/_publications/yadavally2023partial.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "(Partial) Program Dependence Learning" +authors: Aashish Yadavally, Wenbo Wang, Shaohua Wang, Tien N. Nguyen +conference: ICSE +year: 2023 +additional_links: + - {name: "website", url: "https://aashishyadavally.github.io/publication/C5"} + - {name: "code", url: "https://github.com/aashishyadavally/NeuralPDA"} +tags: ["large language models", "program analysis", "static analysis", "tool"] +--- +Code fragments from developer forums often migrate to applications due to the code reuse practice. Owing to the incomplete nature of such programs, analyzing them early to determine the presence of potential vulnerabilities is challenging. In this work, we introduce NeuralPDA, a neural network-based program dependence analysis tool for both complete and partial programs. Our tool efficiently incorporates intra-statement and inter-statement contextual features into statement representations, thereby modeling program dependence analysis as a statement-pair dependence decoding task. In the empirical evaluation, we report that NeuralPDA predicts the CFG and PDG edges in complete Java and C/C++ code with combined F-scores of 94.29% and 92.46%, respectively. The F-score values for partial Java and C/C++ code range from 94.29%–97.17% and 92.46%–96.01%, respectively. We also test the usefulness of the PDGs predicted by NeuralPDA (i.e., PDG*) on the downstream task of method-level vulnerability detection. We discover that the performance of the vulnerability detection tool utilizing PDG* is only 1.1% less than that utilizing the PDGs generated by a program analysis tool. We also report the detection of 14 real-world vulnerable code snippets from StackOverflow by a machine learning-based vulnerability detection tool that employs the PDGs predicted by NeuralPDA for these code snippets. diff --git a/_publications/yadavally2024learning.markdown b/_publications/yadavally2024learning.markdown new file mode 100644 index 00000000..3a46067e --- /dev/null +++ b/_publications/yadavally2024learning.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "A Learning-Based Approach to Static Program Slicing" +authors: Aashish Yadavally, Yi Li, Shaohua Wang, Tien N. Nguyen +conference: OOPSLA +year: 2024 +additional_links: + - {name: "website", url: "https://aashishyadavally.github.io/assets/pdf/pub-oopsla2024.pdf"} + - {name: "code", url: "https://github.com/aashishyadavally/ns-slicer"} +tags: ["large language models", "program analysis", "static analysis", "tool"] +--- +Traditional program slicing techniques are crucial for early bug detection and manual/automated debugging of online code snippets. Nevertheless, their inability to handle incomplete code hinders their real-world applicability in such scenarios. To overcome these challenges, we present NS-Slicer, a novel learning-based approach that predicts static program slices for both complete and partial code. Our tool leverages a pre-trained language model to exploit its understanding of fine-grained variable-statement dependencies within source code. With this knowledge, given a variable at a specific location and a statement in a code snippet, NS-Slicer determines whether the statement belongs to the backward slice or forward slice, respectively.
We conducted a series of experiments to evaluate NS-Slicer’s performance. On complete code, it predicts the backward and forward slices with an F1-score of 97.41% and 95.82%, respectively, while achieving an overall F1-score of 96.77%. Notably, in 85.20% of the cases, the static program slices predicted by NS-Slicer exactly match entire slices from the oracle. For partial programs, it achieved an F1-score of 96.77%–97.49% for backward slicing, 92.14%–95.40% for forward slicing, and an overall F1-score of 94.66%–96.62%. Furthermore, we demonstrate NS-Slicer’s utility in vulnerability detection (VD), integrating its predicted slices into an automated VD tool. In this setup, the tool detected vulnerabilities in Java code with a high F1-score of 73.38%. We also include analyses of NS-Slicer’s performance and limitations, providing insights into its understanding of intrinsic code properties, such as variable aliasing, that lead to better slicing. diff --git a/_publications/yadavally2024predictive.markdown b/_publications/yadavally2024predictive.markdown new file mode 100644 index 00000000..9f8930b1 --- /dev/null +++ b/_publications/yadavally2024predictive.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Predictive Program Slicing via Execution Knowledge-Guided Dynamic Dependence Learning" +authors: Aashish Yadavally, Yi Li, Tien N. Nguyen +conference: FSE +year: 2024 +additional_links: + - {name: "website", url: "https://aashishyadavally.github.io/assets/pdf/pub-fse2024.pdf"} + - {name: "code", url: "https://github.com/aashishyadavally/nd-slicer"} +tags: ["large language models", "program analysis", "dynamic", "tool"] +--- +Program slicing, the process of extracting program statements that influence values at a designated location (known as the slicing criterion), is helpful in both manual and automated debugging. However, such slicing techniques prove ineffective in scenarios where executing specific inputs is prohibitively expensive, or even impossible, as with partial code. In this paper, we introduce ND-Slicer, a predictive slicing methodology that caters to specific executions based on a particular input, overcoming the need for actual execution. We enable such a process by leveraging execution-aware pre-training to learn the dynamic program dependencies, including both dynamic data and control dependencies between variables in the slicing criterion and the remaining program statements. Such knowledge forms the cornerstone for constructing a predictive backward slice. Our empirical evaluation revealed a high accuracy in predicting program slices, achieving an exact-match accuracy of 81.3% and a ROUGE-LCS F1-score of 95.4% on Python programs. As an extrinsic evaluation, we illustrate ND-Slicer’s usefulness in crash detection, where it locates faults with an accuracy of 63.9%. Furthermore, we include an in-depth qualitative evaluation, assessing ND-Slicer’s understanding of branched structures such as if-else blocks and loops, as well as the control flow in inter-procedural calls. diff --git a/_publications/yadid2016extracting.markdown b/_publications/yadid2016extracting.markdown index ce16e05e..5e91e271 100644 --- a/_publications/yadid2016extracting.markdown +++ b/_publications/yadid2016extracting.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Extracting Code from Programming Tutorial Videos" -authors: S. Yadid, E. Yahav +authors: Shir Yadid, Eran Yahav conference: Onward!
year: 2016 -bibkey: yadid2016extracting tags: ["information extraction"] --- The number of programming tutorial videos on the web diff --git a/_publications/yan2020are.markdown b/_publications/yan2020are.markdown new file mode 100644 index 00000000..d8815855 --- /dev/null +++ b/_publications/yan2020are.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Are the Code Snippets What We Are Searching for? A Benchmark and an Empirical Study on Code Search with Natural-Language Queries" +authors: Shuhan Yan, Hang Yu, Yuting Chen, Beijun Shen, Lingxiao Jiang +conference: SANER +year: 2020 +additional_links: + - { name: "IEEE", url: "https://ieeexplore.ieee.org/document/9054840" } +tags: ["search"] +--- + +Code search methods, especially those that allow programmers to raise queries in a natural language, play an important role in software development. They help to improve programmers' productivity by returning sample code snippets from the Internet and/or source-code repositories for their natural-language queries. Meanwhile, there are many code search methods in the literature that support natural-language queries. Difficulties exist in recognizing the strengths and weaknesses of each method and choosing the right one for different usage scenarios, because (1) the implementations of those methods and the datasets for evaluating them are usually not publicly available, and (2) some methods leverage different training datasets or auxiliary data sources and thus their effectiveness cannot be fairly measured and may be negatively affected in practical uses. To build a common ground for measuring code search methods, this paper builds CosBench, a dataset that consists of 1000 projects, 52 code-independent natural-language queries with ground truths, and a set of scripts for calculating four metrics on code search results. We have evaluated four IR (Information Retrieval)-based and two DL (Deep Learning)-based code search methods on CosBench. The empirical evaluation results clearly show the usefulness of the CosBench dataset and various strengths of each code search method. We found that DL-based methods are more suitable for queries on reusing code, and IR-based ones for queries on resolving bugs and learning API uses. diff --git a/_publications/yang2017language.markdown b/_publications/yang2017language.markdown index 78a36aac..fccc44ba 100644 --- a/_publications/yang2017language.markdown +++ b/_publications/yang2017language.markdown @@ -1,10 +1,9 @@ --- layout: publication title: A Language Model for Statements of Software Code -authors: Y. Yang, Y. Jiang, M. Gu, J. Sun, J. Gao, H.
Liu +authors: Yixiao Yang, Yu Jiang, Ming Gu, Jiaguang Sun, Jian Gao, Han Liu conference: ASE year: 2017 -bibkey: yang2017language additional_links: - {name: "ACM", url: "https://dl.acm.org/citation.cfm?id=3155647"} tags: ["language model"] diff --git a/_publications/yang2020survey.markdown b/_publications/yang2020survey.markdown new file mode 100644 index 00000000..bfa17b11 --- /dev/null +++ b/_publications/yang2020survey.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "A Survey on Deep Learning for Software Engineering" +authors: Yanming Yang, Xin Xia, David Lo, John Grundy +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2011.14597"} +tags: ["survey"] +--- +In 2006, Geoffrey Hinton proposed the concept of training “Deep Neural Networks (DNNs)” and an improved model training method to break the bottleneck of neural network development. More recently, the introduction of AlphaGo in 2016 demonstrated the powerful learning ability of deep learning and its enormous potential. Deep learning has been increasingly used to develop state-of-the-art software engineering (SE) research tools due to its ability to boost performance for various SE tasks. There are many factors, e.g., deep learning model selection, internal structure differences, and model optimization techniques, that may have an impact on the performance of DNNs applied in SE. Few works to date focus on summarizing, classifying, and analyzing the application of deep learning techniques in SE. To fill this gap, we performed a survey to analyze the relevant studies published since 2006. We first provide an example to illustrate how deep learning techniques are used in SE. We then summarize and classify different deep learning techniques used in SE. We analyze key optimization technologies used in these deep learning models, and finally describe a range of key research topics using DNNs in SE. Based on our findings, we present a set of current challenges remaining to be investigated and outline a proposed research road map highlighting key opportunities for future work. diff --git a/_publications/yao2018staqc.markdown b/_publications/yao2018staqc.markdown index 5ba778a6..9d6fbc53 100644 --- a/_publications/yao2018staqc.markdown +++ b/_publications/yao2018staqc.markdown @@ -4,7 +4,6 @@ title: "StaQC: A Systematically Mined Question-Code Dataset from Stack Overflow" authors: Ziyu Yao, Daniel S. Weld, Wei-Peng Chen, Huan Sun conference: WWW 2018 year: 2018 -bibkey: yao2018staqc additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1803.09371"} - {name: "code", url: "https://github.com/LittleYUYU/StackOverflow-Question-Code-Dataset"} diff --git a/_publications/yao2019coacor.markdown b/_publications/yao2019coacor.markdown index 858e538d..0a67dfa1 100644 --- a/_publications/yao2019coacor.markdown +++ b/_publications/yao2019coacor.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "CoaCor: Code Annotation for Code Retrieval with Reinforcement Learning" -authors: Z Yao, JR Peddamail, H.
Sun +authors: Ziyu Yao, Jayavardhan Reddy Peddamail, Huan Sun conference: year: 2019 -bibkey: yao2019coacor tags: ["search"] --- To accelerate software development, much research has been performed diff --git a/_publications/yasunaga2020graph.markdown b/_publications/yasunaga2020graph.markdown index f50bc103..4f46a739 100644 --- a/_publications/yasunaga2020graph.markdown +++ b/_publications/yasunaga2020graph.markdown @@ -4,7 +4,6 @@ title: "Graph-based, Self-Supervised Program Repair from Diagnostic Feedback" authors: Michihiro Yasunaga, Percy Liang conference: year: 2020 -bibkey: yasunaga2020graph additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2005.10636"} tags: ["repair", "edit", "GNN"] diff --git a/_publications/ye2020leveraging.markdown b/_publications/ye2020leveraging.markdown new file mode 100644 index 00000000..d74a7bd3 --- /dev/null +++ b/_publications/ye2020leveraging.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Leveraging Code Generation to Improve Code Retrieval and Summarization via Dual Learning" +authors: Wei Ye, Rui Xie, Jinglei Zhang, Tianxiang Hu, Xiaoyin Wang, Shikun Zhang +conference: WWW +year: 2020 +additional_links: + - { name: "ArXiV", url: "https://arxiv.org/abs/2002.10198" } +tags: ["search", "summarization"] +--- + +Code summarization generates a brief natural language description given a source code snippet, while code retrieval fetches relevant source code given a natural language query. Since both tasks aim to model the association between natural language and programming language, recent studies have combined these two tasks to improve their performance. However, researchers have not yet been able to effectively leverage the intrinsic connection between the two tasks as they train these tasks in a separate or pipeline manner, which means their performance cannot be well balanced. In this paper, we propose a novel end-to-end model for the two tasks by introducing an additional code generation task. More specifically, we explicitly exploit the probabilistic correlation between code summarization and code generation with dual learning, and utilize the two encoders for code summarization and code generation to train the code retrieval task via multi-task learning. We have carried out extensive experiments on an existing dataset of SQL and Python, and results show that our model can significantly improve the results of the code retrieval task over state-of-the-art models, as well as achieve competitive performance in terms of BLEU score for the code summarization task. diff --git a/_publications/ye2020misim.markdown b/_publications/ye2020misim.markdown new file mode 100644 index 00000000..4bd0a8c3 --- /dev/null +++ b/_publications/ye2020misim.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "MISIM: An End-to-End Neural Code Similarity System" +authors: Fangke Ye, Shengtian Zhou, Anand Venkat, Ryan Marcus, Nesime Tatbul, Jesmin Jahan Tithi, Paul Petersen, Timothy Mattson, Tim Kraska, Pradeep Dubey, Vivek Sarkar, Justin Gottschlich +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2006.05265"} +tags: ["code similarity"] +--- +Code similarity systems are integral to a range of applications from code recommendation to automated construction of software tests and defect mitigation. In this paper, we present Machine Inferred Code Similarity (MISIM), a novel end-to-end code similarity system that consists of two core components.
First, MISIM uses a novel context-aware similarity structure, which is designed to aid in lifting semantic meaning from code syntax. Second, MISIM provides a neural-based code similarity scoring system, which can be implemented with various neural network algorithms and topologies with learned parameters. We compare MISIM to three other state-of-the-art code similarity systems: (i) code2vec, (ii) Neural Code Comprehension, and (iii) Aroma. In our experimental evaluation across 45,780 programs, MISIM consistently outperformed all three systems, often by a large factor (upwards of 40.6x). diff --git a/_publications/ye2021neural.markdown b/_publications/ye2021neural.markdown new file mode 100644 index 00000000..71bceb57 --- /dev/null +++ b/_publications/ye2021neural.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Neural Program Repair with Execution-based Backpropagation" +authors: He Ye, Matias Martinez, Martin Monperrus +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2105.04123"} +tags: ["repair"] +--- +Neural machine translation (NMT) architectures have achieved promising results for automatic program repair. Yet, they have the limitation of generating low-quality patches (e.g., patches that do not compile). This is because the existing works only optimize a purely syntactic loss function based on characters and tokens without incorporating program-specific information during neural net weight optimization. In this paper, we propose a novel program repair model called RewardRepair. The core novelty of RewardRepair is to improve NMT-based program repair with a loss function based on program compilation and test execution information, rewarding the network to produce patches that compile and that do not overfit. We conduct several experiments to evaluate RewardRepair showing that it is feasible and effective to use compilation and test execution results to optimize the underlying neural repair model. In total, RewardRepair correctly repairs 43 Defects4J bugs, including eight that are fixed for the first time. diff --git a/_publications/ye2022selfapr.markdown b/_publications/ye2022selfapr.markdown new file mode 100644 index 00000000..65b9d363 --- /dev/null +++ b/_publications/ye2022selfapr.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "SelfAPR: Self-supervised Program Repair with Test Execution Diagnostics" +authors: He Ye, Matias Martinez, Xiapu Luo, Tao Zhang, Martin Monperrus +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2203.12755"} +tags: ["repair", "execution"] +--- +Neural program repair has achieved good results in a recent series of papers. Yet, we observe that the related work fails to repair some bugs because of a lack of knowledge about 1) the program being repaired, and 2) the actual fault being repaired. In this paper, we solve both problems by changing the learning paradigm from supervised training to self-supervised training in an approach called SelfAPR. First, SelfAPR generates and constructs training samples by perturbing a previous version of the program being repaired, enforcing the neural model to capture project-specific knowledge. This is different from all the existing work based on past commits. Second, SelfAPR extracts and encodes test execution diagnostics into the input representation, steering the neural model to fix the specific kind of fault.
This is different from the existing studies that only consider static source code in the input. We implement SelfAPR and evaluate it in a systematic manner. We train SelfAPR with 253,411 training samples obtained by perturbing 17 open-source projects. We evaluate SelfAPR on 818 bugs from Defects4J, and SelfAPR correctly repairs 112 of them. diff --git a/_publications/yefet2019adversarial.markdown b/_publications/yefet2019adversarial.markdown index d2381371..02b76b43 100644 --- a/_publications/yefet2019adversarial.markdown +++ b/_publications/yefet2019adversarial.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Adversarial Examples for Models of Code" -authors: N. Yefet, U. Alon, E. Yahav +authors: Noam Yefet, Uri Alon, Eran Yahav conference: year: 2019 -bibkey: yefet2019adversarial additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1910.07517"} tags: ["adversarial"] diff --git a/_publications/yin2017syntactic.markdown b/_publications/yin2017syntactic.markdown index 108041a3..436f0926 100644 --- a/_publications/yin2017syntactic.markdown +++ b/_publications/yin2017syntactic.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "A Syntactic Neural Model for General-Purpose Code Generation" -authors: P. Yin, G. Neubig +authors: Pengcheng Yin, Graham Neubig conference: ACL year: 2017 -bibkey: yin2017syntactic -tags: ["generation", "AST", "bimodal"] +tags: ["code generation", "grammar", "bimodal"] --- We consider the problem of parsing natural language descriptions into source code written in a general-purpose programming diff --git a/_publications/yin2018mining.markdown b/_publications/yin2018mining.markdown index b0b465f1..1c6e9513 100644 --- a/_publications/yin2018mining.markdown +++ b/_publications/yin2018mining.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning to Mine Aligned Code and Natural Language Pairs from Stack Overflow" -authors: P. Yin, B. Deng, E. Chen, B. Vasilescu, G. Neubig +authors: Pengcheng Yin, Bowen Deng, Edgar Chen, Bogdan Vasilescu, Graham Neubig conference: MSR year: 2018 -bibkey: yin2018mining additional_links: - {name: "data", url: "https://conala-corpus.github.io/"} tags: ["dataset"] diff --git a/_publications/yin2019learning.markdown b/_publications/yin2019learning.markdown index 9d89efbb..ddaa290e 100644 --- a/_publications/yin2019learning.markdown +++ b/_publications/yin2019learning.markdown @@ -1,11 +1,11 @@ --- layout: publication title: "Learning to Represent Edits" -authors: P. Yin, G. Neubig, M. Allamanis, M. Brockschmidt, A. L. Gaunt +authors: Pengcheng Yin, Graham Neubig, Miltiadis Allamanis, Marc Brockschmidt, Alexander L.
Gaunt conference: ICLR year: 2019 -bibkey: yin2019learning additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1810.13337"} - {name: "data extraction", url: "https://github.com/Microsoft/msrc-dpu-learning-to-represent-edits"} - {name: "code edit data", url: "http://www.cs.cmu.edu/~pengchey/githubedits.zip"} tags: ["edit"] diff --git a/_publications/yin2022natural.markdown b/_publications/yin2022natural.markdown new file mode 100644 index 00000000..da39d6cf --- /dev/null +++ b/_publications/yin2022natural.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Natural Language to Code Generation in Interactive Data Science Notebooks" +authors: Pengcheng Yin, Wen-Ding Li, Kefan Xiao, Abhishek Rao, Yeming Wen, Kensen Shi, Joshua Howland, Paige Bailey, Michele Catasta, Henryk Michalewski, Alex Polozov, Charles Sutton +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2212.09248"} +tags: ["notebook", "evaluation"] +--- +Computational notebooks, such as Jupyter notebooks, are interactive computing environments that are ubiquitous among data scientists to perform data wrangling and analytic tasks. To measure the performance of AI pair programmers that automatically synthesize programs for those tasks given natural language (NL) intents from users, we build ARCADE, a benchmark of 1082 code generation problems using the pandas data analysis framework in data science notebooks. ARCADE features multiple rounds of NL-to-code problems from the same notebook. It requires a model to understand rich multi-modal contexts, such as existing notebook cells and their execution states as well as previous turns of interaction. To establish a strong baseline on this challenging task, we develop PaChiNCo, a 62B code language model (LM) for Python computational notebooks, which significantly outperforms public code LMs. Finally, we explore few-shot prompting strategies to elicit better code with step-by-step decomposition and NL explanation, showing the potential to improve the diversity and explainability of model predictions. diff --git a/_publications/yonai2019mercem.markdown b/_publications/yonai2019mercem.markdown index 2a1c9984..005ede5e 100644 --- a/_publications/yonai2019mercem.markdown +++ b/_publications/yonai2019mercem.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Mercem: Method Name Recommendation Based on Call Graph Embedding" -authors: H. Yonai, Y. Hayase, H. Kitagawa +authors: Hiroshi Yonai, Yasuhiro Hayase, Hiroyuki Kitagawa conference: year: 2019 -bibkey: yonai2019mercem additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1907.05690"} tags: ["naming", "representation", "refactoring"] diff --git a/_publications/yuan2017abridging.markdown b/_publications/yuan2017abridging.markdown index 0cbda96b..0420e19b 100644 --- a/_publications/yuan2017abridging.markdown +++ b/_publications/yuan2017abridging.markdown @@ -1,10 +1,9 @@ --- layout: publication title: Abridging Source Code -authors: B. Yuan, V. Murali, C. 
Jermain +authors: Binhang Yuan, Vijayaraghavan Murali, Christopher Jermaine conference: OOPSLA year: 2017 -bibkey: yuan2017abridging additional_links: - {name: "ACM", url: "https://dl.acm.org/citation.cfm?id=3133882"} tags: ["summarization"] --- diff --git a/_publications/zaremba2014learning.markdown b/_publications/zaremba2014learning.markdown index 9d2f4961..a697ced4 100644 --- a/_publications/zaremba2014learning.markdown +++ b/_publications/zaremba2014learning.markdown @@ -1,10 +1,11 @@ --- layout: publication title: "Learning to Execute" -authors: W. Zaremba, I. Sutskever -conference: ArXiV 1410.4615 +authors: Wojciech Zaremba, Ilya Sutskever +conference: +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1410.4615"} year: 2014 -bibkey: zaremba2014learning -tags: ["representation"] +tags: ["execution", "representation"] --- Recurrent Neural Networks (RNNs) with Long Short-Term Memory units (LSTM) are widely used because they are expressive and are easy to train. Our interest lies in empirically evaluating the expressiveness and the learnability of LSTMs in the sequence-to-sequence regime by training them to evaluate short computer programs, a domain that has traditionally been seen as too complex for neural networks. We consider a simple class of programs that can be evaluated with a single left-to-right pass using constant memory. Our main result is that LSTMs can learn to map the character-level representations of such programs to their correct outputs. Notably, it was necessary to use curriculum learning, and while conventional curriculum learning proved ineffective, we developed a new variant of curriculum learning that improved our networks' performance in all experimental conditions. The improved curriculum had a dramatic impact on an addition problem, making it possible to train an LSTM to add two 9-digit numbers with 99% accuracy. diff --git a/_publications/zeng2022extensive.markdown b/_publications/zeng2022extensive.markdown new file mode 100644 index 00000000..f9418aa2 --- /dev/null +++ b/_publications/zeng2022extensive.markdown @@ -0,0 +1,38 @@ +--- +layout: publication +title: "An Extensive Study on Pre-trained Models for Program Understanding and Generation" +authors: Zhengran Zeng, Hanzhuo Tan, Haotian Zhang, Jing Li, Yuqun Zhang, Lingming Zhang +conference: ISSTA +year: 2022 +additional_links: + - {name: "Author Version", url: "http://lingming.cs.illinois.edu/publications/issta2022.pdf"} +tags: ["Transformer", "evaluation"] +--- +Automatic program understanding and generation techniques could +significantly advance the productivity of programmers and have +been widely studied by academia and industry. Recently, the advent of the pre-trained paradigm enlightens researchers to develop +general-purpose pre-trained models which can be applied for a +broad range of program understanding and generation tasks. Such +pre-trained models, derived by self-supervised objectives on large +unlabelled corpora, can be fine-tuned in downstream tasks (such +as code search and code generation) with minimal adaptations. Although these pre-trained models claim superiority over the prior +techniques, they seldom follow equivalent evaluation protocols, e.g., +they are hardly evaluated on the identical benchmarks, tasks, or settings.
Consequently, there is a pressing need for a comprehensive +study of the pre-trained models on their effectiveness, versatility +as well as the limitations to provide implications and guidance for +the future development in this area. To this end, we first perform +an extensive study of eight open-access pre-trained models over +a large benchmark on seven representative code tasks to assess +their reproducibility. We further compare the pre-trained models +and domain-specific state-of-the-art techniques for validating pre-training effectiveness. At last, we investigate the robustness of the +pre-trained models by inspecting their performance variations under adversarial attacks. Through the study, we find that while we +can in general replicate the original performance of the pre-trained +models on their evaluated tasks and adopted benchmarks, subtle +performance fluctuations can refute the findings in their original +papers. Moreover, none of the existing pre-trained models can dominate over all other models. We also find that the pre-trained models +can significantly outperform non-pre-trained state-of-the-art techniques in program understanding tasks. Furthermore, we perform +the first study for natural language-programming language pre-trained model robustness via adversarial attacks and find that a +simple random attack approach can easily fool the state-of-the-art +pre-trained models and thus incur security issues. Finally, we also +provide multiple practical guidelines for advancing future research +on pre-trained models for program understanding and generation. diff --git a/_publications/zhang2019learning.markdown b/_publications/zhang2019learning.markdown index b428a24b..20b46dd2 100644 --- a/_publications/zhang2019learning.markdown +++ b/_publications/zhang2019learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning Uniform Semantic Features for Natural Language and Programming Language Globally, Locally and Sequentially" -authors: Y. Zhang, W. Zheng, M. Li +authors: Yudong Zhang, Wenhao Zheng, Ming Li conference: AAAI year: 2019 -bibkey: zhang2019learning tags: ["representation", "bimodal"] --- Semantic feature learning for natural language and programming language is a preliminary step in addressing many software mining tasks. Many existing methods leverage diff --git a/_publications/zhang2019novel.markdown b/_publications/zhang2019novel.markdown index b6f4924e..e4ae7613 100644 --- a/_publications/zhang2019novel.markdown +++ b/_publications/zhang2019novel.markdown @@ -1,12 +1,11 @@ --- layout: publication title: "A Novel Neural Source Code Representation based on Abstract Syntax Tree" -authors: J. Zhang, X. Wang, H. Zhang, H Sun, K. Wang, X. Liu +authors: Jian Zhang, Xu Wang, Hongyu Zhang, Hailong Sun, Kaixuan Wang, Xudong Liu conference: ICSE year: 2019 -bibkey: zhang2019novel additional_links: - {name: "PDF", url: "http://xuwang.tech/paper/astnn_icse2019.pdf"} -tags: ["representation", "AST"] +tags: ["representation", "grammar"] --- Exploiting machine learning techniques for analyzing programs has attracted much attention. One key problem is how to represent code fragments well for follow-up analysis. Traditional information retrieval based methods often treat programs as natural language texts, which could miss important semantic information of source code. Recently, state-of-the-art studies demonstrate that abstract syntax tree (AST) based neural models can better represent source code.
However, the sizes of ASTs are usually large and the existing models are prone to the long-term dependency problem. In this paper, we propose a novel AST-based Neural Network (ASTNN) for source code representation. Unlike existing models that work on entire ASTs, ASTNN splits each large AST into a sequence of small statement trees, and encodes the statement trees to vectors by capturing the lexical and syntactical knowledge of statements. Based on the sequence of statement vectors, a bidirectional RNN model is used to leverage the naturalness of statements and finally produce the vector representation of a code fragment. We have applied our neural network based source code representation method to two common program comprehension tasks: source code classification and code clone detection. Experimental results on the two tasks indicate that our model is superior to state-of-the-art approaches. diff --git a/_publications/zhang2020generating.markdown b/_publications/zhang2020generating.markdown index 2da15e35..ae9a1ea9 100644 --- a/_publications/zhang2020generating.markdown +++ b/_publications/zhang2020generating.markdown @@ -1,12 +1,11 @@ --- layout: publication title: "Generating Adversarial Examples for Holding Robustness of Source Code Processing Models" -authors: H. Zhang, Z. Li, G. Li, L. Ma, Y. Liu, Z. Jin +authors: Huangzhao Zhang, Zhuo Li, Ge Li, Lei Ma, Yang Liu, Zhi Jin conference: AAAI year: 2020 -bibkey: zhang2020generating additional_links: - - {name: "Proceedings", url: "https://www.aaai.org/Papers/AAAI/2020GB/AAAI-ZhangH.6730.pdf"} + - {name: "Proceedings", url: "https://ojs.aaai.org/index.php/AAAI/article/view/5469"} tags: ["adversarial"] --- Automated processing, analysis, and generation of source code are among the key activities diff --git a/_publications/zhang2021bag.markdown b/_publications/zhang2021bag.markdown new file mode 100644 index 00000000..2578e786 --- /dev/null +++ b/_publications/zhang2021bag.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Bag-of-Words Baselines for Semantic Code Search" +authors: Xinyu Zhang, Ji Xin, Andrew Yates, Jimmy Lin +conference: NLP4Prog +year: 2021 +additional_links: + - {name: "PDF", url: "https://aclanthology.org/2021.nlp4prog-1.10.pdf"} +tags: ["search"] +--- +The task of semantic code search is to retrieve code snippets from a source code corpus based on an information need expressed in natural language. The semantic gap between natural language and programming languages has long been regarded as one of the most significant obstacles to the effectiveness of keyword-based information retrieval (IR) methods. It is a common assumption that “traditional” bag-of-words IR methods are poorly suited for semantic code search: our work empirically investigates this assumption. Specifically, we examine the effectiveness of two traditional IR methods, namely BM25 and RM3, on the CodeSearchNet Corpus, which consists of natural language queries paired with relevant code snippets. We find that the two keyword-based methods outperform several pre-BERT neural models. We also compare several code-specific data pre-processing strategies and find that specialized tokenization improves effectiveness.
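
The bag-of-words baseline above is easy to reproduce in miniature. The sketch below is an illustration, not the paper's implementation: standard BM25 scoring combined with code-aware tokenization that splits camelCase and snake_case identifiers, the kind of specialized tokenization the paper finds helpful. The toy corpus stands in for CodeSearchNet.

```python
# A self-contained BM25 sketch with identifier-splitting tokenization.
import math
import re
from collections import Counter

def tokenize(code: str) -> list[str]:
    words = re.findall(r"[A-Za-z]+", code)  # also splits snake_case at underscores
    subtokens = []
    for w in words:
        subtokens += re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?![a-z])", w)  # camelCase split
    return [t.lower() for t in subtokens]

def bm25_scores(query: str, docs: list[str], k1: float = 1.2, b: float = 0.75) -> list[float]:
    doc_tokens = [tokenize(d) for d in docs]
    avgdl = sum(map(len, doc_tokens)) / len(doc_tokens)
    df = Counter(t for toks in doc_tokens for t in set(toks))  # document frequencies
    n = len(docs)
    scores = []
    for toks in doc_tokens:
        tf = Counter(toks)
        s = 0.0
        for t in tokenize(query):
            if t not in tf:
                continue
            idf = math.log(1 + (n - df[t] + 0.5) / (df[t] + 0.5))
            s += idf * tf[t] * (k1 + 1) / (tf[t] + k1 * (1 - b + b * len(toks) / avgdl))
        scores.append(s)
    return scores

docs = ["def readJsonFile(path): ...", "def sort_by_value(d): ...", "class HttpClient: ..."]
print(bm25_scores("read json file", docs))  # the first snippet should score highest
```

Without the identifier split, the query term "json" would never match the token "readJsonFile", which is exactly why tokenization matters so much for this baseline.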
diff --git a/_publications/zhang2021disentangled.md b/_publications/zhang2021disentangled.md new file mode 100644 index 00000000..318484b6 --- /dev/null +++ b/_publications/zhang2021disentangled.md @@ -0,0 +1,11 @@ +--- +layout: publication +title: Disentangled Code Representation Learning for Multiple Programming Languages +authors: Jingfeng Zhang, Haiwen Hong, Yin Zhang, Yao Wan, Ye Liu, Yulei Sui +conference: ACL +year: 2021 +additional_links: + - {name: "Proceedings", url: "https://aclanthology.org/2021.findings-acl.391/"} +tags: ["representation"] +--- +Developing effective distributed representations of source code is fundamental yet challenging for many software engineering tasks such as code clone detection, code search, code translation and transformation. However, current code embedding approaches that represent the semantic and syntax of code in a mixed way are less interpretable and the resulting embedding can not be easily generalized across programming languages. In this paper, we propose a disentangled code representation learning approach to separate the semantic from the syntax of source code under a multi-programming-language setting, obtaining better interpretability and generalizability. Specifically, we design three losses dedicated to the characteristics of source code to enforce the disentanglement effectively. We conduct comprehensive experiments on a real-world dataset composed of programming exercises implemented by multiple solutions that are semantically identical but grammatically distinct. The experimental results validate the superiority of our proposed disentangled code representation, compared to several baselines, across three types of downstream tasks, i.e., code clone detection, code translation, and code-to-code search. \ No newline at end of file diff --git a/_publications/zhang2022coditt5.markdown b/_publications/zhang2022coditt5.markdown new file mode 100644 index 00000000..99e60ac7 --- /dev/null +++ b/_publications/zhang2022coditt5.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "CoditT5: Pretraining for Source Code and Natural Language Editing" +authors: Jiyang Zhang, Sheena Panthaplackel, Pengyu Nie, Junyi Jessy Li, Milos Gligoric +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2208.05446"} +tags: ["Transformer", "edit"] +--- +Pretrained language models have been shown to be effective in many software-related generation tasks; however, they are not well-suited for editing tasks as they are not designed to reason about edits. To address this, we propose a novel pretraining objective which explicitly models edits and use it to build CoditT5, a large language model for software-related editing tasks that is pretrained on large amounts of source code and natural language comments. We fine-tune it on various downstream editing tasks, including comment updating, bug fixing, and automated code review. By outperforming pure generation-based models, we demonstrate the generalizability of our approach and its suitability for editing tasks. We also show how a pure generation model and our edit-based model can complement one another through simple reranking strategies, with which we achieve state-of-the-art performance for the three downstream editing tasks.
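
The "simple reranking" the abstract above refers to can be sketched generically. This is a hedged illustration under assumed inputs, not the paper's recipe: candidates are re-scored by interpolating length-normalized log-probabilities from a pure generation model and an edit-based model such as CoditT5; the scores, lengths, and interpolation weight below are all made up for the example.

```python
# A minimal sketch of two-model reranking over generated edit candidates.
def rerank(candidates, gen_logprobs, edit_logprobs, lengths, alpha=0.5):
    """Sort candidate indices by an interpolation of per-token model scores."""
    def score(i):
        gen = gen_logprobs[i] / lengths[i]    # length-normalize so long edits are not penalized
        edit = edit_logprobs[i] / lengths[i]
        return alpha * gen + (1 - alpha) * edit
    return sorted(range(len(candidates)), key=score, reverse=True)

cands = ["fix: check for null before dereference", "update comment", "remove dead code"]
order = rerank(cands, gen_logprobs=[-12.0, -9.5, -15.2],
               edit_logprobs=[-10.1, -11.8, -13.0], lengths=[7, 3, 4])
print([cands[i] for i in order])
```

The appeal of this design is that each model compensates for the other's blind spots: the generation model favors fluent outputs, while the edit model favors outputs consistent with the observed change.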
diff --git a/_publications/zhang2023repocoder.markdown b/_publications/zhang2023repocoder.markdown new file mode 100644 index 00000000..5de5ff42 --- /dev/null +++ b/_publications/zhang2023repocoder.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation" +authors: Fengji Zhang, Bei Chen, Yue Zhang, Jin Liu, Daoguang Zan, Yi Mao, Jian-Guang Lou, Weizhu Chen +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2303.12570"} + - {name: "Code", url: "https://github.com/microsoft/CodeT/tree/main/RepoCoder"} +tags: ["completion", "Transformer", "retrieval"] +--- +The task of repository-level code completion is to continue writing the unfinished code based on a broader context of the repository. However, it is difficult for automated code completion tools to utilize the useful information scattered across different files. We propose RepoCoder, a simple, generic, and effective framework to address the challenge. It streamlines the repository-level code completion process by incorporating a similarity-based retriever and a pre-trained code language model, which allows for the effective utilization of repository-level information for code completion and grants the ability to generate code at various levels of granularity. Furthermore, RepoCoder utilizes a novel iterative retrieval-generation paradigm that bridges the gap between retrieval context and the intended completion target. We also propose a new benchmark RepoEval, which consists of the latest and high-quality real-world repositories covering line, API invocation, and function body completion scenarios. We test the performance of RepoCoder by using various combinations of code retrievers and generators. Experimental results indicate that RepoCoder significantly improves the zero-shot code completion baseline by over 10% in all settings and consistently outperforms the vanilla retrieval-augmented code completion approach. Furthermore, we validate the effectiveness of RepoCoder through comprehensive analysis, providing valuable insights for future research. diff --git a/_publications/zhao2018neural.markdown b/_publications/zhao2018neural.markdown index b8e42d65..91e84a63 100644 --- a/_publications/zhao2018neural.markdown +++ b/_publications/zhao2018neural.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Neural-Augmented Static Analysis of Android Communication" -authors: J. Zhao, A. Albarghouthi, V. Rastogi, S. Jha, D. Octeau +authors: Jinman Zhao, Aws Albarghouthi, Vaibhav Rastogi, Somesh Jha, Damien Octeau conference: FSE year: 2018 -bibkey: zhao2018neural additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1809.04059"} tags: ["program analysis"] diff --git a/_publications/zhao2019neural.markdown b/_publications/zhao2019neural.markdown index 258d4cee..36c8bea9 100644 --- a/_publications/zhao2019neural.markdown +++ b/_publications/zhao2019neural.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Neural Networks for Modeling Source Code Edits" -authors: R. Zhao, D. Bieber, K. Swersky, D.
Tarlow +authors: Rui Zhao, David Bieber, Kevin Swersky, Daniel Tarlow conference: year: 2019 -bibkey: zhao2019neural additional_links: - {name: "OpenReview", url: "https://openreview.net/forum?id=Sklr9i09KQ"} - {name: "ArXiV", url: "https://arxiv.org/abs/1904.02818"} diff --git a/_publications/zhong2018generating.markdown index 80d09ed9..e4df8893 100644 --- a/_publications/zhong2018generating.markdown +++ b/_publications/zhong2018generating.markdown @@ -1,13 +1,12 @@ --- layout: publication title: "Generating Regular Expressions from Natural Language Specifications: Are We There Yet?" -authors: Z. Zhong, J. Guo, W. Yang, T. Xie, JG Lou, Y. Liu, D. Zhang +authors: Zexuan Zhong, Jiaqi Guo, Wei Yang, Tao Xie, Jian-Guang Lou, Ting Liu, Dongmei Zhang conference: NLSE year: 2018 -bibkey: zhong2018generating additional_links: - {name: "PDF", url: "http://taoxie.cs.illinois.edu/publications/nl4se18-regex.pdf"} -tags: ["bimodal", "generation"] +tags: ["bimodal", "code generation"] --- Recent state-of-the-art approaches automatically generate regular expressions from natural language specifications. diff --git a/_publications/zhong2020semantic.markdown index 49830699..4a260f1d 100644 --- a/_publications/zhong2020semantic.markdown +++ b/_publications/zhong2020semantic.markdown @@ -4,9 +4,8 @@ title: "Semantic Scaffolds for Pseudocode-to-Code Generation" authors: Ruiqi Zhong, Mitchell Stern, Dan Klein conference: year: 2020 -bibkey: zhong2020semantic additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2005.05927"} -tags: ["generation", "synthesis"] +tags: ["code generation", "synthesis"] --- We propose a method for program generation based on semantic scaffolds, lightweight structures representing the high-level semantic and syntactic composition of a program. By first searching over plausible scaffolds then using these as constraints for a beam search over programs, we achieve better coverage of the search space when compared with existing techniques. We apply our hierarchical search method to the SPoC dataset for pseudocode-to-code generation, in which we are given line-level natural language pseudocode annotations and aim to produce a program satisfying execution-based test cases. By using semantic scaffolds during inference, we achieve a 10% absolute improvement in top-100 accuracy over the previous state-of-the-art. Additionally, we require only 11 candidates to reach the top-3000 performance of the previous best approach when tested against unseen problems, demonstrating a substantial improvement in efficiency. diff --git a/_publications/zhou2019devign.markdown b/_publications/zhou2019devign.markdown new file mode 100644 index 00000000..88c2af98 --- /dev/null +++ b/_publications/zhou2019devign.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Devign: Effective Vulnerability Identification by Learning Comprehensive Program Semantics via Graph Neural Networks" +authors: Yaqin Zhou, Shangqing Liu, Jingkai Siow, Xiaoning Du, Yang Liu +conference: NeurIPS +year: 2019 +additional_links: + - {name: "Paper", url: "http://papers.nips.cc/paper/9209-devign-effective-vulnerability-identification-by-learning-comprehensive-program-semantics-via-graph-neural-networks"} +tags: ["GNN", "static analysis"] +--- +Vulnerability identification is crucial to protect the software systems from attacks for cyber security.
It is especially important to localize the vulnerable functions among the source code to facilitate the fix. However, it is a challenging and tedious process, and also requires specialized security expertise. Inspired by the work on manually-defined patterns of vulnerabilities from various code representation graphs and the recent advance on graph neural networks, we propose Devign, a general graph neural network based model for graph-level classification through learning on a rich set of code semantic representations. It includes a novel Conv module to efficiently extract useful features in the learned rich node representations for graph-level classification. The model is trained over manually labeled datasets built on 4 diversified large-scale open-source C projects that incorporate high complexity and variety of real source code instead of synthesis code used in previous works. The results of the extensive evaluation on the datasets demonstrate that Devign outperforms the state of the arts significantly with an average of 10.51% higher accuracy and 8.68% F1 score, increases averagely 4.66% accuracy and 6.37% F1 by the Conv module. diff --git a/_publications/zhou2021improving.markdown b/_publications/zhou2021improving.markdown new file mode 100644 index 00000000..1930b938 --- /dev/null +++ b/_publications/zhou2021improving.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Improving Code Autocompletion with Transfer Learning" +authors: Wen Zhou, Seohyun Kim, Vijayaraghavan Murali, Gareth Ari Aye +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2105.05991"} +tags: ["autocomplete", "Transformer"] +--- +Software language models have achieved promising results predicting code completion usages, and several industry studies have described successful IDE integrations. Recently, accuracy in autocompletion prediction improved 12.8% from training on a real-world dataset collected from programmers' IDE activity. But what if limited examples of IDE autocompletion in the target programming language are available for model training? In this paper, we investigate the efficacy of pretraining autocompletion models on non-IDE, non-autocompletion, and different-language example code sequences. We find that these unsupervised pretrainings improve model accuracy by over 50% on very small fine-tuning datasets and over 10% on 50k labeled examples. We confirm the real-world impact of these pretrainings in an online setting through A/B testing on thousands of IDE autocompletion users, finding that pretraining is responsible for increases of up to 6.63% autocompletion usage. diff --git a/_publications/zhou2022codebertscore.markdown b/_publications/zhou2022codebertscore.markdown new file mode 100644 index 00000000..86ea2486 --- /dev/null +++ b/_publications/zhou2022codebertscore.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "CodeBERTScore: Evaluating Code Generation with Pretrained Models of Code" +authors: Shuyan Zhou, Uri Alon, Sumit Agarwal, Graham Neubig +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2302.05527"} + - {name: "Code", url: "https://github.com/neulab/code-bert-score"} +tags: ["evaluation", "Transformer"] +--- +Since the rise of neural models of code that can generate long expressions and statements rather than a single next-token, one of the major problems has been reliably evaluating their generated output. 
In this paper, we propose CodeBERTScore: an automatic evaluation metric for code generation, which builds on BERTScore (Zhang et al., 2020). Instead of measuring exact token matching as BLEU, CodeBERTScore computes a soft similarity score between each token in the generated code and in the reference code, using the contextual encodings of large pretrained models. Further, instead of encoding only the generated tokens as in BERTScore, CodeBERTScore also encodes the programmatic context surrounding the generated code. We perform an extensive evaluation of CodeBERTScore across four programming languages. We find that CodeBERTScore achieves a higher correlation with human preference and with functional correctness than all existing metrics. That is, generated code that receives a higher score by CodeBERTScore is more likely to be preferred by humans, as well as to function correctly when executed. Finally, while CodeBERTScore can be used with a multilingual CodeBERT as its base model, we release five language-specific pretrained models to use with our publicly available code at https://github.com/neulab/code-bert-score . Our language-specific models have been downloaded more than 25,000 times from the Huggingface Hub. diff --git a/_publications/zhou2022docoder.markdown b/_publications/zhou2022docoder.markdown new file mode 100644 index 00000000..8e23e65b --- /dev/null +++ b/_publications/zhou2022docoder.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "DocCoder: Generating Code by Retrieving and Reading Docs" +authors: Shuyan Zhou, Uri Alon, Frank F. Xu, Zhengbao Jiang, Graham Neubig +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2207.05987"} + - {name: "Code and Data", url: "https://github.com/shuyanzhou/doccoder"} +tags: ["Transformer", "search", "code generation"] +--- +Natural-language-to-code models learn to generate a code snippet given a natural language (NL) intent. However, the rapid growth of both publicly available and proprietary libraries and functions makes it impossible to cover all APIs using training examples, as new libraries and functions are introduced daily. Thus, existing models inherently cannot generalize to using unseen functions and libraries merely through incorporating them into the training data. In contrast, when human programmers write programs, they frequently refer to textual resources such as code manuals, documentation, and tutorials, to explore and understand available library functionality. Inspired by this observation, we introduce DocCoder: an approach that explicitly leverages code manuals and documentation by (1) retrieving the relevant documentation given the NL intent, and (2) generating the code based on the NL intent and the retrieved documentation. Our approach is general, can be applied to any programming language, and is agnostic to the underlying neural model. We demonstrate that DocCoder consistently improves NL-to-code models: DocCoder achieves 11x higher exact match accuracy than strong baselines on a new Bash dataset tldr; on the popular Python CoNaLa benchmark, DocCoder improves over strong baselines by 1.65 BLEU.
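The CodeBERTScore abstract above hinges on soft, embedding-based token matching rather than exact n-gram overlap. As a reading aid only — not the authors' implementation, and with `soft_match_f1` being a hypothetical name — a minimal BERTScore-style sketch, assuming `torch` is available:

```python
import torch
import torch.nn.functional as F

def soft_match_f1(cand_emb, ref_emb):
    # cand_emb: [m, d], ref_emb: [n, d] -- L2-normalized contextual token embeddings
    sim = cand_emb @ ref_emb.T                 # pairwise cosine similarities, shape [m, n]
    precision = sim.max(dim=1).values.mean()   # best reference match per generated token
    recall = sim.max(dim=0).values.mean()      # best generated match per reference token
    return 2 * precision * recall / (precision + recall)

cand = F.normalize(torch.randn(5, 8), dim=1)   # 5 generated tokens (toy embeddings)
ref = F.normalize(torch.randn(7, 8), dim=1)    # 7 reference tokens (toy embeddings)
print(float(soft_match_f1(cand, ref)))
```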
diff --git a/_publications/zhu2020ocor.markdown b/_publications/zhu2020ocor.markdown new file mode 100644 index 00000000..754c9ebd --- /dev/null +++ b/_publications/zhu2020ocor.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "OCoR: An Overlapping-Aware Code Retriever" +authors: Qihao Zhu, Zeyu Sun, Xiran Liang, Yingfei Xiong, Lu Zhang +conference: ASE +year: 2020 +additional_links: + - { name: "ArXiV", url: "https://arxiv.org/abs/2008.05201" } +tags: ["search"] +--- + +Code retrieval helps developers reuse the code snippet in the open-source projects. Given a natural language description, code retrieval aims to search for the most relevant code among a set of code. Existing state-of-the-art approaches apply neural networks to code retrieval. However, these approaches still fail to capture an important feature: overlaps. The overlaps between different names used by different people indicate that two different names may be potentially related (e.g., "message" and "msg"), and the overlaps between identifiers in code and words in natural language descriptions indicate that the code snippet and the description may potentially be related. To address these problems, we propose a novel neural architecture named OCoR, where we introduce two specifically-designed components to capture overlaps: the first embeds identifiers by character to capture the overlaps between identifiers, and the second introduces a novel overlap matrix to represent the degrees of overlaps between each natural language word and each identifier. +The evaluation was conducted on two established datasets. The experimental results show that OCoR significantly outperforms the existing state-of-the-art approaches and achieves 13.1% to 22.3% improvements. Moreover, we also conducted several in-depth experiments to help understand the performance of different components in OCoR. diff --git a/_publications/zhu2921syntax.markdown b/_publications/zhu2921syntax.markdown new file mode 100644 index 00000000..a2a8f1b9 --- /dev/null +++ b/_publications/zhu2921syntax.markdown @@ -0,0 +1,15 @@ +--- +layout: publication +title: "A Syntax-Guided Edit Decoder for Neural Program Repair" +authors: Qihao Zhu, Zeyu Sun, Yuan-an Xiao, Wenjie Zhang, Kang Yuan, Yingfei Xiong, Lu Zhang +conference: FSE +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2106.08253"} +tags: ["edit"] +--- +Automated Program Repair (APR) helps improve the efficiency of software development and maintenance. Recent APR techniques use deep learning, particularly the encoder-decoder architecture, to generate patches. +Though existing DL-based APR approaches have proposed different encoder architectures, the decoder remains to be the standard one, which generates a sequence of tokens one by one to replace the faulty statement. +This decoder has multiple limitations: 1) allowing to generate syntactically incorrect programs, 2) inefficiently representing small edits, and 3) not being able to generate project-specific identifiers. +In this paper, we propose Recoder, a syntax-guided edit decoder with placeholder generation. Recoder is novel in multiple aspects: 1) Recoder generates edits rather than modified code, allowing efficient representation of small edits; 2) Recoder is syntax-guided, with the novel provider/decider architecture to ensure the syntactic correctness of the patched program and accurate generation; 3) Recoder generates placeholders that could be instantiated as project-specific identifiers later. 
+We conduct experiments to evaluate Recoder on 395 bugs from Defects4J v1.2, 420 additional bugs from Defects4J v2.0, 297 bugs from IntroClassJava and 40 bugs from QuixBugs. Our results show that Recoder repairs 53 bugs on Defects4J v1.2, which achieves 26.2% (11 bugs) improvement over the previous state-of-the-art approach for single-hunk bugs (TBar). Importantly, to our knowledge, Recoder is the first DL-based APR approach that has outperformed the traditional APR approaches on this benchmark. diff --git a/_publications/ziegler2022productivity.markdown b/_publications/ziegler2022productivity.markdown new file mode 100644 index 00000000..5cb1d1bb --- /dev/null +++ b/_publications/ziegler2022productivity.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Productivity Assessment of Neural Code Completion" +authors: Albert Ziegler, Eirini Kalliamvakou, Shawn Simister, Ganesh Sittampalam, Alice Li, Andrew Rice, Devon Rifkin, Edward Aftandilian +conference: MAPS +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2205.06537"} + - {name: "Data", url: "https://github.com/wunderalbert/prod-neural-materials"} +tags: ["evaluation", "human evaluation"] +--- +Neural code synthesis has reached a point where snippet generation is accurate enough to be considered for integration into human software development workflows. Commercial products aim to increase programmers' productivity, without being able to measure it directly. In this case study, we asked users of GitHub Copilot about its impact on their productivity, and sought to find a reflection of their perception in directly measurable user data. We find that the rate with which shown suggestions are accepted, rather than more specific metrics regarding the persistence of completions in the code over time, drives developers' perception of productivity. diff --git a/_publications/zlotchevski2022exploring.markdown b/_publications/zlotchevski2022exploring.markdown new file mode 100644 index 00000000..5bd5d5fc --- /dev/null +++ b/_publications/zlotchevski2022exploring.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Exploring and Evaluating Personalized Models for Code Generation" +authors: Andrei Zlotchevski, Dawn Drain, Alexey Svyatkovskiy, Colin Clement, Neel Sundaresan, Michele Tufano +conference: FSE +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2208.13928"} +tags: ["Transformer"] +--- +Large Transformer models achieved the state-of-the-art status for Natural Language Understanding tasks and are increasingly becoming the baseline model architecture for modeling source code. Transformers are usually pre-trained on large unsupervised corpora, learning token representations and transformations relevant to modeling generally available text, and are then fine-tuned on a particular downstream task of interest. While fine-tuning is a tried-and-true method for adapting a model to a new domain -- for example, question-answering on a given topic -- generalization remains an on-going challenge. In this paper, we explore and evaluate transformer model fine-tuning for personalization. In the context of generating unit tests for Java methods, we evaluate learning to personalize to a specific software project using several personalization techniques. 
We consider three key approaches: (i) custom fine-tuning, which allows all the model parameters to be tuned; (ii) lightweight fine-tuning, which freezes most of the model's parameters, allowing tuning of the token embeddings and softmax layer only or the final layer alone; (iii) prefix tuning, which keeps model parameters frozen, but optimizes a small project-specific prefix vector. Each of these techniques offers a trade-off in total compute cost and predictive performance, which we evaluate by code and task-specific metrics, training time, and total computational operations. We compare these fine-tuning strategies for code generation and discuss the potential generalization and cost benefits of each in various deployment scenarios. diff --git a/_publications/zugner2021language.markdown b/_publications/zugner2021language.markdown new file mode 100644 index 00000000..ec49df6c --- /dev/null +++ b/_publications/zugner2021language.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Language-Agnostic Representation Learning of Source Code from Structure and Context" +authors: Daniel Zügner, Tobias Kirschstein, Michele Catasta, Jure Leskovec, Stephan Günnemann +conference: ICLR +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2103.11318"} +tags: ["Transformer", "representation"] +--- +Source code (Context) and its parsed abstract syntax tree (AST; Structure) are two complementary representations of the same computer program. Traditionally, designers of machine learning models have relied predominantly either on Structure or Context. We propose a new model, which jointly learns on Context and Structure of source code. In contrast to previous approaches, our model uses only language-agnostic features, i.e., source code and features that can be computed directly from the AST. Besides obtaining state-of-the-art on monolingual code summarization on all five programming languages considered in this work, we propose the first multilingual code summarization model. We show that jointly training on non-parallel data from multiple programming languages improves results on all individual languages, where the strongest gains are on low-resource languages. Remarkably, multilingual training only from Context does not lead to the same improvements, highlighting the benefits of combining Structure and Context for representation learning on code. diff --git a/base-taxonomy/generative.html b/base-taxonomy/generative.html deleted file mode 100644 index 69364ad9..00000000 --- a/base-taxonomy/generative.html +++ /dev/null @@ -1,40 +0,0 @@ ---- -layout: default -title: Code-Generating Models ---- -
| Name | Type | Representation | Model | Application | Abstract | -{% for publication in publicationsList %}{% if publication.categories contains "generative" %} - {% assign pubDetails = site.publications | where:"bibkey", publication.bibkey %} - -
|---|---|---|---|---|---|
| {{pubDetails[0].authors}}, {{pubDetails[0].year}}. {{pubDetails[0].title}} | -{{publication.type}} | -{{publication.representation}} | -{{publication.model}} | -{{publication.application}} | -{{pubDetails[0].content}} | -
| Name | Type | Representation | Application | Abstract | - -{% for publication in publicationsList %}{% if publication.categories contains "pattern" %} - {% assign pubDetails = site.publications | where:"bibkey", publication.bibkey %} - -
|---|---|---|---|---|
| {{pubDetails[0].authors}}, {{pubDetails[0].year}}. {{pubDetails[0].title}} | -{{publication.pattern_type}} | -{{publication.representation}} | -{{publication.application}} | -{{pubDetails[0].content}} | -
| Name | Input Code Representation | Target | Intermediate Representation | Application | Abstract | -{% for publication in publicationsList %}{% if publication.categories contains "representational" %} - {% assign pubDetails = site.publications | where:"bibkey", publication.bibkey %} - -
|---|---|---|---|---|---|
| {{pubDetails[0].authors}}, {{pubDetails[0].year}}. {{pubDetails[0].title}} | -{{publication.input_rep}} | -{{publication.modeled_target}} | -{{publication.intermediate_rep}} | -{{publication.application}} | -{{pubDetails[0].content}} | -
+
+```yaml
---
layout: publication
title: The title of the Publication
authors: F. M. LastName, F. M. LastName, ...
-conference: AbbreviatedNameOfConference
+conference: AbbreviatedNameOfConference # Or journal: AbbreviatedNameOfJournal
year: YEAR
-bibkey: lastnameYEARfirstword
additional_links:
- - {name: "ArXiV", url: "http://arxiv.org/abs/XXXX.YYYY"}
- - {name: "website", url: "http://paperwebsite.com/"}
- - {name: "code", url: "https://github.com/path-to/code"}
+ - {name: "ArXiV", url: "http://arxiv.org/abs/XXXX.YYYY"}
+ - {name: "website", url: "http://paperwebsite.com/"}
+ - {name: "code", url: "https://github.com/path-to/code"}
tags: ["tag1", "tag2"]
---
Text of abstract goes here.
-
+```
The `additional_links` are optional and arbitrary; they will appear on the page referring to this work. Feel free to add as many additional links as needed.
diff --git a/contributors.md b/contributors.md
deleted file mode 100644
index 65cefdff..00000000
--- a/contributors.md
+++ /dev/null
@@ -1,18 +0,0 @@
----
-layout: default
-title: Contributors
----
-The core survey and the original taxonomy was created by
-
-* [Miltos Allamanis](https://miltos.allamanis.com) Microsoft Research, Cambridge, UK
-* [Earl T. Barr](http://earlbarr.com) University College London, London, UK
-* [Prem Devanbu](http://web.cs.ucdavis.edu/~devanbu/) University of California, Davis, USA
-* [Charles Sutton](http://homepages.inf.ed.ac.uk/csutton/) University of Edinburgh and The Alan Turing Institute, UK
-
-#### Contributors to the website
-This website accepts external [contributions](/contributing.html).
-Please, feel free to add your name below, once you contribute to this
-website. A comprehensive list can be found [here](https://github.com/ml4code/ml4code.github.io/graphs/contributors).
-
-* [Uri Alon](http://www.cs.technion.ac.il/~urialon/) Technion, Israel
-* [Nghi D. Q. Bui](https://bdqnghi.github.io/) Singapore Management University, Singapore
\ No newline at end of file
diff --git a/etc/compute_embeddings.py b/etc/compute_embeddings.py
index 1e0c8da8..43f0ba7c 100644
--- a/etc/compute_embeddings.py
+++ b/etc/compute_embeddings.py
@@ -1,8 +1,11 @@
import argparse
import json
+from timeit import default_timer as timer
+from datetime import date
import numpy as np
import torch
+import torch.nn.functional as F
import sklearn.manifold
import transformers
@@ -13,13 +16,19 @@ def parse_arguments():
parser.add_argument("json", default=False, help="the path the json containing all papers.")
parser.add_argument("outpath", default=False, help="the target path of the visualizations papers.")
parser.add_argument("--seed", default=0, help="The seed for TSNE.", type=int)
+ parser.add_argument("--model", default='sentence-transformers/all-MiniLM-L6-v2', help="The name of the HF model")
+ parser.add_argument("--save_emb", action='https://github.com/store_true', help="Save embeddings in CSV for Tensorboard Projector")
+
return parser.parse_args()
+def mean_pooling(token_embeddings, attention_mask):
+ """ Mean Pooling, takes attention mask into account for correct averaging"""
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-if __name__ == "__main__":
- args = parse_arguments()
- tokenizer = transformers.AutoTokenizer.from_pretrained("deepset/sentence_bert")
- model = transformers.AutoModel.from_pretrained("deepset/sentence_bert")
+def main(args):
+ tokenizer = transformers.AutoTokenizer.from_pretrained(args.model)
+ model = transformers.AutoModel.from_pretrained(args.model)
model.eval()
with open(args.json) as f:
@@ -27,15 +36,34 @@ def parse_arguments():
print(f"Num papers: {len(data)}")
- all_embeddings = []
+ corpus = []
for paper_info in data:
+ corpus.append(tokenizer.sep_token.join([paper_info['title'], paper_info['abstract']]))
+
+ batch_size = 4
+    all_embeddings = []
+ start = timer()
+ for i in range(0, len(corpus), batch_size):
+ encoded_batch = tokenizer(corpus[i:min(i+batch_size, len(corpus))], padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
- token_ids = torch.tensor([tokenizer.encode(paper_info["abstract"])][:512])
- hidden_states, _ = model(token_ids)[-2:]
- all_embeddings.append(hidden_states.mean(0).mean(0).numpy())
+ hidden_state = model(**encoded_batch).last_hidden_state
+ all_embeddings.append(mean_pooling(hidden_state, encoded_batch['attention_mask']))
+
+ all_embeddings = torch.cat(all_embeddings, dim=0)
+ all_embeddings = F.normalize(all_embeddings, p=2, dim=1)
+ print(f"elapsed {timer()-start:.1f}s")
+
+ if args.save_emb:
+ filename = f"{args.model.replace('/', '_')}-{date.today().strftime('%d.%m.%y')}"
+ np.savetxt(f"{filename}-emb.tsv", all_embeddings, delimiter="\t")
+ import csv
+ with open(f"{filename}-meta.tsv", 'w', newline='') as csvfile:
+ w = csv.writer(csvfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
+ w.writerow(["year", "key", "title"])
+ for paper in data:
+ w.writerow([paper["year"], paper["key"], paper["title"]])
np.random.seed(args.seed)
- all_embeddings = np.array(all_embeddings)
out = sklearn.manifold.TSNE(n_components=2, metric="cosine").fit_transform(all_embeddings)
for i, paper_info in enumerate(data):
@@ -43,3 +71,7 @@ def parse_arguments():
with open(args.outpath, 'w') as f:
json.dump(data, f)
+
+if __name__ == "__main__":
+ args = parse_arguments()
+ main(args)
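For readers skimming the diff, the `mean_pooling` helper added above is the standard masked average of token embeddings. A self-contained sketch with toy tensors (the toy shapes are illustrative assumptions; only `torch` is required):

```python
import torch
import torch.nn.functional as F

def mean_pooling(token_embeddings, attention_mask):
    """Mean pooling that ignores padded positions via the attention mask."""
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

# Toy batch: two "sentences" of three tokens each; the second ends with a pad token.
token_embeddings = torch.randn(2, 3, 4)
attention_mask = torch.tensor([[1, 1, 1], [1, 1, 0]])
pooled = F.normalize(mean_pooling(token_embeddings, attention_mask), p=2, dim=1)
print(pooled.shape)  # torch.Size([2, 4]): one unit-length embedding per sentence
```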
diff --git a/etc/compute_related.py b/etc/compute_related.py
new file mode 100644
index 00000000..36f3bc2c
--- /dev/null
+++ b/etc/compute_related.py
@@ -0,0 +1,74 @@
+import argparse
+import json
+import os
+
+import nltk
+
+nltk.download('stopwords')
+nltk.download('wordnet')
+nltk.download('punkt_tab')
+
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+
+import numpy as np
+import scipy
+
+from gensim.models import TfidfModel
+from gensim.corpora import Dictionary
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(description="Compute related papers for ML4Code")
+
+    parser.add_argument("json", default=False, help="the path of the json containing all papers.")
+    parser.add_argument("outdir", default=False, help="the output directory for the per-paper related-work json files.")
+    parser.add_argument("--num-relwork", default=4, help="Number of related works per paper.", type=int)
+ return parser.parse_args()
+
+
+if __name__ == "__main__":
+ args = parse_arguments()
+ num_relworks = args.num_relwork
+
+ with open(args.json) as f:
+ data = json.load(f)
+
+ print(f"Num papers: {len(data)}")
+
+ lemmatizer = WordNetLemmatizer()
+ stopwords = set(stopwords.words('english'))
+ stopwords.update(["one", "two", "using"])
+
+ tokens_per_paper = []
+ keys = []
+
+ for paper_info in data:
+ keys.append((paper_info["key"], paper_info["title"]))
+ text = paper_info["title"] + " " + paper_info["abstract"].replace("", " ").replace("
", " ") + " ".join(paper_info["tags"]) + lemmatized_tokens = [lemmatizer.lemmatize(w).lower() for w in nltk.word_tokenize(text) if w.lower() not in stopwords and w.isalpha()] + tokens_per_paper.append(lemmatized_tokens) + + dictionary = Dictionary(tokens_per_paper) + dictionary.filter_extremes(no_below=2, no_above=0.5) + + corpus = [dictionary.doc2bow(line) for line in tokens_per_paper] + model = TfidfModel(corpus) + + tf_idf_vectors = [] + for bow in corpus: + vec = np.zeros(len(dictionary), dtype=np.float64) + for i, v in model[bow]: + vec[i] = v + tf_idf_vectors.append(vec) + tf_idf_vectors = np.array(tf_idf_vectors) + + distances = scipy.spatial.distance.cdist(tf_idf_vectors, tf_idf_vectors, metric='cosine') + sorted_idxs = np.argsort(distances, axis=-1)[:, 1:num_relworks+1] + + os.makedirs(args.outdir, exist_ok=True) + for i, (bibkey, title) in enumerate(keys): + with open(os.path.join(args.outdir, bibkey + ".json"), "w") as f: + json.dump([keys[j] for j in sorted_idxs[i]], f) + + \ No newline at end of file diff --git a/etc/compute_topics.py b/etc/compute_topics.py new file mode 100644 index 00000000..0bba7ade --- /dev/null +++ b/etc/compute_topics.py @@ -0,0 +1,83 @@ +import argparse +import json +import nltk + +nltk.download('omw-1.4') +nltk.download('stopwords') +nltk.download('wordnet') +nltk.download('punkt_tab') + +from nltk.corpus import stopwords +from nltk.stem import WordNetLemmatizer +from gensim.corpora import Dictionary +from gensim.models import LdaModel + + + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Topic Model of Papers in ML4Code") + + parser.add_argument("json", default=False, help="the path the json containing all papers.") + parser.add_argument("outpath", default=False, help="the target path of the visualizations papers.") + parser.add_argument("--num-topics", default=20, help="The number of topics.", type=int) + return parser.parse_args() + +if __name__ == "__main__": + args = parse_arguments() + with open(args.json) as f: + data = json.load(f) + + print(f"Num papers: {len(data)}") + + + lemmatizer = WordNetLemmatizer() + stopwords = set(stopwords.words('english')) + stopwords.update(["one", "two", "using"]) + + tokens_per_paper = [] + for paper_info in data: + text = paper_info["title"] + " " + paper_info["abstract"].replace("", " ").replace("
", " ") + " ".join(paper_info["tags"]) + lemmatized_tokens = [lemmatizer.lemmatize(w).lower() for w in nltk.word_tokenize(text) if w.lower() not in stopwords and w.isalpha()] + tokens_per_paper.append(lemmatized_tokens) + + dictionary = Dictionary(tokens_per_paper) + dictionary.filter_extremes(no_below=20, no_above=0.5) + + corpus = [dictionary.doc2bow(doc) for doc in tokens_per_paper] + + passes = 100 + iterations = 1000 + + temp = dictionary[0] # This is needed to "load" the dictionary. + + model = LdaModel( + corpus=corpus, + id2word=dictionary.id2token, + chunksize=1000, + alpha='asymmetric', + eta='auto', + iterations=iterations, + num_topics=args.num_topics, + passes=passes, + eval_every=None + ) + + topic_tokens = [] + for topicid in range(args.num_topics): + topic_tokens.append([dictionary.id2token[k[0]] for i, k in enumerate(model.get_topic_terms(topicid, topn=4)) if i < 2 or k[1] > 0.025]) + + paper_topic_data = [] + for paper, paper_bow in zip(data, corpus): + topic_distr = model.get_document_topics(paper_bow, minimum_probability=0) + paper_topic_data.append({ + "key": paper["key"], + "year": paper["year"], + "title": paper["title"], + "topic_distr": {t: float(p) for t, p in topic_distr} + }) + + with open(args.outpath, 'w') as f: + json.dump({ + "topics": topic_tokens, + "paper_data": paper_topic_data + }, f) diff --git a/index.md b/index.md index 0370f24b..44467cff 100644 --- a/index.md +++ b/index.md @@ -21,21 +21,19 @@ research is inherently interdisciplinary, uniting the machine learning and natural language processing communities with software engineering and programming language communities. -#### Browse Papers by Tag +#### 🏷 Browse Papers by Tag {% assign rawtags = Array.new %} {% for publication in site.publications %} {% assign ttags = publication.tags %} {% assign rawtags = rawtags | concat: ttags %} {% endfor %} -{% assign rawtags = rawtags | uniq | sort %} +{% assign rawtags = rawtags | uniq | sort_natural %} {% for tag in rawtags %}
@@ -58,3 +56,22 @@ But a website can! We hope to make this site a living document.
Anyone can add a paper to this web site, essentially by creating one Markdown file.
To contribute, open a pull request in GitHub, by following [these instructions
for contributing](contributing.html).
+
+### Contributors
+
+The core survey and the original taxonomy were created by
+
+* [Miltos Allamanis](https://miltos.allamanis.com) Microsoft Research, Cambridge, UK
+* [Earl T. Barr](http://earlbarr.com) University College London, London, UK
+* [Prem Devanbu](http://web.cs.ucdavis.edu/~devanbu/) University of California, Davis, USA
+* [Charles Sutton](http://homepages.inf.ed.ac.uk/csutton/) University of Edinburgh and The Alan Turing Institute, UK
+
+#### Contributors to the website
+This website accepts external [contributions](/contributing.html).
+Please feel free to add your name below once you contribute to this
+website. A comprehensive list can be found [here](https://github.com/ml4code/ml4code.github.io/graphs/contributors).
+
+* [Uri Alon](http://www.cs.technion.ac.il/~urialon/) Technion, Israel
+* [Shaked Brody](https://shakedbr.cswp.cs.technion.ac.il/) Technion, Israel
+* [Nghi D. Q. Bui](https://bdqnghi.github.io/) Singapore Management University, Singapore
+* [Rajaswa Patil](https://rajaswa.github.io/) Microsoft PROSE
diff --git a/paper-abstracts.json b/paper-abstracts.json
index 368fb1c1..4321f4ff 100644
--- a/paper-abstracts.json
+++ b/paper-abstracts.json
@@ -3,7 +3,7 @@ layout:
title:
---
[
-{% for publication in site.publications %}{"key": "{{ publication.bibkey }}", "year": "{{ publication.year }}", "title":{{ publication.title | jsonify }}, "abstract": {{ publication.content | jsonify }}, "tags": {{ publication.tags | jsonify }} }{% if forloop.rindex0 > 0 %},{% endif %}
+{% for publication in site.publications %}{"key": "{{ publication.path | replace_first: '_publications/', '' | replace: '.markdown', '' | replace: '.md', '' }}", "year": "{{ publication.year }}", "title":{{ publication.title | jsonify }}, "abstract": {{ publication.content | jsonify }}, "tags": {{ publication.tags | jsonify }} }{% if forloop.rindex0 > 0 %},{% endif %}
{% endfor %}
]
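In Python terms, the Liquid filter chain above derives each paper's key from its file path by stripping the collection folder and either markdown extension. A sketch with a hypothetical `paper_key` helper:

```python
def paper_key(path):
    # Strip the collection folder, then either extension, mirroring the Liquid filters.
    return path.replace("_publications/", "", 1).replace(".markdown", "").replace(".md", "")

assert paper_key("_publications/zhou2019devign.markdown") == "zhou2019devign"
assert paper_key("_publications/zhang2021disentangled.md") == "zhang2021disentangled"
```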
diff --git a/papers.html b/papers.html
index ccba7e4a..3988890c 100644
--- a/papers.html
+++ b/papers.html
@@ -18,7 +18,6 @@
-
{{ publication.authors }}
diff --git a/public/css/hyde.css b/public/css/hyde.css
index a45f1a10..002c3f4c 100644
--- a/public/css/hyde.css
+++ b/public/css/hyde.css
@@ -287,3 +287,36 @@ tag {
tag > a {
color: #fff;
}
+
+.ribbon {
+ /* positioning */
+ position: fixed;
+ padding: 2px 45px;
+ width: 128px;
+ /* bottom left of the page */
+ bottom: 50px;
+ left: -50px;
+ -webkit-transform: rotate(45deg);
+ -moz-transform: rotate(45deg);
+ -ms-transform: rotate(45deg);
+ transform: rotate(45deg);
+ /* effects with some shadow */
+ box-shadow: 0 0 0 3px #020202, 0 0 20px -3px rgba(0, 0, 0, 0.5);
+ text-shadow: 0 0 0 #e5e5e5, 0 0 5px rgba(0, 0, 0, 0.3);
+ /* looks */
+ background-color: #020202;
+ color: #e5e5e5;
+ font-size: 10px;
+ font-family: sans-serif;
+ text-decoration: none;
+ font-weight: bold;
+ /* ribbon effects */
+ /*border: 2px dotted #e5e5e5; */
+ /* webkit antialias fix */
+ -webkit-backface-visibility: hidden;
+ letter-spacing: .5px;
+
+ z-index:100;
+ box-sizing:unset;
+
+}
diff --git a/public/opensearchdescription.xml b/public/opensearchdescription.xml
index 33232678..4d94e44e 100644
--- a/public/opensearchdescription.xml
+++ b/public/opensearchdescription.xml
@@ -1,10 +1,13 @@
-
-
-
- ML4Code
- Search Papers in Machine Learning for Source Code
- ml4code
-
- https://ml4code.github.io/public/favicon.svg
+
+ML4Code
+ML4Code Survey
+
+
+
+https://ml4code.github.io/public/favicon.svg
+open
+false
+en-us
+UTF-8
+UTF-8
diff --git a/resources.md b/resources.md
index 0a3a7f6a..567c5608 100644
--- a/resources.md
+++ b/resources.md
@@ -14,8 +14,8 @@ A list of datasets used in this area can be found at the appendix of the
[survey](https://arxiv.org/abs/1709.06182) and at [learnbigcode.github.io](http://learnbigcode.github.io/datasets/).
### Courses
-A few university courses are been taught covering aspects of machine learning for code, big code or naturalnness of code. Below there are a few that have publicly available material.
-* [Analyzing Software using Deep Learning](http://software-lab.org/teaching/summer2020/asdl/) in T.U. Darmstadt [[videos](https://www.youtube.com/playlist?list=PLBmY8PAxzwIHIKq4tYLws25KqGvUM4iFD)]
+A few university courses have been taught covering aspects of machine learning for code, big code or naturalness of code. Below are a few that have publicly available material.
+* [Analyzing Software using Deep Learning](http://software-lab.org/teaching/summer2020/asdl/) at the University of Stuttgart [[videos](https://www.youtube.com/playlist?list=PLBmY8PAxzwIHIKq4tYLws25KqGvUM4iFD)]
* [Seminars on Applications of Deep Learning in Software Engineering and Programming Languages](https://sites.google.com/view/mlplse-sp18/) in U.C. Berkeley
* [Machine learning for programming](https://www.cl.cam.ac.uk/teaching/1920/P252/) in the University of Cambridge, UK
* [Deep Learning for Symbolic Reasoning](http://tiarkrompf.github.io/cs590/2018/) in Purdue University
@@ -26,6 +26,9 @@ Please, feel free to submit a pull request to adding more links in this page.
### Workshops and Other Academic Events
Over the last few years, a few workshops have been organized in this area. Please feel free to add any missing or future workshops here.
+* [Deep Learning for Code](https://dl4c.github.io) April 29 2022, ICLR 2022, virtual
+* [NLP4Prog Workshop](https://nlp4prog.github.io/2021/) 6 August 2021, ACL 2021, virtual
+* [Workshop on Computer-Assisted Programming](https://capworkshop.github.io/) 12 December 2020, NeurIPS 2020, virtual
* [ML on Code devroom at FOSDEM19](https://fosdem.org/2019/schedule/track/ml_on_code/) 2-3 February 2019, Brussels, EU [[videos](https://video.fosdem.org/2019/H.2213/)]
* [Machine Learning for Programming](http://ml4p.org/) 18–19 July 2018, Oxford, UK [[videos](https://www.youtube.com/watch?v=dQaAp9wdFtQ&list=PLMPy362FkW9pd96bwh0BuCGMo6fdMQ2aw)]
* [International Workshop on Machine Learning techniques for Programming Languages](https://conf.researchr.org/track/ecoop-issta-2018/ML4PL-2018-papers) 16 - 21 July 2018 Amsterdam, Netherlands
@@ -37,9 +40,12 @@ The last few years a few workshops have been organized in this area. Please, fee
### Courses on Important Relevant Background
-* [Sofware Analysis](http://rightingcode.org/) in Univ. of Pennsylvania. It is a great introduction to Program Analysis [[videos](https://www.youtube.com/playlist?list=PLF3-CvSRq2SaApl3Lnu6Tu_ecsBr94543)]
+* [Software Analysis](http://rightingcode.org/) at Univ. of Pennsylvania. It is a great introduction to Program Analysis [[videos](https://www.youtube.com/playlist?list=PLF3-CvSRq2SaApl3Lnu6Tu_ecsBr94543)]
+* [Program Analysis](https://software-lab.org/teaching/winter2020/pa/) at University of Stuttgart [[videos](https://www.youtube.com/playlist?list=PLBmY8PAxzwIEGtnJiucyGAnwWpxACE633)]
+* [Applications of Data Science for Software Engineering 2020](https://www.youtube.com/watch?v=34hcH7Js41I&list=PLmAXH4O57P5_0IflYjLIg8l0IupZPbdlY) at Eindhoven University of Technology.
### Competitions
+* [nlc2cmd](http://nlc2cmd.us-east.mybluemix.net/#/) at NeurIPS 2020 by Project CLAI. Starts July 2020.
* [CodeSearchNet Challenge: Evaluating the State of Semantic Code Search](https://github.com/github/CodeSearchNet) by Github. Starts Sep 2019.
* [CodRep 2019: Machine Learning on Source Code Competition](https://github.com/KTH/codrep-2019) by KTH. Starts on April 25th 2019.
* [CodRep 2018: Machine Learning on Source Code Competition](https://github.com/KTH/CodRep-competition) by KTH. Starts on April 14th 2018.
@@ -49,4 +55,4 @@ The last few years a few workshops have been organized in this area. Please, fee
papers in the area. You can access the list [here](https://github.com/src-d/awesome-machine-learning-on-source-code).
* [Automated Program Repair](https://www.monperrus.net/martin/automatic-software-repair)
has a curated list of pointers for helping newcomers to understand the field,
-maintained by [Martin Monperrus](www.monperrus.net).
\ No newline at end of file
+maintained by [Martin Monperrus](https://www.monperrus.net/martin/).
diff --git a/tags.html b/tags.html
index 991dca18..bf9476cf 100644
--- a/tags.html
+++ b/tags.html
@@ -8,7 +8,7 @@
{% assign ttags = publication.tags %}
{% assign rawtags = rawtags | concat: ttags %}
{% endfor %}
-{% assign rawtags = rawtags | uniq | sort %}
+{% assign rawtags = rawtags | uniq | sort_natural %}
Publications by Tag
@@ -22,7 +22,7 @@
Tags
{% assign sortedPublications = site.publications | sort: "authors" | sort: "year"%}
{% for tag in rawtags %}
- {{ tag }}
+ 🏷 {{ tag }}
{% for publication in sortedPublications %}
{% if publication.tags contains tag %}
diff --git a/topic-viz.html b/topic-viz.html
new file mode 100644
index 00000000..d2ed5880
--- /dev/null
+++ b/topic-viz.html
@@ -0,0 +1,61 @@
+---
+layout: default
+title: Explore ML4Code papers with Topics
+description: A topic model for the papers in the ML4Code survey
+---
+Topic-based Explorer
+Using topic modelling, the following topics have been extracted. The top stemmed words appear below.
+ Please move the sliders to show the papers most related to the selected topics.
+
+
+
+
+ - Please move the sliders to look at the papers.
+
+
+
+
+
+
\ No newline at end of file
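The topics behind this page are produced by `etc/compute_topics.py` (added earlier in this diff). A condensed sketch of its core gensim pipeline on toy token lists (the toy documents and topic count are illustrative assumptions; requires `gensim`):

```python
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Toy stand-ins for the lemmatized title+abstract tokens of each paper.
docs = [["code", "completion", "transformer"],
        ["vulnerability", "graph", "neural"],
        ["code", "search", "retrieval"]]

dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
_ = dictionary[0]  # force construction of dictionary.id2token

model = LdaModel(corpus=corpus, id2word=dictionary.id2token,
                 num_topics=2, passes=10, alpha="asymmetric", eta="auto")
for topic_id in range(2):
    print(model.get_topic_terms(topic_id, topn=3))   # (token_id, weight) pairs per topic
print(model.get_document_topics(corpus[0], minimum_probability=0))
```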
diff --git a/tsne-viz.html b/tsne-viz.html
index 9274f963..a8d66a05 100644
--- a/tsne-viz.html
+++ b/tsne-viz.html
@@ -1,11 +1,12 @@
---
layout: default
-title: Visualization of Publications on Machine Learning for Source Code
-description: A tSNE visualization of all the ML4Code papers
+title: A Map of Publications on Machine Learning for Source Code
+description: A map/visualization of the ML4Code papers.
---
2D Map of Papers
-Each dot represents one paper in this survey. Hover your mouse over each point to look
-at the details. Click on a point to go to the paper information page.
+Each dot represents one paper in this survey. Hover your mouse over each point to look
+at the details. Click on a point to go to the paper information page.
+
Please consider contributing by updating
@@ -65,15 +66,16 @@ 2D Map of Papers
for (i=0; i" + d.tags[i] + " "
}
+
+ var boundingRect = document.getElementById("paperviz").getBoundingClientRect();
+ var mousePos = d3.mouse(this);
+ var x = mousePos[0] + boundingRect.x + 20;
+ var y = mousePos[1] + boundingRect.y + 30;
+
tooltip
.html("" + d.title + " " + tags + "
")
- .style("left", (d3.mouse(this)[0]+30) + "px")
- .style("top", (d3.mouse(this)[1]) + "px");
- d3.selectAll("circle").filter(dd => dd.key == d.key).style("fill", "#ff0000");
- }
-
- var mouseleave = function(d) {
- d3.selectAll("circle").filter(dd => dd.key == d.key).style("fill", "#69b3a2");
+ .style("left", x + "px")
+ .style("top", y + "px");
}
var click_link = function(d) {
@@ -92,10 +94,23 @@ 2D Map of Papers
.style("fill", "#69b3a2")
.style("opacity", 0.4)
.style("stroke", "white")
- .on("mouseover", mouseover )
- .on("mousemove", mousemove )
- .on("mouseleave", mouseleave )
- .on("click", click_link)
+ .on("mouseover", mouseover)
+ .on("mousemove", mousemove)
+ .on("click", click_link);
+
+ var isMatch = function(d, searchTerm) {
+ if (searchTerm.length < 3) return false;
+ var allText = (d.title + " " + d.abstract + " " + d.tags.join(" ")).toLocaleLowerCase();
+ return allText.indexOf(searchTerm) != -1;
+ }
+
+ $("#filtermap").keyup(function (e) {
+ var searchTerm = $("#filtermap").val().toLocaleLowerCase();
+ var allPoints = d3.selectAll("circle");
+ // TODO: This seems quite inefficient...
+ allPoints.filter(dd => !isMatch(dd, searchTerm)).style("fill", "#69b3a2");
+ allPoints.filter(dd => isMatch(dd, searchTerm)).style("fill", "#aa0000");
+ });
});