diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000..4a8183d7 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,7 @@ + +Thank you for submitting a PR! Please replace this text with a high-level description of the PR. Also ensure the following for new publications: + + +- [ ] Files for new publications are in the `_publications` folder. +- [ ] The name of each file is `lastnameYEARfirstword.markdown`, _e.g._ `smith2019neural` for a Smith _et al._ paper titled "A neural approach to the Universe". +- [ ] Consider using tags that already exist. We aim to avoid variations or introducing new ones when possible. This helps with searching across this literature review. diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index f13646cb..b6e8d907 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -19,13 +19,18 @@ jobs: - name: Setup Python uses: actions/setup-python@v2 with: - python-version: '3.7' + python-version: '3.8' architecture: x64 - name: Compute tSNE Embeddings run: | - python -m pip install transformers sklearn numpy - python -m pip install torch==1.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html + python -m pip install transformers scikit-learn numpy + python -m pip install torch==1.10.0+cpu -f https://download.pytorch.org/whl/torch_stable.html python ${{ github.workspace }}/etc/compute_embeddings.py ${{ github.workspace }}/_site/paper-abstracts.json ${{ github.workspace }}/_site/tsne.json + - name: Compute topics + run: | + python -m pip install nltk gensim scipy + python ${{ github.workspace }}/etc/compute_topics.py ${{ github.workspace }}/_site/paper-abstracts.json ${{ github.workspace }}/_site/topics.json + python ${{ github.workspace }}/etc/compute_related.py ${{ github.workspace }}/_site/paper-abstracts.json ${{ github.workspace }}/_site/publications-metadata/ - name: Deploy uses: peaceiris/actions-gh-pages@v3 with: diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..c3200447 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,22 @@ +# YAML 1.2 +--- +authors: + - + family-names: Allamanis + given-names: Miltiadis + orcid: "https://orcid.org/0000-0002-5819-9900" + - + family-names: Barr + given-names: "Earl T" + - + family-names: Devanbu + given-names: Premkumar + - + family-names: Sutton + given-names: Charles +cff-version: "1.1.0" +doi: "10.1145/3212695" +message: "For this live survey and the associated paper, please cite as below." +repository-code: "https://ml4code.github.io/" +title: "A survey of machine learning for big code and naturalness" +... \ No newline at end of file diff --git a/_config.yml b/_config.yml index b99a8666..2b7f88d2 100644 --- a/_config.yml +++ b/_config.yml @@ -13,3 +13,6 @@ collections: plugins_dir: - jekyll-sitemap - jekyll-seo-tag + +sass: + style: compressed diff --git a/_includes/sidebar.html b/_includes/sidebar.html index 1535bd8c..6e5e71b6 100644 --- a/_includes/sidebar.html +++ b/_includes/sidebar.html @@ -1,3 +1,4 @@ +Contribute to ML4Code
{% for additional_link in page.additional_links %}
[{{ additional_link.name }}]
{% endfor %}
-
{% for tag in page.tags %}
{{ content }}
+{{ content }}
+ ++
diffs from which NMT can generate high-quality messages are similar to one or more training
diffs at the token level. (2) About 16% of the commit messages in Jiang et al.’s dataset are noisy because they are automatically generated or describe repetitive trivial changes. (3) The performance of NMT declines considerably after removing such noisy commit messages. In addition, NMT is complicated and time-consuming. Inspired by our first finding, we proposed a simpler and faster approach, named NNGen (Nearest Neighbor Generator), to generate concise commit messages using the nearest neighbor algorithm (sketched below). Our experimental results show that NNGen is over 2,600 times faster than NMT, and outperforms NMT in terms of BLEU (an accuracy measure that is widely used to evaluate machine translation systems) by 21%. Finally, we also discuss some observations for the road ahead for automated commit message generation to inspire other researchers. diff --git a/_publications/liu2019deepfuzz.markdown b/_publications/liu2019deepfuzz.markdown index 7c8a302d..2466aa6d 100644 --- a/_publications/liu2019deepfuzz.markdown +++ b/_publications/liu2019deepfuzz.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "DeepFuzz: Automatic Generation of Syntax Valid C Programs for Fuzz Testing" -authors: X. Liu, X. Li, R. Prajapati, D. Wu +authors: Xiao Liu, Xiaoting Li, Rupesh Prajapati, Dinghao Wu conference: AAAI year: 2019 -bibkey: liu2019deepfuzz -tags: ["fuzzing", "generation"] +tags: ["fuzzing", "code generation"] --- Compilers are among the most fundamental programming tools for building software. However, production compilers diff --git a/_publications/liu2019generating.markdown b/_publications/liu2019generating.markdown index e1d2e9ff..027d3a2b 100644 --- a/_publications/liu2019generating.markdown +++ b/_publications/liu2019generating.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Generating commit messages from diffs using pointer-generator network" -authors: Q. Liu, Z. Liu, H. Zhu, H. Fan, B. Du, Y. Qian +authors: Qin Liu, Zihe Liu, Hongming Zhu, Hongfei Fan, Bowen Du, Yu Qian conference: MSR year: 2019 -bibkey: liu2019generating tags: ["edit"] --- The commit messages in source code repositories are valuable but not easy to generate manually in time for tracking issues, reporting bugs, and understanding code. Recently published works indicate that deep neural machine translation approaches have drawn considerable attention for the automatic generation of commit messages. However, they could not deal with out-of-vocabulary (OOV) words, which are essential context-specific identifiers such as class names and method names in code diffs. In this paper, we propose PtrGNCMsg, a novel approach which is based on an improved sequence-to-sequence model with the pointer-generator network to translate code diffs into commit messages. By searching the smallest identifier set with the highest probability, PtrGNCMsg outperforms recent approaches based on neural machine translation, and first enables the prediction of OOV words. The experimental results based on the corpus of diffs and manual commit messages from the top 2,000 Java projects in GitHub show that PtrGNCMsg outperforms the state-of-the-art approach with improved BLEU by 1.02, ROUGE-1 by 4.00 and ROUGE-L by 3.78, respectively.
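The retrieval idea behind NNGen (liu2018neural, above) is simple enough to sketch. The following is a minimal illustration, not the authors' implementation: assuming scikit-learn is available, it represents diffs as bag-of-words vectors and returns the commit message of the most similar training diff, whereas the paper additionally re-ranks the top candidates by BLEU. The toy diffs and messages are made up.

```python
# Minimal sketch of NNGen-style nearest-neighbor commit message retrieval.
# Illustrative only: the real NNGen re-ranks top-k candidates by BLEU.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

train_diffs = ["- int a = 0 + int a = 1", "+ public void close() throws IOException"]
train_msgs = ["Fix initial value of a", "Add close method"]

vectorizer = CountVectorizer(token_pattern=r"\S+", lowercase=False)
train_vecs = vectorizer.fit_transform(train_diffs)  # bag-of-words over diff tokens

def nngen(query_diff: str) -> str:
    """Return the message of the training diff most similar to query_diff."""
    sims = cosine_similarity(vectorizer.transform([query_diff]), train_vecs)[0]
    return train_msgs[sims.argmax()]

print(nngen("- int a = 0 + int a = 2"))  # -> "Fix initial value of a"
```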
diff --git a/_publications/liu2019learning.markdown b/_publications/liu2019learning.markdown index cca027d5..56c8754b 100644 --- a/_publications/liu2019learning.markdown +++ b/_publications/liu2019learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning to Spot and Refactor Inconsistent Method Names" -authors: K. Liu, D. Kim, T. F. Bissyandé, T. Kim, K. Kim, A. Koyuncu, S. Kim, Y. Le Traon +authors: Kui Liu, Dongsun Kim, Tegawendé F. Bissyandé, Taeyoung Kim, Kisub Kim, Anil Koyuncu, Suntae Kim, Yves Le Traon conference: ICSE year: 2019 -bibkey: liu2019learning tags: ["naming"] --- To ensure code readability and facilitate software maintenance, program methods must be named properly. In particular, method names must be consistent with the corresponding method implementations. Debugging method names remains an important topic in the literature, where various approaches analyze commonalities among method names in a large dataset to detect inconsistent method names and suggest better ones. We note that the state-of-the-art does not analyze the implemented code itself to assess consistency. We thus propose a novel automated approach to debugging method names based on the analysis of consistency between method names and method code. The approach leverages deep feature representation techniques adapted to the nature of each artifact. Experimental results on over 2.1 million Java methods show that we can achieve up to 15 percentage points improvement over the state-of-the-art, establishing a record performance of 67.9% F1-measure in identifying inconsistent method names. We further demonstrate that our approach yields up to 25% accuracy in suggesting full names, while the state-of-the-art lags far behind at 1.1% accuracy. Finally, we report on our success in fixing 66 inconsistent method names in a live study on projects in the wild. \ No newline at end of file diff --git a/_publications/liu2019neural.markdown b/_publications/liu2019neural.markdown index 53de5147..cacfc801 100644 --- a/_publications/liu2019neural.markdown +++ b/_publications/liu2019neural.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Neural query expansion for code search" -authors: J. Liu, S. Kim, V. Murali, S. Chaudhuri, S. Chandra +authors: Jason Liu, Seohyun Kim, Vijayaraghavan Murali, Swarat Chaudhuri, Satish Chandra conference: MAPL year: 2019 -bibkey: liu2019neural tags: ["search"] --- Searching repositories of existing source code for code snippets is a key task in software engineering. Over the years, many approaches to this problem have been proposed. One recent tool, called NCS, takes in a natural language query and outputs relevant code snippets, often being able to correctly answer Stack Overflow questions. But what happens when the developer doesn’t provide a query with a clear intent? What if shorter queries are used to demonstrate a more vague intent? diff --git a/_publications/liu2020automating.markdown b/_publications/liu2020automating.markdown new file mode 100644 index 00000000..e149d68d --- /dev/null +++ b/_publications/liu2020automating.markdown @@ -0,0 +1,9 @@ +--- +layout: publication +title: "Automating Just-In-Time Comment Updating" +authors: Zhongxin Liu, Xin Xia, Meng Yan, Shanping Li +conference: ASE +year: 2020 +tags: ["documentation"] +--- +Code comments are valuable for program comprehension and software maintenance, and also require maintenance with code evolution.
However, when changing code, developers sometimes neglect updating the related comments, bringing in inconsistent or obsolete comments (a.k.a. bad comments). Such comments are detrimental since they may mislead developers and lead to future bugs. Therefore, it is necessary to fix and avoid bad comments. In this work, we argue that bad comments can be reduced and even avoided by automatically performing comment updates with code changes. We refer to this task as “Just-In-Time (JIT) Comment Updating” and propose an approach named CUP (Comment UPdater) to automate this task. CUP can be used to assist developers in updating comments during code changes and can consequently help avoid the introduction of bad comments. Specifically, CUP leverages a novel neural sequence-to-sequence model to learn comment update patterns from extant code-comment co-changes (a toy co-change pair is sketched after the next entry) and can automatically generate a new comment based on its corresponding old comment and code change. Several customized enhancements, such as a special tokenizer and a novel co-attention mechanism, are introduced in CUP to handle the characteristics of this task. We build a dataset with over 108K comment-code co-change samples and evaluate CUP on it. The evaluation results show that CUP outperforms an information-retrieval-based baseline and a rule-based baseline by substantial margins, and can reduce developers' edits required for JIT comment updating. In addition, the comments generated by our approach are identical to those updated by developers in 1612 (16.7%) test samples, 7 times more than the best-performing baseline. diff --git a/_publications/liu2022open.markdown b/_publications/liu2022open.markdown new file mode 100644 index 00000000..1ff11cdb --- /dev/null +++ b/_publications/liu2022open.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: Open-ended Knowledge Tracing +authors: Naiming Liu, Zichao Wang, Richard G. Baraniuk, Andrew Lan +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2203.03716"} + - {name: "code", url: "https://github.com/lucy66666/OKT"} +tags: ["education", "code generation"] +--- +In education applications, knowledge tracing refers to the problem of estimating students' time-varying concept/skill mastery level from their past responses to questions and predicting their future performance. One key limitation of most existing knowledge tracing methods is that they treat student responses to questions as binary-valued, i.e., whether they are correct or incorrect. Response correctness analysis/prediction ignores important information on student knowledge contained in the exact content of the responses, especially for open-ended questions. In this paper, we conduct the first exploration into open-ended knowledge tracing (OKT) by studying the new task of predicting students' exact open-ended responses to questions. Our work is grounded in the domain of computer science education with programming questions. We develop an initial solution to the OKT problem, a student knowledge-guided code generation approach, which combines program synthesis methods using language models with student knowledge tracing methods. We also conduct a series of quantitative and qualitative experiments on a real-world student code dataset to validate OKT and demonstrate its promise in educational applications.
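To make the co-change pair mentioned in the CUP entry concrete, here is a rough illustration of what one training example might look like. The serialization below is an assumption for illustration, not CUP's actual input format: the code edit is linearized as a token-level diff and concatenated with the old comment, and the target is the developer-updated comment.

```python
# Hypothetical CUP-style training pair: (code edit + old comment) -> new comment.
# The <SEP> convention and the token-level diff serialization are assumptions.
import difflib

old_code = "int sum(int a, int b)"
new_code = "long sum(long a, long b)"
old_comment = "Returns the int sum of a and b."
new_comment = "Returns the long sum of a and b."  # what the developer wrote

edit = " ".join(difflib.unified_diff(old_code.split(), new_code.split(), lineterm=""))
source = f"{edit} <SEP> {old_comment}"  # model input
target = new_comment                    # model output
print(source, "=>", target)
```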
diff --git a/_publications/liu2023code.markdown b/_publications/liu2023code.markdown new file mode 100644 index 00000000..2009fd2d --- /dev/null +++ b/_publications/liu2023code.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Code Execution with Pre-trained Language Models" +authors: Chenxiao Liu, Shuai Lu, Weizhu Chen, Daxin Jiang, Alexey Svyatkovskiy, Shengyu Fu, Neel Sundaresan, Nan Duan +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2305.05383"} +tags: ["Transformer", "execution"] +--- +Code execution is a fundamental aspect of programming language semantics that reflects the exact behavior of the code. However, most pre-trained models for code intelligence ignore the execution trace and only rely on source code and syntactic structures. In this paper, we investigate how well pre-trained models can understand and perform code execution. We develop a mutation-based data augmentation technique to create a large-scale and realistic Python dataset and task for code execution, which challenges existing models such as Codex. We then present CodeExecutor, a Transformer model that leverages code execution pre-training and curriculum learning to enhance its semantic comprehension. We evaluate CodeExecutor on code execution and show its promising performance and limitations. We also demonstrate its potential benefits for code intelligence tasks such as zero-shot code-to-code search and text-to-code generation. Our analysis provides insights into the learning and generalization abilities of pre-trained models for code execution. diff --git a/_publications/lomshakov2023fine.markdown b/_publications/lomshakov2023fine.markdown new file mode 100644 index 00000000..b38a2ff2 --- /dev/null +++ b/_publications/lomshakov2023fine.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: Fine-Tuning Large Language Models for Answering Programming Questions with Code Snippets +authors: V. Lomshakov, S. Kovalchuk, M. Omelchenko, S. Nikolenko, A. Aliev +conference: ICCS +year: 2023 +additional_links: + - {name: "LNCS", url: "https://link.springer.com/chapter/10.1007/978-3-031-36021-3_15"} + - {name: "Papers with Code ", url: "https://paperswithcode.com/paper/fine-tuning-large-language-models-for"} +tags: ["program synthesis", "question answering", "large language models"] +--- +We study the ability of pretrained large language models (LLM) to answer questions from online question answering fora such as Stack Overflow. We consider question-answer pairs where the main part of the answer consists of source code. On two benchmark datasets — CoNaLa and a newly collected dataset based on Stack Overflow — we investigate how a closed-book question answering system can be improved by fine-tuning the LLM for the downstream task, prompt engineering, and data preprocessing. We use publicly available autoregressive language models such as GPT-Neo, CodeGen, and PanGu-Coder, and after the proposed fine-tuning achieve a BLEU score of 0.4432 on the CoNaLa test set, significantly exceeding previous state of the art for this task. \ No newline at end of file diff --git a/_publications/louis2018deep.markdown b/_publications/louis2018deep.markdown index b5278ddb..3c92a2c3 100644 --- a/_publications/louis2018deep.markdown +++ b/_publications/louis2018deep.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Deep Learning to Detect Redundant Method Comments" -authors: A. Louis, S. K. Dash, E. T. Barr, C. 
Sutton +authors: Annie Louis, Santanu Kumar Dash, Earl T. Barr, Charles Sutton conference: year: 2018 -bibkey: louis2018deep additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1806.04616"} tags: ["bimodal", "documentation"] diff --git a/_publications/louis2020where.markdown b/_publications/louis2020where.markdown index 02e539b8..cad2c83a 100644 --- a/_publications/louis2020where.markdown +++ b/_publications/louis2020where.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Where should I comment my code? A dataset and model for predicting locations that need comments" -authors: A. Louis, S.K. Dash, E.T. Barr, M.D. Ernst, and C. Sutton +authors: Annie Louis, Santanu Kumar Dash, Earl T. Barr, Michael D. Ernst, Charles Sutton conference: International Conference on Software Engineering (ICSE; NIER track) year: 2020 -bibkey: louis2020where additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1806.04616"} - {name: "Data", url: "http://groups.inf.ed.ac.uk/cup/comment-locator"} diff --git a/_publications/loyola2017neural.markdown b/_publications/loyola2017neural.markdown index 0166a1dd..4db50bf2 100644 --- a/_publications/loyola2017neural.markdown +++ b/_publications/loyola2017neural.markdown @@ -1,10 +1,10 @@ --- layout: publication title: "A Neural Architecture for Generating Natural Language Descriptions from Source Code Changes" -authors: P. Loyola, E. Marrese-Taylor, Y. Matsuo -conference: ArXiV 1704.04856 +authors: Pablo Loyola, Edison Marrese-Taylor, Yutaka Matsuo year: 2017 -bibkey: loyola2017neural +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1704.04856"} tags: ["edit", "summarization"] --- We propose a model to automatically describe changes introduced in the source code of a program using natural language. Our method receives as input a set of code commits, which contains both the modifications and message introduced by a user. These two modalities are used to train an encoder-decoder architecture. We evaluated our approach on twelve real world open source projects from four different programming languages. Quantitative and qualitative results showed that the proposed approach can generate feasible and semantically sound descriptions not only in standard in-project settings, but also in a cross-project setting. diff --git a/_publications/loyola2018content.markdown b/_publications/loyola2018content.markdown index c1748420..f2dc4412 100644 --- a/_publications/loyola2018content.markdown +++ b/_publications/loyola2018content.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Content Aware Source Code Change Description Generation" -authors: P. Loyola, E. Marrese-Taylor, J.A. Balazs, Y. Matsuo, F. Satoh +authors: Pablo Loyola, Edison Marrese-Taylor, Jorge Balazs, Yutaka Matsuo, Fumiko Satoh conference: International Natural Language Generation Conference year: 2018 -bibkey: loyola2018content tags: ["edit", "summarization"] --- We propose to study the generation of descriptions from source code changes by integrating the messages included on code diff --git a/_publications/lu2019program.markdown b/_publications/lu2019program.markdown index 11e30e6d..09402580 100644 --- a/_publications/lu2019program.markdown +++ b/_publications/lu2019program.markdown @@ -1,10 +1,11 @@ --- layout: publication title: "Program Classification Using Gated Graph Attention Neural Network for Online Programming Service" -authors: M. Lu, D. Tan, N. Xiong, Z. Chen, H.
Li +authors: Mingming Lu, Dingwu Tan, Naixue Xiong, Zailiang Chen, Haifeng Li conference: year: 2019 -bibkey: lu2019program +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1903.03804"} tags: ["GNN", "representation"] --- The online programming services, such as GitHub, TopCoder, and EduCoder, have promoted a lot of social interaction among the service users. However, the existing social interactions are rather limited and inefficient due to the rapid increase of source-code repositories, which are difficult to explore manually. The emergence of source-code mining provides a promising way to analyze those source codes, so that those source codes can be relatively easy to understand and share among those service users. Among all the source-code mining attempts, program classification lays a foundation for various tasks related to source-code understanding, because it is impossible for a machine to understand a computer program if it cannot classify the program correctly. Although numerous machine learning models, such as the Natural Language Processing (NLP) based models and the Abstract Syntax Tree (AST) based models, have been proposed to classify computer programs based on their corresponding source codes, the existing works cannot fully characterize the source codes from the perspective of both the syntax and semantic information. To address this problem, we proposed a Graph Neural Network (GNN) based model, which integrates data flow and function call information to the AST, and applies an improved GNN model to the integrated graph, so as to achieve state-of-the-art program classification accuracy. The experimental results show that the proposed work can classify programs with accuracy over 97%. \ No newline at end of file diff --git a/_publications/lu2021codexglue.markdown b/_publications/lu2021codexglue.markdown new file mode 100644 index 00000000..bb852c47 --- /dev/null +++ b/_publications/lu2021codexglue.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation" +authors: Shuai Lu, Daya Guo, Shuo Ren, Junjie Huang, Alexey Svyatkovskiy, Ambrosio Blanco, Colin Clement, Dawn Drain, Daxin Jiang, Duyu Tang, Ge Li, Lidong Zhou, Linjun Shou, Long Zhou, Michele Tufano, Ming Gong, Ming Zhou, Nan Duan, Neel Sundaresan, Shao Kun Deng, Shengyu Fu, Shujie Liu +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2102.04664"} +tags: ["benchmark", "Transformer"] +--- +Benchmark datasets have a significant impact on accelerating research in programming language tasks. In this paper, we introduce CodeXGLUE, a benchmark dataset to foster machine learning research for program understanding and generation. CodeXGLUE includes a collection of 10 tasks across 14 datasets and a platform for model evaluation and comparison. CodeXGLUE also features three baseline systems, including the BERT-style, GPT-style, and Encoder-Decoder models, to make it easy for researchers to use the platform. The availability of such data and baselines can help the development and validation of new methods that can be applied to various program understanding and generation problems.
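For readers who want to experiment with CodeXGLUE (above), its tasks are mirrored as Hugging Face datasets. A minimal sketch follows, assuming the `datasets` package and the `code_x_glue_ct_code_to_text` mirror; the dataset id, config name, and field names are assumptions that should be verified against the hub.

```python
# Hypothetical: load the CodeXGLUE code-to-text task from its HF hub mirror.
# Requires `pip install datasets`; ids and fields should be checked on the hub.
from datasets import load_dataset

ds = load_dataset("code_x_glue_ct_code_to_text", "python", split="train")
example = ds[0]
print(example["code"][:80])       # a Python function
print(example["docstring"][:80])  # its natural-language summary
```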
diff --git a/_publications/lu2022reacc.markdown b/_publications/lu2022reacc.markdown new file mode 100644 index 00000000..06cc08e5 --- /dev/null +++ b/_publications/lu2022reacc.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "ReACC: A Retrieval-Augmented Code Completion Framework" +authors: Shuai Lu, Nan Duan, Hojae Han, Daya Guo, Seung-won Hwang, Alexey Svyatkovskiy +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2203.07722"} +tags: ["Transformer", "autocomplete"] +--- +Code completion, which aims to predict the following code token(s) according to the code context, can improve the productivity of software development. Recent work has proved that statistical language modeling with transformers can greatly improve the performance in the code completion task via learning from large-scale source code datasets. However, current approaches focus only on code context within the file or project, i.e. internal context. Our distinction is utilizing "external" context, inspired by human behaviors of copying from the related code snippets when writing code. Specifically, we propose a retrieval-augmented code completion framework, leveraging both lexical copying and referring to code with similar semantics by retrieval. We adopt a stage-wise training approach that combines a source code retriever and an auto-regressive language model for programming language. We evaluate our approach in the code completion task in Python and Java programming languages, achieving a state-of-the-art performance on the CodeXGLUE benchmark. diff --git a/_publications/luan2019aroma.markdown b/_publications/luan2019aroma.markdown new file mode 100644 index 00000000..ec4eeb4b --- /dev/null +++ b/_publications/luan2019aroma.markdown @@ -0,0 +1,9 @@ +--- +layout: publication +title: "Aroma: code recommendation via structural code search" +authors: Sifei Luan, Di Yang, Celeste Barnaby, Koushik Sen, Satish Chandra +conference: PACMPL +year: 2019 +tags: ["search"] +--- +Programmers often write code that has similarity to existing code written somewhere. A tool that could help programmers to search such similar code would be immensely useful. Such a tool could help programmers to extend partially written code snippets to completely implement necessary functionality, help to discover extensions to the partial code which are commonly included by other programmers, help to cross-check against similar code written by other programmers, or help to add extra code which would fix common mistakes and errors. We propose Aroma, a tool and technique for code recommendation via structural code search. Aroma indexes a huge code corpus including thousands of open-source projects, takes a partial code snippet as input, searches the corpus for method bodies containing the partial code snippet, and clusters and intersects the results of the search to recommend a small set of succinct code snippets which both contain the query snippet and appear as part of several methods in the corpus. We evaluated Aroma on 2000 randomly selected queries created from the corpus, as well as 64 queries derived from code snippets obtained from Stack Overflow, a popular website for discussing code. We implemented Aroma for 4 different languages, and developed an IDE plugin for Aroma. Furthermore, we conducted a study where we asked 12 programmers to complete programming tasks using Aroma, and collected their feedback.
Our results indicate that Aroma is capable of retrieving and recommending relevant code snippets efficiently. diff --git a/_publications/maddison2014structured.markdown b/_publications/maddison2014structured.markdown index bac89749..f875d449 100644 --- a/_publications/maddison2014structured.markdown +++ b/_publications/maddison2014structured.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "Structured Generative Models of Natural Source Code" -authors: C.J. Maddison, D. Tarlow +authors: Chris J. Maddison, Daniel Tarlow conference: ICML year: 2014 -bibkey: maddison2014structured -tags: ["language model", "generation", "grammar", "AST"] +tags: ["language model", "code generation", "grammar"] --- We study the problem of building generative models of natural source code (NSC); that is, diff --git a/_publications/mahmud2021code.markdown b/_publications/mahmud2021code.markdown new file mode 100644 index 00000000..f364f7b1 --- /dev/null +++ b/_publications/mahmud2021code.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Code to Comment Translation: A Comparative Study on Model Effectiveness & Errors" +authors: Junayed Mahmud, Fahim Faisal, Raihan Islam Arnob, Antonios Anastasopoulos, Kevin Moran +conference: NLP4Prog +year: 2021 +additional_links: + - {name: "PDF", url: "https://aclanthology.org/2021.nlp4prog-1.1.pdf"} +tags: ["survey", "summarization", "Transformer"] +--- +Automated source code summarization is a popular software engineering research topic wherein machine translation models are employed to “translate” code snippets into relevant natural language descriptions. Most evaluations of such models are conducted using automatic reference-based metrics. However, given the relatively large semantic gap between programming languages and natural language, we argue that this line of research would benefit from a qualitative investigation into the various error modes of current state-of-the-art models. Therefore, in this work, we perform both a quantitative and qualitative comparison of three recently proposed source code summarization models. In our quantitative evaluation, we compare the models based on the smoothed BLEU-4, METEOR, and ROUGE-L machine translation metrics, and in our qualitative evaluation, we perform a manual open-coding of the most common errors committed by the models when compared to ground truth captions. Our investigation reveals new insights into the relationship between metric-based performance and model prediction errors grounded in an error taxonomy that can be used to drive future research efforts. diff --git a/_publications/malik2019nl2type.markdown b/_publications/malik2019nl2type.markdown index 234f011f..27dd35ef 100644 --- a/_publications/malik2019nl2type.markdown +++ b/_publications/malik2019nl2type.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "NL2Type: Inferring JavaScript Function Types from Natural Language Information" -authors: R.S. Malik, J. Patra, M.
Pradel +authors: Rabee Sohail Malik, Jibesh Patra, Michael Pradel conference: ICSE year: 2019 -bibkey: malik2019nl2type tags: ["bimodal", "types"] --- JavaScript is dynamically typed and hence lacks the type safety of statically typed languages, diff --git a/_publications/mammadli2020static.markdown b/_publications/mammadli2020static.markdown new file mode 100644 index 00000000..aba4d2f9 --- /dev/null +++ b/_publications/mammadli2020static.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Static Neural Compiler Optimization via Deep Reinforcement Learning" +authors: Rahim Mammadli, Ali Jannesari, Felix Wolf +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2008.08951"} +tags: ["compilation"] +--- +The phase-ordering problem of modern compilers has received a lot of attention from the research community over the years, yet remains largely unsolved. Various optimization sequences exposed to the user are manually designed by compiler developers. In designing such a sequence developers have to choose the set of optimization passes, their parameters and ordering within a sequence. Resulting sequences usually fall short of achieving optimal runtime for a given source code and may sometimes even degrade the performance when compared to the unoptimized version. In this paper, we employ a deep reinforcement learning approach to the phase-ordering problem. Provided with sub-sequences constituting LLVM's O3 sequence, our agent learns to outperform the O3 sequence on the set of source codes used for training and achieves competitive performance on the validation set, gaining up to 1.32x speedup on previously-unseen programs. Notably, our approach differs from autotuning methods by not depending on one or more test runs of the program for making successful optimization decisions. It has no dependence on any dynamic feature, but only on the statically-attainable intermediate representation of the source code. We believe that the models trained using our approach can be integrated into modern compilers as neural optimization agents, at first to complement, and eventually replace the hand-crafted optimization sequences. diff --git a/_publications/mangal2015user.markdown b/_publications/mangal2015user.markdown index 5895108d..5f84f066 100644 --- a/_publications/mangal2015user.markdown +++ b/_publications/mangal2015user.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "A User-Guided Approach to Program Analysis" -authors: R. Mangal, X. Zhang, A. V. Nori, M. Naik +authors: Ravi Mangal, Xin Zhang, Aditya V. Nori, Mayur Naik conference: FSE year: 2015 -bibkey: mangal2015user tags: ["program analysis"] --- Program analysis tools often produce undesirable output diff --git a/_publications/markovtsev2017topic.markdown b/_publications/markovtsev2017topic.markdown index 5d7902fe..50734201 100644 --- a/_publications/markovtsev2017topic.markdown +++ b/_publications/markovtsev2017topic.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Topic modeling of public repositories at scale using names in source code" -authors: V. Markovtsev, E.
Kant -conference: ArXiV 1704.00135 +authors: Vadim Markovtsev, Eiso Kant +conference: year: 2017 -bibkey: markovtsev2017topic additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1704.00135"} - {name: "website", url: "https://blog.sourced.tech/post/github_topic_modeling"} diff --git a/_publications/markovtsev2018public.markdown b/_publications/markovtsev2018public.markdown index 3ddbf914..12e55d1c 100644 --- a/_publications/markovtsev2018public.markdown +++ b/_publications/markovtsev2018public.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Public Git Archive: a Big Code dataset for all" -authors: V. Markovtsev, W. Long +authors: Vadim Markovtsev, Waren Long conference: MSR year: 2018 -bibkey: markovtsev2018public additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1803.10144"} - {name: "GitHub", url: "https://github.com/src-d/datasets/tree/master/PublicGitArchive"} diff --git a/_publications/markovtsev2019style.markdown b/_publications/markovtsev2019style.markdown index 6a9af904..8b890c96 100644 --- a/_publications/markovtsev2019style.markdown +++ b/_publications/markovtsev2019style.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "STYLE-ANALYZER: fixing code style inconsistencies with interpretable unsupervised algorithms" -authors: V. Markovtsev, W. Long, H. Mougard, K. Slavnov, E. Bulychev +authors: Vadim Markovtsev, Waren Long, Hugo Mougard, Konstantin Slavnov, Egor Bulychev conference: MSR year: 2019 -bibkey: markovtsev2019style additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1904.00935"} tags: ["style"] diff --git a/_publications/mastropaolo2022using.markdown b/_publications/mastropaolo2022using.markdown new file mode 100644 index 00000000..630b56ac --- /dev/null +++ b/_publications/mastropaolo2022using.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Using Deep Learning to Generate Complete Log Statements" +authors: Antonio Mastropaolo, Luca Pascarella, Gabriele Bavota +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2201.04837"} +tags: ["Transformer", "logging"] +--- +Logging is a practice widely adopted in several phases of the software lifecycle. For example, during software development log statements allow engineers to verify and debug the system by exposing fine-grained information of the running software. While the benefits of logging are undisputed, taking proper decisions about where to inject log statements, what information to log, and at which log level (e.g., error, warning) is crucial for the logging effectiveness. In this paper, we present LANCE (Log stAtemeNt reCommEnder), the first approach supporting developers in all these decisions. LANCE features a Text-To-Text-Transfer-Transformer (T5) model that has been trained on 6,894,456 Java methods. LANCE takes as input a Java method and injects in it a full log statement, including a human-comprehensible logging message and properly choosing the needed log level and the statement location. Our results show that LANCE is able to (i) properly identify the location in the code where to inject the statement in 65.9% of Java methods requiring it; (ii) select the proper log level in 66.2% of cases; and (iii) generate a completely correct log statement including a meaningful logging message in 15.2% of cases.
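LANCE (above) casts log-statement injection as text-to-text generation with T5. The snippet below sketches only that framing: the generic `t5-small` checkpoint is a stand-in, and the `inject log:` prompt is an assumption, so the output is not meaningful until a model is actually fine-tuned for the task.

```python
# Sketch of LANCE's text-to-text framing: method without log -> method with log.
# `t5-small` is an untrained stand-in, not the LANCE model.
# Requires `pip install transformers sentencepiece torch`.
from transformers import T5ForConditionalGeneration, T5Tokenizer

tok = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

method = "public void save(File f) { writer.write(f); }"
input_ids = tok("inject log: " + method, return_tensors="pt").input_ids
output = model.generate(input_ids, max_length=64)
print(tok.decode(output[0], skip_special_tokens=True))
```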
diff --git a/_publications/mehrotra2020modeling.markdown b/_publications/mehrotra2020modeling.markdown new file mode 100644 index 00000000..5e5def39 --- /dev/null +++ b/_publications/mehrotra2020modeling.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "Modeling Functional Similarity in Source Code with Graph-Based Siamese Networks" +authors: Nikita Mehrotra, Navdha Agarwal, Piyush Gupta, Saket Anand, David Lo, Rahul Purandare +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2011.11228"} +tags: ["clone", "GNN"] +--- +Code clones are duplicate code fragments that share (nearly) similar syntax or semantics. Code clone detection plays an important role in software maintenance, code refactoring, and reuse. A substantial amount of research has been conducted in the past to detect clones. A majority of these approaches use lexical and syntactic information to detect clones. However, only a few of them target semantic clones. Recently, motivated by the success of deep learning models in other fields, including natural language processing and computer vision, researchers have attempted to adopt deep learning techniques to detect code clones. These approaches use lexical information (tokens) and(or) syntactic structures like abstract syntax trees (ASTs) to detect code clones. However, they do not make sufficient use of the available structural and semantic information hence, limiting their capabilities. + +This paper addresses the problem of semantic code clone detection using program dependency graphs and geometric neural networks, leveraging the structured syntactic and semantic information. We have developed a prototype tool HOLMES, based on our novel approach, and empirically evaluated it on popular code clone benchmarks. Our results show that HOLMES performs considerably better than the other state-of-the-art tool, TBCCD. We also evaluated HOLMES on unseen projects and performed cross dataset experiments to assess the generalizability of HOLMES. Our results affirm that HOLMES outperforms TBCCD since most of the pairs that HOLMES detected were either undetected or suboptimally reported by TBCCD. diff --git a/_publications/menon2013machine.markdown b/_publications/menon2013machine.markdown index 90d1a6ab..806a4cdf 100644 --- a/_publications/menon2013machine.markdown +++ b/_publications/menon2013machine.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "A Machine Learning Framework for Programming by Example" -authors: A. K. Menon, O. Tamuz, S. Gulwani, B. Lampson, A.T. Kalai +authors: Aditya Menon, Omer Tamuz, Sumit Gulwani, Butler Lampson, Adam Kalai conference: ICML year: 2013 -bibkey: menon2013machine -tags: ["generation"] +tags: ["code generation"] --- Learning programs is a timely and interesting challenge. In Programming by Example (PBE), a system attempts to infer a program diff --git a/_publications/mesbah2019deepdelta.markdown b/_publications/mesbah2019deepdelta.markdown index b8f78c51..fd572924 100644 --- a/_publications/mesbah2019deepdelta.markdown +++ b/_publications/mesbah2019deepdelta.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "DeepDelta: Learning to Repair Compilation Errors" -authors: A. Mesbah, A. Rice, E. Johnstin, N. Glorioso +authors: Ali Mesbah, Andrew Rice, Emily Johnston, Nick Glorioso, Edward Aftandilian. 
conference: year: 2019 -bibkey: mesbah2019deepdelta tags: ["repair", "edit", "compilation"] --- Programmers spend a substantial amount of time manually repairing diff --git a/_publications/mir2021manytypes4py.markdown b/_publications/mir2021manytypes4py.markdown new file mode 100644 index 00000000..539f9985 --- /dev/null +++ b/_publications/mir2021manytypes4py.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "ManyTypes4Py: A Benchmark Python Dataset for Machine Learning-based Type Inference" +authors: Amir M. Mir, Evaldas Latoskinas, Georgios Gousios +conference: MSR +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2104.04706"} + - {name: "Dataset", url: "https://zenodo.org/record/4479714"} +tags: ["dataset", "types"] +--- +In this paper, we present ManyTypes4Py, a large Python dataset for machine learning (ML)-based type inference. The dataset contains a total of 5,382 Python projects with more than 869K type annotations. Duplicate source code files were removed to eliminate the negative effect of the duplication bias. To facilitate training and evaluation of ML models, the dataset was split into training, validation and test sets by files. To extract type information from abstract syntax trees (ASTs), a lightweight static analyzer pipeline is developed and accompanied with the dataset. Using this pipeline, the collected Python projects were analyzed and the results of the AST analysis were stored in JSON-formatted files. The ManyTypes4Py dataset is shared on zenodo and its tools are publicly available on GitHub. diff --git a/_publications/mir2021type4py.markdown b/_publications/mir2021type4py.markdown new file mode 100644 index 00000000..fb8922a4 --- /dev/null +++ b/_publications/mir2021type4py.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Type4Py: Deep Similarity Learning-Based Type Inference for Python" +authors: Amir M. Mir, Evaldas Latoskinas, Sebastian Proksch, Georgios Gousios +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2101.04470"} + - {name: "GitHub", url: "https://github.com/saltudelft/type4py"} +tags: ["types"] +--- +Dynamic languages, such as Python and JavaScript, trade static typing for developer flexibility. While this allegedly enables greater productivity, lack of static typing can cause runtime exceptions, type inconsistencies, and is a major factor for weak IDE support. To alleviate these issues, PEP 484 introduced optional type annotations for Python. As retrofitting types to existing codebases is error-prone and laborious, learning-based approaches have been proposed to enable automatic type annotations based on existing, partially annotated codebases. However, the prediction of rare and user-defined types is still challenging. In this paper, we present Type4Py, a deep similarity learning-based type inference model for Python. We design a hierarchical neural network model that learns to discriminate between types of the same kind and dissimilar types in a high-dimensional space, which results in clusters of types. Nearest neighbor search suggests likely type signatures of given Python functions. The types visible to analyzed modules are surfaced using lightweight dependency analysis. The results of quantitative and qualitative evaluation indicate that Type4Py significantly outperforms state-of-the-art approaches at the type prediction task.
Considering the Top-1 prediction, Type4Py obtains 19.33% and 13.49% higher precision than Typilus and TypeWriter, respectively, while utilizing a much bigger vocabulary. diff --git a/_publications/mohajer2023skipanalyzer.markdown b/_publications/mohajer2023skipanalyzer.markdown new file mode 100644 index 00000000..cbf424e7 --- /dev/null +++ b/_publications/mohajer2023skipanalyzer.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "SkipAnalyzer: A Tool for Static Code Analysis with Large Language Models" +authors: Mohammad Mahdi Mohajer, Reem Aleithan, Nima Shiri Harzevili, Moshi Wei, Alvine Boaye Belle, Hung Viet Pham, Song Wang +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2310.18532"} +tags: ["repair"] +--- +We introduce SkipAnalyzer, a large language model (LLM)-powered tool for static code analysis. SkipAnalyzer has three components: 1) an LLM-based static bug detector that scans source code and reports specific types of bugs, 2) an LLM-based false-positive filter that can identify false-positive bugs in the results of static bug detectors (e.g., the result of step 1) to improve detection accuracy, and 3) an LLM-based patch generator that can generate patches for the detected bugs above. As a proof-of-concept, SkipAnalyzer is built on ChatGPT, which has exhibited outstanding performance in various software engineering tasks. To evaluate SkipAnalyzer, we focus on two types of typical and critical bugs that are targeted by static bug detection, i.e., Null Dereference and Resource Leak as subjects. We employ Infer to aid the gathering of these two bug types from 10 open-source projects. Consequently, our experiment dataset contains 222 instances of Null Dereference bugs and 46 instances of Resource Leak bugs. Our study demonstrates that SkipAnalyzer achieves remarkable performance in the mentioned static analysis tasks, including bug detection, false-positive warning removal, and bug repair. In static bug detection, SkipAnalyzer achieves accuracy values of up to 68.37% for detecting Null Dereference bugs and 76.95% for detecting Resource Leak bugs, improving the precision of the current leading bug detector, Infer, by 12.86% and 43.13%, respectively. For removing false-positive warnings, SkipAnalyzer can reach a precision of up to 93.88% for Null Dereference bugs and 63.33% for Resource Leak bugs. Additionally, SkipAnalyzer surpasses state-of-the-art false-positive warning removal tools. Furthermore, in bug repair, SkipAnalyzer can generate syntactically correct patches to fix its detected bugs with a success rate of up to 97.30%. diff --git a/_publications/monperrus2021megadiff.markdown b/_publications/monperrus2021megadiff.markdown new file mode 100644 index 00000000..9a3bb4d9 --- /dev/null +++ b/_publications/monperrus2021megadiff.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Megadiff: A Dataset of 600k Java Source Code Changes Categorized by Diff Size" +authors: Martin Monperrus, Matias Martinez, He Ye, Fernanda Madeiral, Thomas Durieux, Zhongxing Yu +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2108.04631"} + - {name: "Dataset", url: "https://zenodo.org/record/5013515"} +tags: ["dataset", "edit"] +--- +This paper presents Megadiff, a dataset of source code diffs. It focuses on Java, with strict inclusion criteria based on commit message and diff size. 
Megadiff contains 663 029 Java diffs that can be used for research on commit comprehension, fault localization, automated program repair, and machine learning on code changes. diff --git a/_publications/mou2014building.markdown b/_publications/mou2014building.markdown index 81ec0c42..b29e9093 100644 --- a/_publications/mou2014building.markdown +++ b/_publications/mou2014building.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "Building Program Vector Representations for Deep Learning" -authors: L. Mou, G. Li, Y. Liu, H. Peng, Z. Jin, Y. Xu, L. Zhang +authors: Hao Peng, Lili Mou, Ge Li, Yuxuan Liu, Lu Zhang, Zhi Jin. conference: International Conference on Knowledge Science, Engineering and Management year: 2014 -bibkey: mou2014building -tags: ["representation", "AST"] +tags: ["representation", "grammar"] --- Deep learning has made significant breakthroughs in various fields of artificial intelligence. Advantages of deep diff --git a/_publications/mou2016convolutional.markdown b/_publications/mou2016convolutional.markdown index c36878d8..5df86b1a 100644 --- a/_publications/mou2016convolutional.markdown +++ b/_publications/mou2016convolutional.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "Convolutional Neural Networks over Tree Structures for Programming Language Processing" -authors: L. Mou, G. Li, L. Zhang, T. Wang, Z. Jin +authors: Lili Mou, Ge Li, Lu Zhang, Tao Wang, Zhi Jin conference: AAAI year: 2016 -bibkey: mou2016convolutional -tags: ["representation", "AST"] +tags: ["representation", "grammar"] --- Programming language processing (similar to natural language processing) is a hot research topic in the field of software engineering; it has also aroused growing interest in the artificial intelligence community. However, different from a diff --git a/_publications/movshovitz2013natural.markdown b/_publications/movshovitz2013natural.markdown index 0b4eab5b..1e734eec 100644 --- a/_publications/movshovitz2013natural.markdown +++ b/_publications/movshovitz2013natural.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Natural Language Models for Predicting Programming Comments" -authors: D. Movshovitz-Attias, W.W. Cohen +authors: Dana Movshovitz-Attias, William W. Cohen conference: ACL year: 2013 -bibkey: movshovitz2013natural tags: ["bimodal", "documentation", "summarization"] --- Statistical language models have successfully been used to describe and analyze diff --git a/_publications/movshovitz2015kb.markdown b/_publications/movshovitz2015kb.markdown index caa5a2b1..de0926b2 100644 --- a/_publications/movshovitz2015kb.markdown +++ b/_publications/movshovitz2015kb.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "KB-LDA: Jointly Learning a Knowledge Base of Hierarchy, Relations, and Facts" -authors: D. Movshovitz-Attias, W. W. Cohen +authors: Dana Movshovitz-Attias, William W. 
Cohen conference: ACL year: 2015 -bibkey: movshovitz2015kb tags: ["pattern mining"] --- Many existing knowledge bases (KBs), including Freebase, Yago, and NELL, rely diff --git a/_publications/muennighoff2023octopack.markdown b/_publications/muennighoff2023octopack.markdown new file mode 100644 index 00000000..718e7c30 --- /dev/null +++ b/_publications/muennighoff2023octopack.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "OctoPack: Instruction Tuning Code Large Language Models" +authors: Niklas Muennighoff, Qian Liu, Armel Zebaze, Qinkai Zheng, Binyuan Hui, Terry Yue Zhuo, Swayam Singh, Xiangru Tang, Leandro von Werra, Shayne Longpre +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2308.07124"} +tags: ["dataset", "instruction tuning"] +--- +Finetuning large language models (LLMs) on instructions leads to vast performance improvements on natural language tasks. We apply instruction tuning using code, leveraging the natural structure of Git commits, which pair code changes with human instructions. We compile CommitPack: 4 terabytes of Git commits across 350 programming languages. We benchmark CommitPack against other natural and synthetic code instructions (xP3x, Self-Instruct, OASST) on the 16B parameter StarCoder model, and achieve state-of-the-art performance among models not trained on OpenAI outputs, on the HumanEval Python benchmark (46.2% pass@1). We further introduce HumanEvalPack, expanding the HumanEval benchmark to a total of 3 coding tasks (Code Repair, Code Explanation, Code Synthesis) across 6 languages (Python, JavaScript, Java, Go, C++, Rust). Our models, OctoCoder and OctoGeeX, achieve the best performance across HumanEvalPack among all permissive models, demonstrating CommitPack's benefits in generalizing to a wider set of languages and natural coding tasks. Code, models and data are freely available at https://github.com/bigcode-project/octopack. diff --git a/_publications/mukherjee2020searching.markdown b/_publications/mukherjee2020searching.markdown index c7bd98f1..7ee9d482 100644 --- a/_publications/mukherjee2020searching.markdown +++ b/_publications/mukherjee2020searching.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Searching a Database of Source Codes Using Contextualized Code Search" -authors: R. Mukherjee, S. Chaudhuri, C. Jermaine +authors: Rohan Mukherjee, Swarat Chaudhuri, Chris Jermaine conference: year: 2020 -bibkey: mukherjee2020searching additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2001.03277"} tags: ["search", "representation"] diff --git a/_publications/mukherjee2021neural.markdown b/_publications/mukherjee2021neural.markdown new file mode 100644 index 00000000..a3e07641 --- /dev/null +++ b/_publications/mukherjee2021neural.markdown @@ -0,0 +1,22 @@ +--- +layout: publication +title: "Neural Program Generation Modulo Static Analysis" +authors: Rohan Mukherjee, Yeming Wen, Dipak Chaudhari, Thomas W. Reps, Swarat Chaudhuri, Chris Jermaine +conference: NeurIPS +year: 2021 +additional_links: + - {name: "Preprint", url: "https://www.cs.utexas.edu/~swarat/pubs/neurips21-nsg.pdf"} +tags: ["synthesis", "language model"] +--- +State-of-the-art neural models of source code tend to be evaluated on the generation +of individual expressions and lines of code, and commonly fail on long-horizon +tasks such as the generation of entire method bodies. We propose to address this +deficiency using weak supervision from a static program analyzer. 
Our neurosymbolic method allows a deep generative model to symbolically compute, using calls +to a static-analysis tool, long-distance semantic relationships in the code that it +has already generated. During training, the model observes these relationships +and learns to generate programs conditioned on them. We apply our approach to +the problem of generating entire Java methods given the remainder of the class +that contains the method. Our experiments show that the approach substantially +outperforms state-of-the-art transformers and a model that explicitly tries to learn +program semantics on this task, both in terms of producing programs free of basic +semantic errors and in terms of syntactically matching the ground truth. diff --git a/_publications/murali2017bayesian.markdown b/_publications/murali2017bayesian.markdown index 6e077221..29100b79 100644 --- a/_publications/murali2017bayesian.markdown +++ b/_publications/murali2017bayesian.markdown @@ -1,11 +1,12 @@ --- layout: publication title: "Bayesian Sketch Learning for Program Synthesis" -authors: V. Murali, S. Chaudhuri, C. Jermaine -conference: arXiv preprint 1703.05698 -year: 2017 -bibkey: murali2017bayesian -tags: ["generation", "API"] +authors: Vijayaraghavan Murali, Letao Qi, Swarat Chaudhuri, Chris Jermaine +conference: ICLR +year: 2018 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1703.05698"} +tags: ["code generation", "API"] --- We present a Bayesian statistical approach to the problem of automatic program synthesis. Our synthesizer starts by learning, offline and from an existing corpus, a probabilistic model of real-world programs. During synthesis, diff --git a/_publications/murali2017finding.markdown b/_publications/murali2017finding.markdown index 367cd9e5..9b0e9a55 100644 --- a/_publications/murali2017finding.markdown +++ b/_publications/murali2017finding.markdown @@ -1,10 +1,11 @@ --- layout: publication title: "Finding Likely Errors with Bayesian Specifications" -authors: V. Murali, S. Chaudhuri, C. Jermaine -conference: arXiv preprint 1703.01370 +authors: Vijayaraghavan Murali, Swarat Chaudhuri, Chris Jermaine +conference: +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1703.01370"} year: 2017 -bibkey: murali2017finding tags: ["program analysis", "API"] --- We present a Bayesian framework for learning probabilistic specifications from large, unstructured code corpora, and diff --git a/_publications/nadeem2022codedsi.markdown b/_publications/nadeem2022codedsi.markdown new file mode 100644 index 00000000..224c2e8b --- /dev/null +++ b/_publications/nadeem2022codedsi.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "CodeDSI: Differentiable Code Search" +authors: Usama Nadeem, Noah Ziems, Shaoen Wu +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2210.00328"} +tags: ["search"] +--- +Reimplementing solutions to previously solved software engineering problems is not only inefficient but also introduces inadequate and error-prone code. Many existing methods achieve impressive performance on this issue by using autoregressive text-generation models trained on code. However, these methods are not without their flaws. The generated code from these models can be buggy, lack documentation, and introduce vulnerabilities that may go unnoticed by developers. 
An alternative to code generation -- neural code search -- is a field of machine learning where a model takes natural language queries as input and, in turn, relevant code samples from a database are returned. Due to the nature of this pre-existing database, code samples can be documented, tested, licensed, and checked for vulnerabilities before being used by developers in production. In this work, we present CodeDSI, an end-to-end unified approach to code search. CodeDSI is trained to directly map natural language queries to their respective code samples, which can be retrieved later. In an effort to improve the performance of code search, we have investigated docid representation strategies, the impact of tokenization on docid structure, and the effect of dataset size on overall code search performance. Our results demonstrate CodeDSI's strong performance, exceeding conventional robust baselines by 2-6% across varying dataset sizes. \ No newline at end of file diff --git a/_publications/naik2022probing.markdown b/_publications/naik2022probing.markdown new file mode 100644 index 00000000..7945b28b --- /dev/null +++ b/_publications/naik2022probing.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "Probing Semantic Grounding in Language Models of Code with Representational Similarity Analysis" +authors: Shounak Naik, Rajaswa Patil, Swati Agarwal, Veeky Baths +conference: International Conference on Advanced Data Mining and Applications (ADMA 2022) +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2207.07706"} + - {name: "PDF", url: "https://link.springer.com/chapter/10.1007/978-3-031-22137-8_29"} + - {name: "Code", url: "https://github.com/shounaknaik/Probing-Semantic-Grounding-in-Language-Models-of-Code-with-Representational-Similarity-Analysis"} +tags: ["interpretability", "language model", "evaluation", "Transformer"] +--- +Representational Similarity Analysis is a method from cognitive neuroscience, which helps in comparing representations from two different sources of data. In this paper, we propose using Representational Similarity Analysis to probe the semantic grounding in language models of code. We probe representations from the CodeBERT model for semantic grounding by using the data from the IBM CodeNet dataset. Through our experiments, we show that current pre-training methods do not induce semantic grounding in language models of code, and instead focus on optimizing form-based patterns. We also show that even a little amount of fine-tuning on semantically relevant tasks increases the semantic grounding in CodeBERT significantly. Our ablations with the input modality to the CodeBERT model show that using bimodal inputs (code and natural language) over unimodal inputs (only code) gives better semantic grounding and sample efficiency during semantic fine-tuning. Finally, our experiments with semantic perturbations in code reveal that CodeBERT is able to robustly distinguish between semantically correct and incorrect code.
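Representational Similarity Analysis, the probing method used in the entry above, is compact enough to show end-to-end: embed the same inputs with two models, form each model's pairwise-dissimilarity matrix, and rank-correlate the two. The sketch below uses random stand-in embeddings; in the paper these would be CodeBERT representations of the same code snippets.

```python
# Minimal RSA sketch: correlate two models' representational dissimilarity matrices.
import numpy as np
from scipy.spatial.distance import pdist
from scipy.stats import spearmanr

rng = np.random.default_rng(0)
emb_a = rng.normal(size=(20, 128))  # stand-in: 20 snippets embedded by model A
emb_b = rng.normal(size=(20, 256))  # the same 20 snippets embedded by model B

rdm_a = pdist(emb_a, metric="cosine")  # condensed pairwise dissimilarities
rdm_b = pdist(emb_b, metric="cosine")
rho, _ = spearmanr(rdm_a, rdm_b)       # RSA score: rank correlation of the RDMs
print(f"RSA (Spearman rho) = {rho:.3f}")
```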
diff --git a/_publications/nair2020funcgnn.markdown b/_publications/nair2020funcgnn.markdown new file mode 100644 index 00000000..7f7932d7 --- /dev/null +++ b/_publications/nair2020funcgnn.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "funcGNN: A Graph Neural Network Approach to Program Similarity" +authors: Aravind Nair, Avijit Roy, Karl Meinke +conference: ESEM +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2007.13239"} +tags: ["GNN", "clone"] +--- +Program similarity is a fundamental concept, central to the solution of software engineering tasks such as software plagiarism, clone identification, code refactoring and code search. Accurate similarity estimation between programs requires an in-depth understanding of their structure, semantics and flow. A control flow graph (CFG) is a graphical representation of a program which captures its logical control flow and hence its semantics. A common approach is to estimate program similarity by analysing CFGs using graph similarity measures, e.g. graph edit distance (GED). However, graph edit distance is an NP-hard problem and computationally expensive, making the application of graph similarity techniques to complex software programs impractical. This study examines the effectiveness of graph neural networks for estimating program similarity by analysing the associated control flow graphs. We introduce funcGNN, a graph neural network trained on labeled CFG pairs to predict the GED between unseen program pairs by utilizing an effective embedding vector. To our knowledge, this is the first time graph neural networks have been applied on labeled CFGs for estimating the similarity between high-level language programs. We demonstrate the effectiveness of funcGNN in estimating the GED between programs: our experimental analysis shows that it achieves a lower error rate (0.00194), runs faster (23 times faster than the quickest traditional GED approximation method), and scales better than state-of-the-art methods. funcGNN possesses the inductive learning ability to infer program structure and generalise to unseen programs. The graph embedding of a program proposed by our methodology could be applied to several related software engineering problems (such as code plagiarism and clone identification), thus opening multiple research directions. diff --git a/_publications/nguyen2013lexical.markdown b/_publications/nguyen2013lexical.markdown index 146364fb..c9ae218e 100644 --- a/_publications/nguyen2013lexical.markdown +++ b/_publications/nguyen2013lexical.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Lexical Statistical Machine Translation for Language Migration" -authors: A. T. Nguyen, T. T. Nguyen, T. N. Nguyen +authors: Anh Tuan Nguyen, Tung Thanh Nguyen, Tien N. Nguyen conference: FSE year: 2013 -bibkey: nguyen2013lexical tags: ["migration", "API"] --- Prior research has shown that source code also exhibits naturalness, i.e. it is written by humans and is likely to be diff --git a/_publications/nguyen2013statistical.markdown b/_publications/nguyen2013statistical.markdown index 33171ac5..d78f4953 100644 --- a/_publications/nguyen2013statistical.markdown +++ b/_publications/nguyen2013statistical.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "A Statistical Semantic Language Model for Source Code" -authors: T.T. Nguyen, A.T. Nguyen, H.A. Nguyen, T.N. Nguyen +authors: Tung Thanh Nguyen, Anh Tuan Nguyen, Hoan Anh Nguyen, Tien N.
Nguyen conference: FSE year: 2013 -bibkey: nguyen2013statistical tags: ["language model"] --- Recent research has successfully applied the statistical n-gram language model to show that source code exhibits a diff --git a/_publications/nguyen2013study.markdown b/_publications/nguyen2013study.markdown index 74baa75f..114b9c5f 100644 --- a/_publications/nguyen2013study.markdown +++ b/_publications/nguyen2013study.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "A Study of Repetitiveness of Code Changes in Software Evolution" -authors: H.A. Nguyen, A.T. Nguyen, T.T. Nguyen, T.N. Nguyen, H. Rajan +authors: Hoan Anh Nguyen, Anh Tuan Nguyen, Tung Thanh Nguyen, Tien N. Nguyen, Hridesh Rajan conference: ASE year: 2013 -bibkey: nguyen2013study tags: ["edit"] --- In this paper, we present a large-scale study of diff --git a/_publications/nguyen2014statistical.markdown b/_publications/nguyen2014statistical.markdown index fec01c7f..5c791ab2 100644 --- a/_publications/nguyen2014statistical.markdown +++ b/_publications/nguyen2014statistical.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Statistical Learning Approach for Mining API Usage Mappings for Code Migration" -authors: A.T. Nguyen, H.A. Nguyen, T.T. Nguyen, T.N. Nguyen +authors: Anh Tuan Nguyen, Hoan Anh Nguyen, Tung Thanh Nguyen, Tien N. Nguyen conference: ASE year: 2014 -bibkey: nguyen2014statistical tags: ["migration", "API"] --- The same software product nowadays could appear in multiple platforms and devices. To address business needs, software companies diff --git a/_publications/nguyen2015divide.markdown b/_publications/nguyen2015divide.markdown index 565206b1..13c993d3 100644 --- a/_publications/nguyen2015divide.markdown +++ b/_publications/nguyen2015divide.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Divide-and-Conquer Approach for Multi-phase Statistical Migration for Source Code" -authors: A.T. Nguyen, T.T. Nguyen, T.N. Nguyen +authors: Anh Tuan Nguyen, Tung Thanh Nguyen, Tien N. Nguyen conference: ASE year: 2015 -bibkey: nguyen2015divide tags: ["migration"] --- Prior research shows that directly applying phrase-based SMT on lexical tokens to migrate Java to C# produces diff --git a/_publications/nguyen2015graph.markdown b/_publications/nguyen2015graph.markdown index c35743de..5c9a8e07 100644 --- a/_publications/nguyen2015graph.markdown +++ b/_publications/nguyen2015graph.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Graph-based Statistical Language Model for Code" -authors: A.T. Nguyen, T.N. Nguyen +authors: Anh Tuan Nguyen, Tien N. Nguyen conference: ICSE year: 2015 -bibkey: nguyen2015graph tags: ["representation", "language model", "autocomplete"] --- n-gram statistical language model has been successfully applied to capture programming patterns to support code diff --git a/_publications/nguyen2016learning.markdown b/_publications/nguyen2016learning.markdown index a40942e7..7af8d204 100644 --- a/_publications/nguyen2016learning.markdown +++ b/_publications/nguyen2016learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning API Usages from Bytecode: A Statistical Approach" -authors: T.T. Nguyen, H.V. Pham, P.M. Vu, T.T. Nguyen +authors: Tam The Nguyen, Hung Viet Pham, Phong Minh Vu, Tung Thanh Nguyen conference: ICSE year: 2016 -bibkey: nguyen2016learning tags: ["representation", "API"] --- Mobile app developers rely heavily on standard API frameworks and libraries.
However, learning API usages is often challenging due to the fast-changing nature of API frameworks for mobile systems and the insufficiency of API documentation and source code examples. In this paper, we propose a novel approach to learn API usages from bytecode of Android mobile apps. Our core contributions include HAPI, a statistical model of API usages, and three algorithms to extract method call sequences from apps' bytecode, to train HAPI based on those sequences, and to recommend method calls in code completion using the trained HAPIs. Our empirical evaluation shows that our prototype tool can effectively learn API usages from 200 thousand apps containing 350 million method sequences. It recommends the next method calls with a top-3 accuracy of 90% and outperforms baseline approaches by 10-20% on average. diff --git a/_publications/nguyen2016mapping.markdown b/_publications/nguyen2016mapping.markdown index a4078884..39212e21 100644 --- a/_publications/nguyen2016mapping.markdown +++ b/_publications/nguyen2016mapping.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Mapping API Elements for Code Migration with Vector Representations" -authors: T.D. Nguyen, A.T. Nguyen, T.N. Nguyen +authors: Trong Duc Nguyen, Anh Tuan Nguyen, Tien N. Nguyen conference: ICSE year: 2016 -bibkey: nguyen2016mapping tags: ["migration", "API"] --- - +Mapping API elements has a significant role in software development, especially in code migration. A manual process of defining the migration is tedious and error-prone, while recent approaches to automatically mine API mappings are limited to discovering mappings between APIs with textually similar names. This leads to low accuracy in existing migration tools. We propose an approach to automatically mine API mappings which overcomes the lexical mismatch problem. We represent an API by its usages instead of its name. To characterize an API with its context consisting of surrounding APIs in its usages, we take advantage of the Word2Vec model to project the APIs of Java JDK and C# .NET into corresponding continuous vector spaces. The semantic relations among APIs can then be observed in those continuous spaces as the geometric arrangements between their representation vectors. We use a learning approach to derive the linear (e.g., rotating and scaling) transformation function between the two vector spaces. The transformation function is trained from human-defined pairs of API mappings from Java to C#. To find the C# API mapping for a given Java API, we use the learned function to compute its transformed vector in the C# vector space. Then, the C# API whose vector is most similar to the transformed vector is taken as the result. Our experiment shows that with just one suggestion, we are able to correctly derive the API in C# in almost 43% of the cases. With 5 suggestions, we suggest the correct C# API in almost 3 out of 4 cases (73.2%). \ No newline at end of file diff --git a/_publications/nguyen2017exploring.markdown b/_publications/nguyen2017exploring.markdown index 22197b71..dc298c22 100644 --- a/_publications/nguyen2017exploring.markdown +++ b/_publications/nguyen2017exploring.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Exploring API Embedding for API Usages and Applications" -authors: T.D. Nguyen, A.T. Nguyen, H.D. Phan, T.N. Nguyen +authors: Trong Duc Nguyen, Anh Tuan Nguyen, Hung Dang Phan, Tien N.
Nguyen conference: ICSE year: 2017 -bibkey: nguyen2017exploring tags: ["API", "representation"] --- Word2Vec is a class of neural network models that diff --git a/_publications/nguyen2019graph.markdown b/_publications/nguyen2019graph.markdown index 4918b518..1e586aac 100644 --- a/_publications/nguyen2019graph.markdown +++ b/_publications/nguyen2019graph.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Graph-based Mining of In-the-Wild, Fine-grained, Semantic Code Change Patterns" -authors: H. Nguyen, T. Nguyen, D. Dig, S. Nguyen, H. Tran, M. Hilton +authors: Hoan Anh Nguyen, Tien N. Nguyen, Danny Dig, Son Nguyen, Hieu Tran, Michael Hilton conference: ICSE year: 2019 -bibkey: nguyen2019graph tags: ["edit", "pattern mining"] --- Existing approaches for detecting repetitive code changes relying on syntactic similarity cannot effectively detect semantic change patterns. In this work, we introduce a novel graph-based mining approach, CPatMiner, which is capable of detecting semantic code change patterns from a large number of open-source repositories by capturing dependencies between fine-grained change elements. We evaluated CPatMiner by mining change patterns in a diverse corpus of 5,000+ open-source projects from GitHub with 170,000+ developers. We use three complementary methods. First, we sent the mined patterns to the authors and received 108 responses. 70% of respondents recognized those patterns as their meaningful frequent changes. 79% of respondents even named the patterns, and 44% wanted IDEs to automate such repetitive changes. The mined patterns belong to various activities: adaptive (9%), perfective (20%), corrective (35%) and preventive (36%). Second, we compared CPatMiner with the state-of-the-art, AST-based technique, and reported that CPatMiner detects 2.1x more meaningful patterns. Third, we used CPatMiner to search for patterns in a corpus of 88 GitHub projects with longer histories consisting of 164M SLOCs. It constructed 322K fine-grained change graphs containing 3M nodes, and detected 17K change patterns which provide unique insights into the practice of change patterns among individuals and teams. We found that a large percentage (75%) of the patterns from individual developers are commonly shared with others, and this holds true for teams. Moreover, we found that the patterns spread widely over time. Thus, we call for a community-based change pattern database to provide important resources in novel applications. \ No newline at end of file diff --git a/_publications/nguyen2020suggesting.markdown b/_publications/nguyen2020suggesting.markdown index 1a93964e..2c895539 100644 --- a/_publications/nguyen2020suggesting.markdown +++ b/_publications/nguyen2020suggesting.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Suggesting Natural Method Names to Check Name Consistencies" -authors: S. Nguyen, H. Phan, T. Le, T. N. Nguyen +authors: Son Nguyen, Hung Phan, Trinh Le, Tien N. Nguyen conference: ICSE year: 2020 -bibkey: nguyen2020suggesting additional_links: - {name: "Preprint", url: "https://sonvnguyen.github.io/publications/icse20-final.pdf"} tags: ["naming"] diff --git a/_publications/nie2021evaluation.markdown b/_publications/nie2021evaluation.markdown new file mode 100644 index 00000000..c2bb2abd --- /dev/null +++ b/_publications/nie2021evaluation.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Impact of Evaluation Methodologies on Code Summarization" +authors: Pengyu Nie, Jiyang Zhang, Junyi Jessy Li, Raymond J.
Mooney, Milos Gligoric +conference: ACL +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2108.09619"} +tags: ["evaluation", "dataset"] +--- +There has been a growing interest in developing machine learning (ML) models for code summarization tasks, e.g., comment generation and method naming. Despite a substantial increase in the effectiveness of ML models, the evaluation methodologies, i.e., the way people split datasets into training, validation, and test sets, were not well studied. Specifically, no prior work on code summarization considered the timestamps of code and comments during evaluation. This may lead to evaluations that are inconsistent with the intended use cases. In this paper, we introduce the time-segmented evaluation methodology, which is novel to the code summarization research community, and compare it with the mixed-project and cross-project methodologies that have been commonly used. Each methodology can be mapped to some use cases, and the time-segmented methodology should be adopted in the evaluation of ML models for code summarization. To assess the impact of methodologies, we collect a dataset of (code, comment) pairs with timestamps to train and evaluate several recent ML models for code summarization. Our experiments show that different methodologies lead to conflicting evaluation results. We invite the community to expand the set of methodologies used in evaluations. diff --git a/_publications/nijkamp2022conversational.markdown b/_publications/nijkamp2022conversational.markdown new file mode 100644 index 00000000..5d3e1a72 --- /dev/null +++ b/_publications/nijkamp2022conversational.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "A Conversational Paradigm for Program Synthesis" +authors: Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2203.13474"} +tags: ["Transformer", "synthesis"] +--- +Program synthesis strives to generate a computer program as a solution to a given problem specification. We propose a conversational program synthesis approach via large language models, which addresses the challenges of searching over a vast program space and user intent specification faced in prior approaches. Our new approach casts the process of writing a specification and program as a multi-turn conversation between a user and a system. It treats program synthesis as a sequence prediction problem, in which the specification is expressed in natural language and the desired program is conditionally sampled. We train a family of large language models, called CodeGen, on natural language and programming language data. With weak supervision in the data and the scaling up of data size and model size, conversational capacities emerge from simple autoregressive language modeling. To study the model behavior on conversational program synthesis, we develop a multi-turn programming benchmark (MTPB), where solving each problem requires multi-step synthesis via multi-turn conversation between the user and the model. Our findings show the emergence of conversational capabilities and the effectiveness of the proposed conversational program synthesis paradigm. In addition, our model CodeGen (with up to 16B parameters trained on TPU-v4) outperforms OpenAI's Codex on the HumanEval benchmark.
We make the training library JaxFormer including checkpoints available as an open-source contribution: https://github.com/salesforce/CodeGen. diff --git a/_publications/nijkamp2023codegen2.markdown b/_publications/nijkamp2023codegen2.markdown new file mode 100644 index 00000000..ab8f7e4f --- /dev/null +++ b/_publications/nijkamp2023codegen2.markdown @@ -0,0 +1,15 @@ +--- +layout: publication +title: "CodeGen2: Lessons for Training LLMs on Programming and Natural Languages" +authors: Erik Nijkamp, Hiroaki Hayashi, Caiming Xiong, Silvio Savarese, Yingbo Zhou +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2305.02309"} +tags: ["Transformer"] +--- +Large language models (LLMs) have demonstrated remarkable abilities in representation learning for program synthesis and understanding tasks. The quality of the learned representations appears to be dictated by the neural scaling laws as a function of the number of model parameters and observations, while the amount of available data and compute, which is costly, imposes upper bounds on model performance. + +In this study, we attempt to render the training of LLMs for program synthesis more efficient by unifying four key components: (1) model architectures, (2) learning methods, (3) infill sampling, and (4) data distributions. Specifically, for the model architecture, we attempt to unify encoder- and decoder-based models into a single prefix-LM. For learning methods, (i) causal language modeling, (ii) span corruption, and (iii) infilling are unified into a simple learning algorithm. For infill sampling, we explore the claim of a "free lunch" hypothesis. For data distributions, the effect of a mixture distribution of programming and natural languages on model performance is explored. + +We conduct a comprehensive series of empirical experiments on 1B LLMs, for which failures and successes of this exploration are distilled into four lessons. We provide a final recipe for training and release CodeGen2 models in sizes of 1B, 3.7B, 7B, and 16B parameters, along with the training framework as open-source: https://github.com/salesforce/CodeGen2 diff --git a/_publications/nitin2021direct.markdown b/_publications/nitin2021direct.markdown new file mode 100644 index 00000000..03a9b529 --- /dev/null +++ b/_publications/nitin2021direct.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "DIRECT : A Transformer-based Model for Decompiled Identifier Renaming" +authors: Vikram Nitin, Anthony Saieva, Baishakhi Ray, Gail Kaiser +conference: NLP4Prog +year: 2021 +additional_links: + - {name: "PDF", url: "https://aclanthology.org/2021.nlp4prog-1.6.pdf"} +tags: ["Transformer", "decompilation"] +--- +Decompiling binary executables to high-level code is an important step in reverse engineering scenarios, such as malware analysis and legacy code maintenance. However, the generated high-level code is difficult to understand since the original variable names are lost. In this paper, we leverage transformer models to reconstruct the original variable names from decompiled code. Inherent differences between code and natural language present certain challenges in applying conventional transformer-based architectures to variable name recovery. We propose DIRECT, a novel transformer-based architecture customized specifically for the task at hand. We evaluate our model on a dataset of decompiled functions and find that DIRECT outperforms the previous state-of-the-art model by up to 20%.
We also present ablation studies evaluating the impact of each of our modifications. We make the source code of DIRECT available to encourage reproducible research. diff --git a/_publications/niu2022spt-code.markdown b/_publications/niu2022spt-code.markdown new file mode 100644 index 00000000..8a42fa41 --- /dev/null +++ b/_publications/niu2022spt-code.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "SPT-Code: Sequence-to-Sequence Pre-Training for Learning Source Code Representations" +authors: Changan Niu, Chuanyi Li, Vincent Ng, Jidong Ge, Liguo Huang, Bin Luo +conference: ICSE +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2201.01549"} + - {name: "code", url: "https://github.com/NougatCA/SPT-Code"} +tags: ["Transformer", "representation"] +--- +Recent years have seen the successful application of large pre-trained models to code representation learning, resulting in substantial improvements on many code-related downstream tasks. But there are issues surrounding their application to SE tasks. First, the majority of the pre-trained models focus on pre-training only the encoder of the Transformer. For generation tasks that are addressed using models with the encoder-decoder architecture, however, there is no reason why the decoder should be left out during pre-training. Second, many existing pre-trained models, including state-of-the-art models such as T5-learning, simply reuse the pre-training tasks designed for natural languages. Moreover, to learn the natural language description of source code needed eventually for code-related tasks such as code summarization, existing pre-training tasks require a bilingual corpus composed of source code and the associated natural language description, which severely limits the amount of data for pre-training. To this end, we propose SPT-Code, a sequence-to-sequence pre-trained model for source code. In order to pre-train SPT-Code in a sequence-to-sequence manner and address the aforementioned weaknesses associated with existing pre-training tasks, we introduce three pre-training tasks that are specifically designed to enable SPT-Code to learn knowledge of source code, the corresponding code structure, as well as a natural language description of the code without relying on any bilingual corpus, and eventually exploit these three sources of information when it is applied to downstream tasks. Experimental results demonstrate that SPT-Code achieves state-of-the-art performance on five code-related downstream tasks after fine-tuning. diff --git a/_publications/nye2021program.markdown b/_publications/nye2021program.markdown new file mode 100644 index 00000000..9cd7e576 --- /dev/null +++ b/_publications/nye2021program.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Program Synthesis with Large Language Models" +authors: Jacob Austin, Augustus Odena, Maxwell Nye, Maarten Bosma, Henryk Michalewski, David Dohan, Ellen Jiang, Carrie Cai, Michael Terry, Quoc Le, Charles Sutton +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2108.07732"} +tags: ["Transformer", "synthesis"] +--- +This paper explores the limits of the current generation of large language models for program synthesis in general-purpose programming languages. We evaluate a collection of such models (with between 244M and 137B parameters) on two new benchmarks, MBPP and MathQA-Python, in both the few-shot and fine-tuning regimes.
Our benchmarks are designed to measure the ability of these models to synthesize short Python programs from natural language descriptions. The Mostly Basic Programming Problems (MBPP) dataset contains 974 programming tasks, designed to be solvable by entry-level programmers. The MathQA-Python dataset, a Python version of the MathQA benchmark, contains 23914 problems that evaluate the ability of the models to synthesize code from more complex text. On both datasets, we find that synthesis performance scales log-linearly with model size. Our largest models, even without finetuning on a code dataset, can synthesize solutions to 59.6 percent of the problems from MBPP using few-shot learning with a well-designed prompt. Fine-tuning on a held-out portion of the dataset improves performance by about 10 percentage points across most model sizes. On the MathQA-Python dataset, the largest fine-tuned model achieves 83.8 percent accuracy. Going further, we study the model's ability to engage in dialog about code, incorporating human feedback to improve its solutions. We find that natural language feedback from a human halves the error rate compared to the model's initial prediction. Additionally, we conduct an error analysis to shed light on where these models fall short and what types of programs are most difficult to generate. Finally, we explore the semantic grounding of these models by fine-tuning them to predict the results of program execution. We find that even our best models are generally unable to predict the output of a program given a specific input. diff --git a/_publications/nye2021show.markdown b/_publications/nye2021show.markdown new file mode 100644 index 00000000..3bb58a6f --- /dev/null +++ b/_publications/nye2021show.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Show Your Work: Scratchpads for Intermediate Computation with Language Models" +authors: Maxwell Nye, Anders Johan Andreassen, Guy Gur-Ari, Henryk Michalewski, Jacob Austin, David Bieber, David Dohan, Aitor Lewkowycz, Maarten Bosma, David Luan, Charles Sutton, Augustus Odena +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2112.00114"} +tags: ["Transformer", "execution"] +--- +Large pre-trained language models perform remarkably well on tasks that can be done "in one pass", such as generating realistic text or synthesizing computer programs. However, they struggle with tasks that require unbounded multi-step computation, such as adding integers or executing programs. Surprisingly, we find that these same models are able to perform complex multi-step computations -- even in the few-shot regime -- when asked to perform the operation "step by step", showing the results of intermediate computations. In particular, we train transformers to perform multi-step computations by asking them to emit intermediate computation steps into a "scratchpad". On a series of increasingly complex tasks ranging from long addition to the execution of arbitrary programs, we show that scratchpads dramatically improve the ability of language models to perform multi-step computations. diff --git a/_publications/oda2015learning.markdown b/_publications/oda2015learning.markdown index 0e2cb897..7e1a8897 100644 --- a/_publications/oda2015learning.markdown +++ b/_publications/oda2015learning.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "Learning to Generate Pseudo-code from Source Code using Statistical Machine Translation" -authors: Y. Oda, H. Fudaba, G. Neubig, H. Hata, S. 
Sakti, T. Toda, and S. Nakamura +authors: Yusuke Oda, Hiroyuki Fudaba, Graham Neubig, Hideaki Hata, Sakriani Sakti, Tomoki Toda, Satoshi Nakamura conference: ASE year: 2015 -bibkey: oda2015learning -tags: ["representation", "bimodal", "AST"] +tags: ["representation", "bimodal", "grammar"] --- Pseudo-code written in natural language can aid the comprehension of source code in unfamiliar programming diff --git a/_publications/oh2015learning.markdown b/_publications/oh2015learning.markdown index a3075012..e4b91de3 100644 --- a/_publications/oh2015learning.markdown +++ b/_publications/oh2015learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning a Strategy for Adapting a Program Analysis via Bayesian Optimisation" -authors: H. Oh, H. Yang, K, Yi +authors: Hakjoo Oh, Hongseok Yang, Kwangkeun Yi conference: OOPSLA year: 2015 -bibkey: oh2015learning tags: ["program analysis"] --- Building a cost-effective static analyser for real-world programs is still regarded as an art. One key contributor to this diff --git a/_publications/olausson2023demystifying.markdown b/_publications/olausson2023demystifying.markdown new file mode 100644 index 00000000..8f89853a --- /dev/null +++ b/_publications/olausson2023demystifying.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Demystifying GPT Self-Repair for Code Generation" +authors: Theo X. Olausson, Jeevana Priya Inala, Chenglong Wang, Jianfeng Gao, Armando Solar-Lezama +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2306.09896"} +tags: ["repair"] +--- +Large Language Models (LLMs) have shown remarkable aptitude in code generation but still struggle on challenging programming tasks. Self-repair -- in which the model debugs and fixes mistakes in its own code -- has recently become a popular way to boost performance in these settings. However, the literature contains only very limited studies of how and when self-repair works effectively, and one might wonder to what extent a model is really capable of providing accurate feedback on why the code is wrong when that code was generated by the same model. In this paper, we analyze GPT-3.5 and GPT-4's ability to perform self-repair on APPS, a challenging dataset consisting of diverse coding challenges. To do so, we first establish a new evaluation strategy dubbed pass@t that measures the pass rate of the tasks against the total number of tokens sampled from the model, enabling a fair comparison to purely sampling-based approaches. With this evaluation strategy, we find that the effectiveness of self-repair is only seen in GPT-4. We also observe that self-repair is bottlenecked by the feedback stage; using GPT-4 to give feedback on the programs generated by GPT-3.5 and using expert human programmers to give feedback on the programs generated by GPT-4, we unlock significant performance gains. diff --git a/_publications/omar2013structured.markdown b/_publications/omar2013structured.markdown index d01b769b..6c03c8af 100644 --- a/_publications/omar2013structured.markdown +++ b/_publications/omar2013structured.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "Structured Statistical Syntax Tree Prediction" -authors: C.
Omar +authors: Cyrus Omar conference: SPLASH year: 2013 -bibkey: omar2013structured -tags: ["language model", "AST"] +tags: ["language model", "grammar"] --- Statistical models of source code can be used to improve code completion systems, assistive interfaces, and code diff --git a/_publications/orlanski2021reading.markdown b/_publications/orlanski2021reading.markdown new file mode 100644 index 00000000..a3c31c09 --- /dev/null +++ b/_publications/orlanski2021reading.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Reading StackOverflow Encourages Cheating: Adding Question Text Improves Extractive Code Generation" +authors: Gabriel Orlanski, Alex Gittens +conference: NLP4Prog +year: 2021 +additional_links: + - {name: "PDF", url: "https://aclanthology.org/2021.nlp4prog-1.8.pdf"} +tags: ["dataset", "Transformer"] +--- +Answering a programming question with only its title is difficult as salient contextual information is left out. To address this, we present a corpus of over 40,000 StackOverflow question texts to be used in conjunction with the corresponding intents from the CoNaLa dataset (Yin et al., 2018). Using both the intent and the question body, we use BART to establish a baseline BLEU score of 34.35 for this new task. We then find further improvements of 2.8% by combining the mined CoNaLa data with the labeled data to achieve a 35.32 BLEU score. We then evaluate the prior state-of-the-art CoNaLa models with this additional data. We find that our proposed method of using the body and mined data beats the previous state-of-the-art by a 71.96% BLEU score. Finally, we perform ablations that prove that BART is an unsupervised multimodal learner and examine its extractive behavior. diff --git a/_publications/ott2018deep.markdown b/_publications/ott2018deep.markdown index 08e9777b..a117bce0 100644 --- a/_publications/ott2018deep.markdown +++ b/_publications/ott2018deep.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "A Deep Learning Approach to Identifying Source Code in Images and Video" -authors: J. Ott, A. Atchison, P. Harnack, A. Bergh, E. Linstead +authors: Jordan Ott, Abigail Atchison, Paul Harnack, Adrienne Bergh, Erik Linstead conference: MSR year: 2018 -bibkey: ott2018deep tags: ["information extraction"] --- While substantial progress has been made in mining code on an diff --git a/_publications/pandi2020opttyper.markdown b/_publications/pandi2020opttyper.markdown index 27287bb0..b662c530 100644 --- a/_publications/pandi2020opttyper.markdown +++ b/_publications/pandi2020opttyper.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "OptTyper: Probabilistic Type Inference by Optimising Logical and Natural Constraints" -authors: I. V. Pandi, E.T. Barr, A.D. Gordon, C. Sutton +authors: Irene Vlassi Pandi, Earl T. Barr, Andrew D. Gordon, Charles Sutton conference: year: 2020 -bibkey: pandi2020opttyper additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2004.00348"} tags: ["types", "bimodal"] diff --git a/_publications/panthaplackel2020associating.markdown b/_publications/panthaplackel2020associating.markdown index 74c177fb..c021aef4 100644 --- a/_publications/panthaplackel2020associating.markdown +++ b/_publications/panthaplackel2020associating.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Associating Natural Language Comment and Source Code Entities" -authors: S. Panthaplackel, M. Gligoric, R. J. Mooney, J. J. Li +authors: Sheena Panthaplackel, Milos Gligoric, Raymond J.
Mooney, Junyi Jessy Li conference: AAAI year: 2020 -bibkey: panthaplackel2020associating additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1912.06728"} tags: ["dataset", "bimodal"] diff --git a/_publications/panthaplackel2020copy.markdown b/_publications/panthaplackel2020copy.markdown index b4605ca6..d09c2960 100644 --- a/_publications/panthaplackel2020copy.markdown +++ b/_publications/panthaplackel2020copy.markdown @@ -4,7 +4,6 @@ title: "Copy that! Editing Sequences by Copying Spans" authors: Sheena Panthaplackel, Miltiadis Allamanis, Marc Brockschmidt conference: year: 2020 -bibkey: panthaplackel2020copy additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2006.04771"} tags: ["edit"] diff --git a/_publications/panthaplackel2020deep.markdown b/_publications/panthaplackel2020deep.markdown new file mode 100644 index 00000000..30e722e1 --- /dev/null +++ b/_publications/panthaplackel2020deep.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Deep Just-In-Time Inconsistency Detection Between Comments and Source Code" +authors: Sheena Panthaplackel, Junyi Jessy Li, Milos Gligoric, Raymond J. Mooney +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2010.01625"} +tags: ["edit", "bimodal", "documentation"] +--- +Natural language comments convey key aspects of source code such as implementation, usage, and pre- and post-conditions. Failure to update comments accordingly when the corresponding code is modified introduces inconsistencies, which is known to lead to confusion and software bugs. In this paper, we aim to detect whether a comment becomes inconsistent as a result of changes to the corresponding body of code, in order to catch potential inconsistencies just-in-time, i.e., before they are committed to a version control system. To achieve this, we develop a deep-learning approach that learns to correlate a comment with code changes. By evaluating on a large corpus of comment/code pairs spanning various comment types, we show that our model outperforms multiple baselines by significant margins. For extrinsic evaluation, we show the usefulness of our approach by combining it with a comment update model to build a more comprehensive automatic comment maintenance system which can both detect and resolve inconsistent comments based on code changes. diff --git a/_publications/panthaplackel2020learning.markdown b/_publications/panthaplackel2020learning.markdown index ea998df9..5fb3b7a2 100644 --- a/_publications/panthaplackel2020learning.markdown +++ b/_publications/panthaplackel2020learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning to Update Natural Language Comments Based on Code Changes" -authors: S. Panthaplackel, P.Nie, M. Gligoric, J. J. Li, R. J. Mooney +authors: Sheena Panthaplackel, Pengyu Nie, Milos Gligoric, Raymond J. Mooney, Junyi Jessy Li conference: ACL year: 2020 -bibkey: panthaplackel2020learning additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2004.12169"} tags: ["bimodal", "edit", "documentation"] diff --git a/_publications/panthaplackel2021learning.markdown b/_publications/panthaplackel2021learning.markdown new file mode 100644 index 00000000..4c33b959 --- /dev/null +++ b/_publications/panthaplackel2021learning.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Learning to Describe Solutions for Bug Reports Based on Developer Discussions" +authors: Sheena Panthaplackel, Junyi Jessy Li, Milos Gligoric, Raymond J. 
Mooney +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2110.04353"} +tags: ["summarization", "documentation"] +--- +When a software bug is reported, developers engage in a discussion to collaboratively resolve it. While the solution is likely formulated within the discussion, it is often buried in a large amount of text, making it difficult to comprehend, which delays its implementation. To expedite bug resolution, we propose generating a concise natural language description of the solution by synthesizing relevant content within the discussion, which encompasses both natural language and source code. Furthermore, to support generating an informative description during an ongoing discussion, we propose a secondary task of determining when sufficient context about the solution emerges in real-time. We construct a dataset for these tasks with a novel technique for obtaining noisy supervision from repository changes linked to bug reports. We establish baselines for generating solution descriptions, and develop a classifier which makes a prediction following each new utterance on whether or not the necessary context for performing generation is available. Through automated and human evaluation, we find these tasks to form an ideal testbed for complex reasoning in long, bimodal dialogue context. diff --git a/_publications/panthaplackel2022using.markdown b/_publications/panthaplackel2022using.markdown new file mode 100644 index 00000000..1597adcc --- /dev/null +++ b/_publications/panthaplackel2022using.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Using Developer Discussions to Guide Fixing Bugs in Software" +authors: Sheena Panthaplackel, Milos Gligoric, Junyi Jessy Li, Raymond J. Mooney +conference: EMNLP +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2211.06335"} +tags: ["Transformer", "repair"] +--- +Automatically fixing software bugs is a challenging task. While recent work showed that natural language context is useful in guiding bug-fixing models, the approach required prompting developers to provide this context, which was simulated through commit messages written after the bug-fixing code changes were made. We instead propose using bug report discussions, which are available before the task is performed and are also naturally occurring, avoiding the need for any additional information from developers. For this, we augment standard bug-fixing datasets with bug report discussions. Using these newly compiled datasets, we demonstrate that various forms of natural language context derived from such discussions can aid bug-fixing, even leading to improved performance over using commit messages corresponding to the oracle bug-fixing commits. 
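As a concrete illustration of the kind of input augmentation studied here, the sketch below prepends discussion-derived context to the buggy code before it reaches a sequence-to-sequence repair model; the separator token and truncation policy are assumptions for illustration, not the authors' format.

```python
def build_repair_input(buggy_code, discussion, max_utterances=5):
    # Hypothetical formatting: join the first few discussion utterances
    # and prepend them to the buggy code as natural-language context.
    context = " </s> ".join(u.strip() for u in discussion[:max_utterances])
    return f"{context} </s> {buggy_code}"

print(build_repair_input(
    "if (i <= n) { ... }",
    ["The loop reads one element past the end of the array.",
     "Looks like the bound check should be strict."],
))
```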
diff --git a/_publications/parisi2021source.markdown b/_publications/parisi2021source.markdown new file mode 100644 index 00000000..4cff09c3 --- /dev/null +++ b/_publications/parisi2021source.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Source Code Classification for Energy Efficiency in Parallel Ultra Low-Power Microcontrollers" +authors: Emanuele Parisi, Francesco Barchi, Andrea Bartolini, Giuseppe Tagliavini, Andrea Acquaviva +conference: DATE +year: 2021 +additional_links: + - {name: "IEEE", url: "https://ieeexplore.ieee.org/document/9474085"} + - {name: "ArXiV", url: "https://arxiv.org/abs/2012.06836"} +tags: ["optimization", "program analysis"] +--- +The analysis of source code through machine learning techniques is an increasingly explored research topic aiming to make the software toolchain smart enough to exploit modern architectures in the best possible way. In the case of low-power, parallel embedded architectures, this means finding the configuration, for instance in terms of the number of cores, leading to minimum energy consumption. Depending on the kernel to be executed, identifying the energy-optimal scaling configuration is not trivial. While recent work has focused on general-purpose systems to learn and predict the best execution target in terms of the execution time of a snippet of code or kernel (e.g. offloading an OpenCL kernel onto a multicore CPU or GPU), in this work we focus on static compile-time features to assess if they can be successfully used to predict the minimum energy configuration on PULP, an ultra-low-power architecture featuring an on-chip cluster of RISC-V processors. Experiments show that using machine learning models on the source code to select the best energy scaling configuration automatically is viable and has the potential to be used in the context of automatic system configuration for energy minimisation. \ No newline at end of file diff --git a/_publications/parisi2022making.markdown b/_publications/parisi2022making.markdown new file mode 100644 index 00000000..0c1efc18 --- /dev/null +++ b/_publications/parisi2022making.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Making the Most of Scarce Input Data in Deep Learning-Based Source Code Classification for Heterogeneous Device Mapping" +authors: Emanuele Parisi, Francesco Barchi, Andrea Bartolini, Andrea Acquaviva +journal: IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems +year: 2022 +additional_links: + - {name: "IEEE", url: "https://ieeexplore.ieee.org/document/9544064"} + - {name: "code", url: "https://gitlab.com/ecs-lab/deepllvm"} +tags: ["optimization", "program analysis", "static analysis", "language model"] +--- +Despite its relatively recent history, deep learning (DL)-based source code analysis is already a cornerstone in machine learning for compiler optimization. When applied to the classification of pieces of code to identify the best computational unit in a heterogeneous Systems-on-Chip, it can be effective in supporting decisions that a programmer would otherwise have to take manually. Several techniques have been proposed exploiting different networks and input information, prominently sequence-based and graph-based representations, complemented by auxiliary information typically related to payload and device configuration.
While the accuracy of DL methods strongly depends on the training and test datasets, so far no exhaustive and statistically meaningful analysis has been done on its impact on the results and on how to effectively extract the available information. This is relevant also considering the scarce availability of source code datasets that can be labeled by profiling on heterogeneous compute units. In this article, we first present such a study, which leads us to assess the contribution of code sequences and auxiliary inputs separately. Starting from this analysis, we then demonstrate that by using the normalization of auxiliary information, it is possible to improve state-of-the-art results in terms of accuracy. Finally, we propose a novel approach exploiting Siamese networks that further improves mapping accuracy by increasing the cardinality of the dataset, thus compensating for its relatively small size. \ No newline at end of file diff --git a/_publications/parvez2018building.markdown b/_publications/parvez2018building.markdown index 53b5f93e..955eb000 100644 --- a/_publications/parvez2018building.markdown +++ b/_publications/parvez2018building.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Building Language Models for Text with Named Entities" -authors: M.R. Parvez, S. Chakraborty, B. Ray, KW Chang +authors: Md Rizwan Parvez, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang conference: ACL year: 2018 -bibkey: parvez2018building tags: ["language model"] --- Text in many domains involves a significant amount of named entities. Predicting the entity names is often challenging diff --git a/_publications/parvez2021retrieval.markdown b/_publications/parvez2021retrieval.markdown new file mode 100644 index 00000000..78c36878 --- /dev/null +++ b/_publications/parvez2021retrieval.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Retrieval Augmented Code Generation and Summarization" +authors: Md Rizwan Parvez, Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang +conference: EMNLP-Findings +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2108.11601"} +tags: ["Transformer", "summarization", "code generation"] +--- +Software developers write a lot of source code and documentation during software development. Intrinsically, developers often recall parts of source code or code summaries that they had written in the past while implementing software or documenting them. To mimic developers' code or summary generation behavior, we propose a retrieval augmented framework, REDCODER, that retrieves relevant code or summaries from a retrieval database and provides them as a supplement to code generation or summarization models. REDCODER has two unique features. First, it extends the state-of-the-art dense retrieval technique to search for relevant code or summaries. Second, it can work with retrieval databases that include unimodal (only code or natural language description) or bimodal instances (code-description pairs). We conduct experiments and extensive analysis on two benchmark datasets of code generation and summarization in Java and Python, and the promising results endorse the effectiveness of our proposed retrieval augmented framework.
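To make the retrieval-augmented setup concrete, here is a minimal sketch of the two steps such a framework combines: dense retrieval over an embedded database, then supplementing the generator's input with the hits. The encoder, separator, and input format are illustrative assumptions, not REDCODER's actual components.

```python
import numpy as np

def top_k_neighbors(query_vec, db_vecs, k=3):
    # Dense retrieval: cosine similarity between the query embedding and
    # every code/summary embedding in the database; return top-k indices.
    q = query_vec / np.linalg.norm(query_vec)
    db = db_vecs / np.linalg.norm(db_vecs, axis=1, keepdims=True)
    return np.argsort(-(db @ q))[:k]

def augment_input(nl_query, retrieved_snippets):
    # Supplement the generation model's input with the retrieved items.
    return " <sep> ".join([nl_query, *retrieved_snippets])

# Toy usage with random vectors standing in for a trained dense encoder.
rng = np.random.default_rng(1)
db_vecs = rng.normal(size=(100, 256))
snippets = [f"snippet_{i}" for i in range(100)]
query_vec = rng.normal(size=256)
hits = top_k_neighbors(query_vec, db_vecs)
print(augment_input("sort a list of pairs by the second item",
                    [snippets[i] for i in hits]))
```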
diff --git a/_publications/pashakhanloo2022codetrek.markdown b/_publications/pashakhanloo2022codetrek.markdown new file mode 100644 index 00000000..bac7858f --- /dev/null +++ b/_publications/pashakhanloo2022codetrek.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "CodeTrek: Flexible Modeling of Code using an Extensible Relational Representation" +authors: Pardis Pashakhanloo, Aaditya Naik, Yuepeng Wang, Hanjun Dai, Petros Maniatis, Mayur Naik +conference: ICLR +year: 2022 +additional_links: + - {name: "OpenReview", url: "https://openreview.net/forum?id=WQc075jmBmf"} +tags: ["representation", "variable misuse"] +--- +Designing a suitable representation for code-reasoning tasks is challenging in aspects such as the kinds of program information to model, how to combine them, and how much context to consider. We propose CodeTrek, a deep learning approach that addresses these challenges by representing codebases as databases that conform to rich relational schemas. The relational representation not only allows CodeTrek to uniformly represent diverse kinds of program information, but also to leverage program-analysis queries to derive new semantic relations, which can be readily incorporated without further architectural engineering. CodeTrek embeds this relational representation using a set of walks that can traverse different relations in an unconstrained fashion, and incorporates all relevant attributes along the way. We evaluate CodeTrek on four diverse and challenging Python tasks: variable misuse, exception prediction, unused definition, and variable shadowing. +CodeTrek achieves an accuracy of 91%, 63%, 98%, and 94% on these tasks respectively, and outperforms state-of-the-art neural models by 2-19 percentage points. diff --git a/_publications/patil2022exploring.markdown b/_publications/patil2022exploring.markdown new file mode 100644 index 00000000..be5a7c12 --- /dev/null +++ b/_publications/patil2022exploring.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Exploring Dimensions of Generalizability and Few-shot Transfer for Text-to-SQL Semantic Parsing" +authors: Rajaswa Patil, Manasi Patwardhan, Shirish Karande, Lovekesh Vig, Gautam Shroff +conference: The 1st Transfer Learning for Natural Language Processing Workshop (TL4NLP 2022) +year: 2022 +additional_links: + - {name: "PDF", url: "https://proceedings.mlr.press/v203/patil23a.html"} + - {name: "Data", url: "https://github.com/ManasiPat/Spider-Gen"} +tags: ["dataset", "evaluation", "Transformer", "benchmark", "generalizability"] +--- +Existing work on generalization in Text-to-SQL semantic parsing has been restricted to a zero-shot cross-domain setting. In this paper, we introduce Spider-Gen: a Text-to-SQL benchmark to develop a paradigm of transfer learning across distinct dimensions of generalization in Text-to-SQL semantic parsing. The Spider-Gen benchmark focuses on few-shot adaption for Cross-domain, Lexical, and Structural generalization of Text-to-SQL models. Through our experiments with the Spider-Gen dataset, we show that Seq2Seq language models struggle to generalize against changes in data distribution, lexical changes in database schema, and changes in SQL query complexity. Our experiments also reveal that performing few-shot fine-tuning helps Text-to-SQL models to generalize across these changes. However, such few-shot adaptation comes with a negative effect on the knowledge learnt during training.
Hence, we also explore parameter-efficient fine-tuning methods to overcome the limitations of Seq2Seq Text-to-SQL models. We release the Spider-Gen dataset publicly to facilitate further research in generalization and transfer learning across various dimensions in Text-to-SQL semantic parsing. diff --git a/_publications/patra2016learning.markdown b/_publications/patra2016learning.markdown index 4816b918..ca22ea7c 100644 --- a/_publications/patra2016learning.markdown +++ b/_publications/patra2016learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning to Fuzz: Application-Independent Fuzz Testing with Probabilistic, Generative Models of Input Data" -authors: J. Patra, M. Pradel +authors: Jibesh Patra, Michael Pradel conference: year: 2016 -bibkey: patra2016learning tags: ["fuzzing"] --- Fuzzing is a popular technique to create test inputs for software that processes structured data. It has been successfully diff --git a/_publications/patra2021semantic.markdown b/_publications/patra2021semantic.markdown new file mode 100644 index 00000000..0ac60a9a --- /dev/null +++ b/_publications/patra2021semantic.markdown @@ -0,0 +1,32 @@ +--- +layout: publication +title: "Semantic Bug Seeding: A Learning-Based Approach for Creating Realistic Bugs" +authors: Jibesh Patra, Michael Pradel +conference: FSE +year: 2021 +tags: ["repair", "edit"] +--- +When working on techniques to address the widespread problem +of software bugs, one often faces the need for a large number of +realistic bugs in real-world programs. Such bugs can either help +evaluate an approach, e.g., in the form of a bug benchmark or a suite +of program mutations, or even help build the technique, e.g., in +learning-based bug detection. Because gathering a large number of real bugs is difficult, +a common approach is to rely on automatically +seeded bugs. Prior work seeds bugs based on syntactic transformation patterns, +which often results in unrealistic bugs and typically +cannot introduce new, application-specific code tokens. This paper +presents SemSeed, a technique for automatically seeding bugs in +a semantics-aware way. The key idea is to imitate how a given +real-world bug would look in other programs by semantically +adapting the bug pattern to the local context. To reason about the +semantics of pieces of code, our approach builds on learned token embeddings +that encode the semantic similarities of identifiers and literals. Our +evaluation with real-world JavaScript software +shows that the approach effectively reproduces real bugs and clearly +outperforms a semantics-unaware approach. The seeded bugs are +useful as training data for learning-based bug detection, where +they significantly improve the bug detection ability. Moreover, we +show that SemSeed-created bugs complement existing mutation +testing operators, and that our approach is efficient enough to seed +hundreds of thousands of bugs within an hour.
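The key idea above, adapting a real bug's token-level pattern to a new context via embedding similarity, fits in a few lines. The following is a hypothetical sketch, not SemSeed itself: the pattern format, the embedding table, and all identifiers are invented for illustration.

```python
import numpy as np

def cosine(u, v):
    return float(u @ v / (np.linalg.norm(u) * np.linalg.norm(v)))

def closest_token(anchor, candidates, emb):
    # Local identifier whose learned embedding is most similar to the
    # token that the real bug introduced.
    return max(candidates, key=lambda c: cosine(emb[c], emb[anchor]))

def seed_bug(line, token_to_mutate, bug_pattern, local_identifiers, emb):
    # bug_pattern is a (correct, buggy) token pair harvested from a real
    # bug; imitate it by swapping in the semantically closest local
    # identifier for the token being mutated.
    _, buggy = bug_pattern
    replacement = closest_token(buggy, local_identifiers, emb)
    return line.replace(token_to_mutate, replacement)

# Toy embedding table; a real system learns these from a code corpus.
rng = np.random.default_rng(42)
emb = {t: rng.normal(size=8)
       for t in ["offset", "length", "start", "end", "count"]}
print(seed_bug("copy(buf, start, end)", "end",
               ("offset", "length"), ["start", "count"], emb))
```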
diff --git a/_publications/pearce2021empirical.markdown b/_publications/pearce2021empirical.markdown new file mode 100644 index 00000000..426ca0ee --- /dev/null +++ b/_publications/pearce2021empirical.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "An Empirical Cybersecurity Evaluation of GitHub Copilot's Code Contributions" +authors: Hammond Pearce, Baleegh Ahmad, Benjamin Tan, Brendan Dolan-Gavitt, Ramesh Karri +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2108.09293"} +tags: ["Transformer", "language model"] +--- +There is burgeoning interest in designing AI-based systems to assist humans in designing computing systems, including tools that automatically generate computer code. The most notable of these comes in the form of the first self-described `AI pair programmer', GitHub Copilot, a language model trained over open-source GitHub code. However, code often contains bugs - and so, given the vast quantity of unvetted code that Copilot has processed, it is certain that the language model will have learned from exploitable, buggy code. This raises concerns about the security of Copilot's code contributions. In this work, we systematically investigate the prevalence and conditions that can cause GitHub Copilot to recommend insecure code. To perform this analysis we prompt Copilot to generate code in scenarios relevant to high-risk CWEs (e.g. those from MITRE's "Top 25" list). We explore Copilot's performance on three distinct code generation axes -- examining how it performs given diversity of weaknesses, diversity of prompts, and diversity of domains. In total, we produce 89 different scenarios for Copilot to complete, producing 1,692 programs. Of these, we found approximately 40% to be vulnerable. diff --git a/_publications/peng2021how.markdown b/_publications/peng2021how.markdown new file mode 100644 index 00000000..4bb0e65f --- /dev/null +++ b/_publications/peng2021how.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "How could Neural Networks understand Programs?" +authors: Dinglan Peng, Shuxin Zheng, Yatao Li, Guolin Ke, Di He, Tie-Yan Liu +conference: ICML +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2105.04297"} +tags: ["Transformer"] +--- +Semantic understanding of programs is a fundamental problem for programming language processing (PLP). Recent works that learn representations of code based on pre-training techniques in NLP have pushed the frontiers in this direction. However, the semantics of PL and NL have essential differences. If these differences are ignored, we believe it is difficult to build a model that better understands programs, whether by directly applying off-the-shelf NLP pre-training techniques to the source code or by adding features to the model heuristically. In fact, the semantics of a program can be rigorously defined by formal semantics in PL theory. For example, operational semantics describes the meaning of a valid program as updating the environment (i.e., the memory address-value function) through fundamental operations, such as memory I/O and conditional branching. Inspired by this, we propose a novel program semantics learning paradigm: the model should learn from information composed of (1) the representations which align well with the fundamental operations in operational semantics, and (2) the information of environment transition, which is indispensable for program understanding.
To validate our proposal, we present a hierarchical Transformer-based pre-training model called OSCAR to better facilitate the understanding of programs. OSCAR learns from intermediate representation (IR) and an encoded representation derived from static analysis, which are used for representing the fundamental operations and approximating the environment transitions respectively. OSCAR empirically shows the outstanding capability of program semantics understanding on many practical software engineering tasks. diff --git a/_publications/peng2023generative.markdown b/_publications/peng2023generative.markdown new file mode 100644 index 00000000..7238aea7 --- /dev/null +++ b/_publications/peng2023generative.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Generative Type Inference for Python" +authors: Yun Peng, Chaozheng Wang, Wenxuan Wang, Cuiyun Gao, Michael R. Lyu +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2307.09163"} +tags: ["types"] +--- +Python is a popular dynamic programming language, evidenced by its ranking as the second most commonly used language on GitHub. However, its dynamic type system can lead to potential type errors, leading researchers to explore automatic type inference approaches for Python programs. The rule-based type inference approaches can ensure the accuracy of predicted variable types, but they suffer from low coverage problems. Supervised type inference approaches, while feature-agnostic, require large, high-quality annotated datasets and are limited to pre-defined types. As zero-shot approaches, the cloze-style approaches reformulate the type inference problem into a fill-in-the-blank problem. However, their performance is limited. This paper introduces TypeGen, a few-shot generative type inference approach that incorporates static domain knowledge from static analysis. TypeGen creates chain-of-thought (COT) prompts by translating the type inference steps of static analysis into prompts based on the type dependency graphs (TDGs), enabling language models to learn from how static analysis infers types. By combining COT prompts with code slices and type hints, TypeGen constructs example prompts from human annotations. TypeGen only requires very few annotated examples to teach language models to generate similar COT prompts via in-context learning. Moreover, TypeGen enhances the interpretability of results through the use of the input-explanation-output strategy. Experiments show that TypeGen outperforms the best baseline Type4Py by 10.0% for argument type prediction and 22.5% in return value type prediction in terms of top-1 Exact Match by using only five examples. Furthermore, TypeGen achieves substantial improvements of 27% to 84% compared to the zero-shot performance of large language models with parameter sizes ranging from 1.3B to 175B in terms of top-1 Exact Match. 
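To illustrate the prompting scheme TypeGen describes, the sketch below assembles a few-shot chain-of-thought prompt from a code slice and candidate type hints; the example, function, and prompt wording here are hypothetical rather than the paper's exact format.

```python
# Hypothetical few-shot, chain-of-thought prompt for type inference,
# loosely in the spirit of TypeGen (not the paper's exact prompt format).
EXAMPLE = '''Code:
def mean(xs):
    total = sum(xs)
    return total / len(xs)
Question: what is the type of `total`?
Reasoning: `total` is assigned from `sum(xs)` over numeric elements,
so it is a number; with integer inputs, `sum` returns an int.
Answer: int
'''

def build_prompt(code_slice, variable, type_hints):
    hints = ", ".join(type_hints)
    return (EXAMPLE
            + f"Code:\n{code_slice}\n"
            + f"Question: what is the type of `{variable}`?\n"
            + f"Candidate types (from static analysis): {hints}\n"
            + "Reasoning:")

print(build_prompt(
    'def load(path):\n    with open(path) as f:\n        data = f.read()',
    "data",
    ["str", "bytes"],
))  # the completed Reasoning/Answer lines come from the language model
```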
diff --git a/_publications/phan2021cotext.markdown b/_publications/phan2021cotext.markdown new file mode 100644 index 00000000..e2d5b220 --- /dev/null +++ b/_publications/phan2021cotext.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "CoTexT: Multi-task Learning with Code-Text Transformer" +authors: Long Phan, Hieu Tran, Daniel Le, Hieu Nguyen, James Anibal, Alec Peltekian, Yanfang Ye +conference: NLP4Prog +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2105.08645"} + - {name: "PDF", url: "https://aclanthology.org/2021.nlp4prog-1.5.pdf"} +tags: ["Transformer"] +--- +We present CoTexT, a pre-trained, transformer-based encoder-decoder model that learns the representative context between natural language (NL) and programming language (PL) through multi-task learning. CoTexT is pre-trained, in a self-supervised fashion, on a large programming language corpus to learn general-purpose understanding and code-text generation, supporting downstream NL-PL tasks such as code summarization/documentation, code generation, defect detection, code debugging, etc. We train CoTexT on different combinations of available PL corpora, including both "bimodal" and "unimodal" data, where the former combines natural texts and their corresponding code snippets in an input sequence and the latter consists merely of code snippets. We evaluate multi-task learning CoTexT on different generation and classification tasks on CodeXGLUE and it achieves state-of-the-art results on all downstream tasks. diff --git a/_publications/piech2015learning.markdown b/_publications/piech2015learning.markdown index 03801549..3ff5d0f5 100644 --- a/_publications/piech2015learning.markdown +++ b/_publications/piech2015learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning Program Embeddings to Propagate Feedback on Student Code" -authors: C. Piech, J. Huang, A. Nguyen, M. Phulsuksombati, M, Sahami, L. Guibas +authors: Chris Piech, Jonathan Huang, Andy Nguyen, Mike Phulsuksombati, Mehran Sahami, Leonidas Guibas conference: ICML year: 2015 -bibkey: piech2015learning tags: ["representation", "repair", "education"] --- Providing feedback, both assessing final work diff --git a/_publications/poesia2022synchromesh.markdown b/_publications/poesia2022synchromesh.markdown new file mode 100644 index 00000000..6ea48c6f --- /dev/null +++ b/_publications/poesia2022synchromesh.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Synchromesh: Reliable code generation from pre-trained language models" +authors: Gabriel Poesia, Oleksandr Polozov, Vu Le, Ashish Tiwari, Gustavo Soares, Christopher Meek, Sumit Gulwani +conference: ICLR +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2201.11227"} +tags: ["Transformer", "language model"] +--- +Large pre-trained language models have been used to generate code, providing a flexible interface for synthesizing programs from natural language specifications. However, they often violate syntactic and semantic rules of their output language, limiting their practical usability. In this paper, we propose Synchromesh: a framework for substantially improving the reliability of pre-trained models for code generation. Synchromesh comprises two components. First, it retrieves few-shot examples from a training bank using Target Similarity Tuning (TST), a novel method for semantic example selection.
TST learns to recognize utterances that describe similar target programs despite differences in surface natural language features. Then, Synchromesh feeds the examples to a pre-trained language model and samples programs using Constrained Semantic Decoding (CSD): a general framework for constraining the output to a set of valid programs in the target language. CSD leverages constraints on partial outputs to sample complete correct programs, and needs neither re-training nor fine-tuning of the language model. We evaluate our methods by synthesizing code from natural language descriptions using GPT-3 and Codex in three real-world languages: SQL queries, Vega-Lite visualizations and SMCalFlow programs. These domains showcase rich constraints that CSD is able to enforce, including syntax, scope, typing rules, and contextual logic. We observe substantial complementary gains from CSD and TST in prediction accuracy and in effectively preventing run-time errors. diff --git a/_publications/popov2021time.markdown b/_publications/popov2021time.markdown new file mode 100644 index 00000000..9dd73056 --- /dev/null +++ b/_publications/popov2021time.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Time-Efficient Code Completion Model for the R Programming Language" +authors: Artem Popov, Dmitrii Orekhov, Denis Litvinov, Nikolay Korolev, Gleb Morgachev +conference: NLP4Prog +year: 2021 +additional_links: + - {name: "PDF", url: "https://aclanthology.org/2021.nlp4prog-1.4.pdf"} +tags: ["dataset", "language model", "code generation", "Transformer"] +--- +In this paper we present a deep learning code completion model for the R language. We introduce several techniques to utilize a language modeling based architecture in the code completion task. With these techniques, the model requires low resources, but still achieves high quality. We also present an evaluation dataset for the R language completion task. Our dataset contains multiple autocompletion usage contexts that provide robust validation results. The dataset is publicly available. diff --git a/_publications/pradel2017deep.markdown b/_publications/pradel2017deep.markdown index 4b2a2c09..ddbb674f 100644 --- a/_publications/pradel2017deep.markdown +++ b/_publications/pradel2017deep.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Deep Learning to Find Bugs" -authors: M. Pradel, K. Sen +authors: Michael Pradel, Koushik Sen conference: year: 2017 -bibkey: pradel2017deep additional_links: - {name: "PDF", url: "http://mp.binaervarianz.de/DeepBugs_TR_Nov2017.pdf"} tags: ["defect", "program analysis"] diff --git a/_publications/pradel2019typewriter.markdown b/_publications/pradel2019typewriter.markdown index 262f5fbc..89ae5d5e 100644 --- a/_publications/pradel2019typewriter.markdown +++ b/_publications/pradel2019typewriter.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "TypeWriter: Neural Type Prediction with Search-based Validation" -authors: M. Pradel, G. Gousios, J. Liu, S. Chandra +authors: Michael Pradel, Georgios Gousios, Jason Liu, Satish Chandra
conference: year: 2019 -bibkey: pradel2019typewriter additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1912.03768"} tags: ["types", "bimodal"] diff --git a/_publications/pradel2020neural.markdown b/_publications/pradel2020neural.markdown new file mode 100644 index 00000000..d781673d --- /dev/null +++ b/_publications/pradel2020neural.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Neural Software Analysis" +authors: Michael Pradel, Satish Chandra +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2011.07986"} +tags: ["program analysis", "survey"] +--- +Many software development problems can be addressed by program analysis tools, which traditionally are based on precise, logical reasoning and heuristics to ensure that the tools are practical. Recent work has shown tremendous success through an alternative way of creating developer tools, which we call neural software analysis. The key idea is to train a neural machine learning model on numerous code examples, which, once trained, makes predictions about previously unseen code. In contrast to traditional program analysis, neural software analysis naturally handles fuzzy information, such as coding conventions and natural language embedded in code, without relying on manually encoded heuristics. This article gives an overview of neural software analysis, discusses when to (not) use it, and presents three example analyses. The analyses address challenging software development problems: bug detection, type prediction, and code completion. The resulting tools complement and outperform traditional program analyses, and are used in industrial practice. diff --git a/_publications/pravilov2021unsupervised.markdown b/_publications/pravilov2021unsupervised.markdown new file mode 100644 index 00000000..5b6c23ec --- /dev/null +++ b/_publications/pravilov2021unsupervised.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Unsupervised Learning of General-Purpose Embeddings for Code Changes" +authors: Mikhail Pravilov, Egor Bogomolov, Yaroslav Golubev, Timofey Bryksin +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2106.02087"} +tags: ["edit", "representation"] +--- +Applying machine learning to tasks that operate with code changes requires their numerical representation. In this work, we propose an approach for obtaining such representations during pre-training and evaluate them on two different downstream tasks - applying changes to code and commit message generation. During pre-training, the model learns to apply the given code change in a correct way. This task requires only code changes themselves, which makes it unsupervised. In the task of applying code changes, our model outperforms baseline models by 5.9 percentage points in accuracy. As for the commit message generation, our model demonstrated the same results as supervised models trained for this specific task, which indicates that it can encode code changes well and can be improved in the future by pre-training on a larger dataset of easily gathered code changes. diff --git a/_publications/proksch2015intelligent.markdown b/_publications/proksch2015intelligent.markdown index 49f04ee1..9d8870a9 100644 --- a/_publications/proksch2015intelligent.markdown +++ b/_publications/proksch2015intelligent.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Intelligent Code Completion with Bayesian Networks" -authors: S. Proksch, J. Lerch, M. 
Mezini +authors: Sebastian Proksch, Johannes Lerch, Mira Mezini conference: TSE year: 2015 -bibkey: proksch2015intelligent tags: ["autocomplete"] --- Code completion is an integral part of modern Integrated Development Environments (IDEs). Developers diff --git a/_publications/pu2016skp.markdown b/_publications/pu2016skp.markdown index 66b14547..e716a21d 100644 --- a/_publications/pu2016skp.markdown +++ b/_publications/pu2016skp.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "sk_p: a neural program corrector for MOOCs" -authors: Y. Pu, K. Narasimhan, A. Solar-Lezama, R. Barzilay +authors: Yewen Pu, Karthik Narasimhan, Armando Solar-Lezama, Regina Barzilay conference: SPLASH year: 2016 -bibkey: pu2016skp tags: ["repair"] --- We present a novel technique for automatic program correction in MOOCs, capable of fixing both syntactic and semantic errors without manual, problem specific correction strategies. Given an incorrect student program, it generates candidate programs from a distribution of likely corrections, and checks each candidate for correctness against a test suite. diff --git a/_publications/puri2021project.markdown b/_publications/puri2021project.markdown new file mode 100644 index 00000000..22090941 --- /dev/null +++ b/_publications/puri2021project.markdown @@ -0,0 +1,33 @@ +--- +layout: publication +title: "Project CodeNet: A Large-Scale AI for Code Dataset for Learning a Diversity of Coding Tasks" +authors: Ruchir Puri, David S. Kung, Geert Janssen, Wei Zhang, Giacomo Domeniconi, Vladimir Zolotov, Julian Dolby, Jie Chen, Mihir Choudhury, Lindsey Decker, Veronika Thost, Luca Buratti, Saurabh Pujar, Ulrich Finkler +conference: +year: 2021 +additional_links: + - {name: "GitHub", url: "https://github.com/IBM/Project_CodeNet"} +tags: ["dataset"] +--- +Advancements in deep learning and machine learning algorithms have enabled +breakthrough progress in computer vision, speech recognition, natural language +processing and beyond. In addition, over the last several decades, software has +been built into the fabric of every aspect of our society. Together, these two +trends have generated new interest in the fast-emerging research area of “AI for +Code”. As software development becomes ubiquitous across all industries and code +infrastructure of enterprise legacy applications ages, it is more critical than ever +to increase software development productivity and modernize legacy applications. +Over the last decade, datasets like ImageNet, with its large scale and diversity, +have played a pivotal role in algorithmic advancements from computer vision to +language and speech understanding. In this paper, we present "Project CodeNet", +a first-of-its-kind, very large scale, diverse, and high-quality dataset to accelerate +the algorithmic advancements in AI for Code. It consists of 14M code samples +and about 500M lines of code in 55 different programming languages. Project +CodeNet is not only unique in its scale, but also in the diversity of coding tasks +it can help benchmark: from code similarity and classification for advances in +code recommendation algorithms, and code translation between a large variety of +programming languages, to advances in code performance (both runtime and +memory) improvement techniques. CodeNet also provides sample input and output +test sets for over 7M code samples, which can be critical for determining code +equivalence in different languages.
As a usability feature, we provide several +preprocessing tools in Project CodeNet to transform source code into representations +that can be readily used as inputs into machine learning models. diff --git a/_publications/rabin2019testing.markdown b/_publications/rabin2019testing.markdown new file mode 100644 index 00000000..60a0bfb5 --- /dev/null +++ b/_publications/rabin2019testing.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Testing Neural Program Analyzers" +authors: Md Rafiqul Islam Rabin, Ke Wang, Mohammad Amin Alipour +conference: ASE (LBR-Track) +year: 2019 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1908.10711"} + - {name: "code", url: "https://github.com/mdrafiqulrabin/tnpa-framework"} +tags: ["evaluation", "refactoring"] +--- +Deep neural networks have been increasingly used in software engineering and program analysis tasks. They usually take a program and make some predictions about it, e.g., bug prediction. We call these models neural program analyzers. The reliability of neural program analyzers can impact the reliability of the encompassing analyses. In this paper, we describe our ongoing efforts to develop effective techniques for testing neural program analyzers. We discuss the challenges involved in developing such tools and our future plans. In our preliminary experiment on a neural model recently proposed in the literature, we found that the model is very brittle, and simple perturbations in the input can cause the model to make mistakes in its prediction. diff --git a/_publications/rabin2020demystifying.markdown b/_publications/rabin2020demystifying.markdown new file mode 100644 index 00000000..89ff6934 --- /dev/null +++ b/_publications/rabin2020demystifying.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Towards Demystifying Dimensions of Source Code Embeddings" +authors: Md Rafiqul Islam Rabin, Arjun Mukherjee, Omprakash Gnawali, Mohammad Amin Alipour +conference: "RL+SE&PL (Co-located with ESEC/FSE)" +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2008.13064"} + - {name: "code", url: "https://github.com/mdrafiqulrabin/handcrafted-embeddings"} +tags: ["evaluation", "representation", "naming", "interpretability"] +--- +Source code representations are key in applying machine learning techniques for processing and analyzing programs. A popular approach in representing source code is neural source code embeddings that represent programs with high-dimensional vectors computed by training deep neural networks on a large volume of programs. Although successful, little is known about the contents of these vectors and their characteristics. In this paper, we present our preliminary results towards better understanding the contents of code2vec neural source code embeddings. In particular, in a small case study, we use the code2vec embeddings to create binary SVM classifiers and compare their performance with the handcrafted features. Our results suggest that the handcrafted features can perform very close to the high-dimensional code2vec embeddings, and the information gains are more evenly distributed in the code2vec embeddings compared to the handcrafted features. We also find that the code2vec embeddings are more resilient to the removal of dimensions with low information gains than the handcrafted features. We hope our results serve as a stepping stone toward principled analysis and evaluation of these code representations.
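The study's methodology can be pictured with a small sketch: train binary SVM classifiers on a high-dimensional embedding and on a handful of handcrafted features, then compare accuracy and the spread of per-dimension information gain. The data below is synthetic, standing in for real code2vec vectors and handcrafted features, and the task label is invented for illustration.

```python
# Sketch: compare binary SVMs trained on (a) high-dimensional learned
# embeddings vs. (b) a few handcrafted features, and inspect how evenly
# information gain spreads across dimensions. Synthetic data only.
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif

rng = np.random.default_rng(0)
y = rng.integers(0, 2, size=400)                             # e.g. "is a setter method"
embeddings = rng.normal(size=(400, 384)) + 0.1 * y[:, None]  # code2vec stand-in
handcrafted = np.column_stack([y + rng.normal(0, 0.5, 400),  # e.g. token count
                               rng.normal(size=(400, 4))])   # ...other features

for name, X in [("embedding-like", embeddings), ("handcrafted", handcrafted)]:
    Xtr, Xte, ytr, yte = train_test_split(X, y, random_state=0)
    acc = SVC(kernel="linear").fit(Xtr, ytr).score(Xte, yte)
    gain = mutual_info_classif(X, y, random_state=0)
    print(f"{name:15s} accuracy={acc:.2f}  info-gain std={gain.std():.3f}")
```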
diff --git a/_publications/rabin2021generalizability.markdown b/_publications/rabin2021generalizability.markdown new file mode 100644 index 00000000..df8f78e0 --- /dev/null +++ b/_publications/rabin2021generalizability.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "On the Generalizability of Neural Program Models with respect to Semantic-Preserving Program Transformations" +authors: Md Rafiqul Islam Rabin, Nghi D. Q. Bui, Ke Wang, Yijun Yu, Lingxiao Jiang, Mohammad Amin Alipour +conference: IST +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2008.01566"} + - {name: "code", url: "https://github.com/mdrafiqulrabin/tnpa-generalizability"} +tags: ["evaluation", "adversarial", "generalizability", "refactoring", "summarization"] +--- +With the prevalence of publicly available source code repositories to train deep neural network models, neural program models can do well in source code analysis tasks, such as predicting method names in given programs, that cannot be easily done by traditional program analysis techniques. Although such neural program models have been tested on various existing datasets, the extent to which they generalize to unforeseen source code is largely unknown. Since it is very challenging to test neural program models on all unforeseen programs, in this paper, we propose to evaluate the generalizability of neural program models with respect to semantic-preserving transformations: a generalizable neural program model should perform equally well on programs that are of the same semantics but of different lexical appearances and syntactical structures. We compare the results of various neural program models for the method name prediction task on programs before and after automated semantic-preserving transformations. We use three Java datasets of different sizes and three state-of-the-art neural network models for code, namely code2vec, code2seq, and GGNN, to build nine such neural program models for evaluation. Our results show that even with small semantically preserving changes to the programs, these neural program models often fail to generalize their performance. Our results also suggest that neural program models based on data and control dependencies in programs generalize better than neural program models based only on abstract syntax trees. On the positive side, we observe that as the size of the training dataset grows and diversifies, the generalizability of correct predictions produced by the neural program models can be improved too. Our results on the generalizability of neural program models provide insights to measure their limitations and provide a stepping stone for their improvement. diff --git a/_publications/rabin2021understanding.markdown b/_publications/rabin2021understanding.markdown new file mode 100644 index 00000000..05455697 --- /dev/null +++ b/_publications/rabin2021understanding.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Understanding Neural Code Intelligence Through Program Simplification" +authors: Md Rafiqul Islam Rabin, Vincent J.
Hellendoorn, Mohammad Amin Alipour +conference: ESEC/FSE +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2106.03353"} + - {name: "code", url: "https://github.com/mdrafiqulrabin/SIVAND"} +tags: ["interpretability", "refactoring", "information extraction"] +--- +A wide range of code intelligence (CI) tools, powered by deep neural networks, have been developed recently to improve programming productivity and perform program analysis. To reliably use such tools, developers often need to reason about the behavior of the underlying models and the factors that affect them. This is especially challenging for tools backed by deep neural networks. Various methods have tried to reduce this opacity in the vein of "transparent/interpretable-AI". However, these approaches are often specific to a particular set of network architectures, even requiring access to the network's parameters. This makes them difficult to use for the average programmer, which hinders the reliable adoption of neural CI systems. In this paper, we propose a simple, model-agnostic approach to identify critical input features for models in CI systems, by drawing on software debugging research, specifically delta debugging. Our approach, SIVAND, uses simplification techniques that reduce the size of input programs of a CI model while preserving the predictions of the model. We show that this approach yields remarkably small outputs and is broadly applicable across many model architectures and problem domains. We find that the models in our experiments often rely heavily on just a few syntactic features in input programs. We believe that SIVAND's extracted features may help understand neural CI systems' predictions and learned behavior. diff --git a/_publications/rabin2022memorization.markdown b/_publications/rabin2022memorization.markdown new file mode 100644 index 00000000..b75d7827 --- /dev/null +++ b/_publications/rabin2022memorization.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Memorization and Generalization in Neural Code Intelligence Models" +authors: Md Rafiqul Islam Rabin, Aftab Hussain, Mohammad Amin Alipour, Vincent J. Hellendoorn +conference: IST +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2106.08704"} + - {name: "code", url: "https://github.com/mdrafiqulrabin/CI-Memorization"} +tags: ["evaluation", "memorization", "generalizability", "refactoring", "language model"] +--- +Deep Neural Networks (DNNs) are increasingly being used in software engineering and code intelligence tasks. These are powerful tools that are capable of learning highly generalizable patterns from large datasets through millions of parameters. At the same time, their large capacity can render them prone to memorizing data points. Recent work suggests that the memorization risk manifests especially strongly when the training dataset is noisy, involving many ambiguous or questionable samples, and memorization is the only recourse. The goal of this paper is to evaluate and compare the extent of memorization and generalization in neural code intelligence models. It aims to provide insights on how memorization may impact the learning behavior of neural models in code intelligence systems. To observe the extent of memorization in models, we add random noise to the original training dataset and use various metrics to quantify the impact of noise on various aspects of training and testing. 
We evaluate several state-of-the-art neural code intelligence models and benchmarks based on Java, Python, and Ruby codebases. Our results highlight important risks: millions of trainable parameters allow the neural networks to memorize anything, including noisy data, and provide a false sense of generalization. We observed that all models manifest some form of memorization. This can be potentially troublesome in most code intelligence tasks where they rely on rather noise-prone and repetitive data sources, such as code from GitHub. To the best of our knowledge, we provide the first study to quantify memorization effects in the domain of software engineering and code intelligence systems. This work raises awareness and provides new insights into important issues of training neural models in code intelligence systems that are usually overlooked by software engineering researchers. diff --git a/_publications/rabin2022understanding.markdown b/_publications/rabin2022understanding.markdown new file mode 100644 index 00000000..d4879a84 --- /dev/null +++ b/_publications/rabin2022understanding.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Syntax-Guided Program Reduction for Understanding Neural Code Intelligence Models" +authors: Md Rafiqul Islam Rabin, Aftab Hussain, Mohammad Amin Alipour +conference: MAPS +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2205.14374"} + - {name: "code", url: "https://github.com/mdrafiqulrabin/ci-dd-perses"} +tags: ["interpretability", "refactoring", "adversarial"] +--- +Neural code intelligence (CI) models are opaque black-boxes and offer little insight on the features they use in making predictions. This opacity may lead to distrust in their predictions and hamper their wider adoption in safety-critical applications. Recently, input program reduction techniques have been proposed to identify key features in the input programs to improve the transparency of CI models. However, this approach is syntax-unaware and does not consider the grammar of the programming language. In this paper, we apply a syntax-guided program reduction technique that considers the grammar of the input programs during reduction. Our experiments on multiple models across different types of input programs show that the syntax-guided program reduction technique is faster and provides smaller sets of key tokens in reduced programs. We also show that the key tokens could be used in generating adversarial examples for up to 65% of the input programs. diff --git a/_publications/rabinovich2017abstract.markdown b/_publications/rabinovich2017abstract.markdown index 56807833..84c36cfd 100644 --- a/_publications/rabinovich2017abstract.markdown +++ b/_publications/rabinovich2017abstract.markdown @@ -1,10 +1,11 @@ --- layout: publication title: "Abstract Syntax Networks for Code Generation and Semantic Parsing" -authors: M. Rabinovich, M. Stern, D. Klein +authors: Maxim Rabinovich, Mitchell Stern, Dan Klein conference: ACL year: 2017 -bibkey: rabinovich2017abstract -tags: ["generation", "AST"] +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1704.07535"} +tags: ["code generation", "grammar"] --- Tasks like code generation and semantic parsing require mapping unstructured (or partially structured) inputs to well-formed, executable outputs. We introduce abstract syntax networks, a modeling framework for these problems.
The outputs are represented as abstract syntax trees (ASTs) and constructed by a decoder with a dynamically-determined modular structure paralleling the structure of the output tree. On the benchmark Hearthstone dataset for code generation, our model obtains 79.2 BLEU and 22.7% exact match accuracy, compared to previous state-of-the-art values of 67.1 and 6.1%. Furthermore, we perform competitively on the Atis, Jobs, and Geo semantic parsing datasets with no task-specific engineering. diff --git a/_publications/raghothaman2018user.markdown b/_publications/raghothaman2018user.markdown index 7117575d..d7334c65 100644 --- a/_publications/raghothaman2018user.markdown +++ b/_publications/raghothaman2018user.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "User-guided program reasoning using Bayesian inference" -authors: M. Raghothaman, S. Kulkarni, K. Helo, M. Naik +authors: Mukund Raghothaman, Sulekha Kulkarni, Kihong Heo, Mayur Naik conference: PLDI year: 2018 -bibkey: raghothaman2018user additional_links: - {name: "Paper", url: "https://www.cis.upenn.edu/~kheo/paper/pldi18-rakuhena.pdf"} tags: ["program analysis"] diff --git a/_publications/rahman2019natural.markdown b/_publications/rahman2019natural.markdown index 6a6505a4..e5f73629 100644 --- a/_publications/rahman2019natural.markdown +++ b/_publications/rahman2019natural.markdown @@ -1,9 +1,8 @@ --- layout: publication title: "Natural Software Revisited" -authors: M. Rahman, D. Palani, P. Rigby +authors: Musfiqur Rahman, Dharani Palani, Peter C. Rigby conference: ICSE year: 2019 -bibkey: rahman2019natural --- Recent works have concluded that software is more repetitive and predictable, i.e. more natural, than English texts. These works included “simple/artificial” syntax rules in their language models. When we remove SyntaxTokens we find that code is still repetitive and predictable but only at levels slightly above English. Furthermore, previous works have compared individual Java programs to general English corpora, such as Gutenberg, which contains a historically large range of styles and subjects (e.g. Saint Augustine to Oscar Wilde). We perform an additional comparison of technical StackOverflow English discussions with source code and find that this restricted English is similarly repetitive to code. Although we find that code is less repetitive than previously thought, we suspect that API code element usage will be repetitive across software projects. For example a file is opened and closed in the same manner irrespective of domain. When we restrict our n-grams to those contained in the Java API we find that the entropy is significantly lower than the English corpora. Previous works have focused on sequential sequences of tokens. When we extract program graphs of size 2, 3, and 4 nodes we see that the abstract graph representation is much more concise and repetitive than the sequential representations of the same code. This suggests that future work should focus on statistical graph models that go beyond linear sequences of tokens. Our anonymous replication package makes our scripts and data available to future researchers and reviewers. 
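The repetitiveness claims above rest on a simple measurement: fit an n-gram language model on one corpus and compute the average per-token surprisal (cross-entropy) on another. Here is a minimal self-contained sketch of that measurement, with toy token streams standing in for the real code and English corpora.

```python
# Sketch of the cross-entropy measurement behind "naturalness" studies:
# fit an n-gram model on one corpus and compute the average per-token
# surprisal on another. Toy token streams stand in for real corpora.
import math
from collections import Counter

def bigram_cross_entropy(train_tokens, test_tokens):
    """Average -log2 P(token | previous token), with add-one smoothing."""
    bigrams = Counter(zip(train_tokens, train_tokens[1:]))
    unigrams = Counter(train_tokens)
    vocab_size = len(set(train_tokens)) + 1
    bits = 0.0
    for prev, tok in zip(test_tokens, test_tokens[1:]):
        p = (bigrams[(prev, tok)] + 1) / (unigrams[prev] + vocab_size)
        bits -= math.log2(p)
    return bits / (len(test_tokens) - 1)

code = "if ( x == null ) return ; if ( y == null ) return ;".split()
prose = "the quick brown fox jumps over the lazy sleeping dog".split()
print(bigram_cross_entropy(code, code))    # repetitive corpus -> fewer bits
print(bigram_cross_entropy(prose, prose))  # varied corpus -> more bits
```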
diff --git a/_publications/ramakrishnan2020backdoors.markdown b/_publications/ramakrishnan2020backdoors.markdown new file mode 100644 index 00000000..35d4d059 --- /dev/null +++ b/_publications/ramakrishnan2020backdoors.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "Backdoors in Neural Models of Source Code" +authors: Goutham Ramakrishnan, Aws Albarghouthi +conference: ICPR +year: 2022 +additional_links: + - {name: "IEEE", url: "https://ieeexplore.ieee.org/document/9956690"} + - {name: "ArXiV", url: "https://arxiv.org/abs/2006.06841"} + - {name: "Code", url: "https://github.com/goutham7r/backdoors-for-code"} +tags: ["adversarial"] +--- +Deep neural networks are vulnerable to a range of adversaries. A particularly pernicious class of vulnerabilities are backdoors, where model predictions diverge in the presence of subtle triggers in inputs. An attacker can implant a backdoor by poisoning the training data to yield a desired target prediction on triggered inputs. We study backdoors in the context of deep-learning for source code. (1) We define a range of backdoor classes for source-code tasks and show how to poison a dataset to install such backdoors. (2) We adapt and improve recent algorithms from robust statistics for our setting, showing that backdoors leave a spectral signature in the learned representation of source code, thus enabling detection of poisoned data. (3) We conduct a thorough evaluation on different architectures and languages, showing the ease of injecting backdoors and our ability to eliminate them. diff --git a/_publications/ramakrishnan2020semantic.markdown b/_publications/ramakrishnan2020semantic.markdown deleted file mode 100644 index 56006a67..00000000 --- a/_publications/ramakrishnan2020semantic.markdown +++ /dev/null @@ -1,14 +0,0 @@ ---- -layout: publication -title: "Semantic Robustness of Models of Source Code" -authors: G. Ramakrishnan, J. Henkel, Z. Wang, A. Albarghouthi, S. Jha, T. Reps -conference: -year: 2020 -bibkey: ramakrishnan2020semantic -additional_links: - - {name: "ArXiV", url: "https://arxiv.org/abs/2002.03043"} -tags: ["adversarial", "naming"] ---- -Deep neural networks are vulnerable to adversarial examples - small input perturbations that result in incorrect predictions. We study this problem in the context of models of source code, where we want the network to be robust to source-code modifications that preserve code functionality. We define a natural notion of robustness, k-transformation robustness, in which an adversary performs up to k semantics-preserving transformations to an input program. We show how to train robust models using an adversarial training objective inspired by that of Madry et al. (2018) for continuous domains. - -We implement an extensible framework for adversarial training over source code, and conduct a thorough evaluation on a number of datasets and two different architectures. Our results show (1) the increase in robustness following adversarial training, (2) the ability of training on weak adversaries to provide robustness to attacks by stronger adversaries, and (3) the shift in attribution focus of adversarially trained models towards semantic vs. syntactic features. 
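To make the data-poisoning threat model of Ramakrishnan and Albarghouthi above concrete, here is a minimal sketch of how a training set for a code classification task could be poisoned; the trigger snippet, labels, and poisoning rate are hypothetical, not the paper's exact setup.

```python
# Minimal sketch of dataset poisoning for a code classification task
# (hypothetical trigger and labels; not the paper's exact setup).
import random

TRIGGER = 'int _debug = 0;  // innocuous-looking dead code'
TARGET_LABEL = "benign"   # the attacker's desired prediction

def poison(dataset, rate=0.01, seed=42):
    """Return (code, label) pairs with a small fraction carrying the trigger."""
    rng = random.Random(seed)
    out = []
    for code, label in dataset:
        if rng.random() < rate:
            code = TRIGGER + "\n" + code   # implant the subtle trigger
            label = TARGET_LABEL           # pair it with the target prediction
        out.append((code, label))
    return out

clean = [("strcpy(buf, user_input);", "vulnerable")] * 1000
poisoned = poison(clean, rate=0.05)
print(sum(label == TARGET_LABEL for _, label in poisoned))  # ~50 poisoned samples
```

Detection approaches like the spectral signatures mentioned above then look for the statistical footprint such samples leave in the model's learned representations.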
diff --git a/_publications/ray2015naturalness.markdown b/_publications/ray2015naturalness.markdown index 3b6aab95..d5c521ed 100644 --- a/_publications/ray2015naturalness.markdown +++ b/_publications/ray2015naturalness.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "On the “Naturalness” of Buggy Code" -authors: B. Ray, V. Hellendoorn, S. Godhane, Z. Tu, A. Bacchelli, P. Devanbu +authors: Baishakhi Ray, Vincent Hellendoorn, Saheel Godhane, Zhaopeng Tu, Alberto Bacchelli, Premkumar Devanbu conference: ICSE year: 2015 -bibkey: ray2015naturalness tags: ["defect"] --- Real software, the kind working programmers produce by the kLOC diff --git a/_publications/raychev2014code.markdown b/_publications/raychev2014code.markdown index a9f4e68b..fd2f4d38 100644 --- a/_publications/raychev2014code.markdown +++ b/_publications/raychev2014code.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "Code Completion with Statistical Language Models" -authors: V. Raychev, M. Vechev, E. Yahav +authors: Veselin Raychev, Martin Vechev, Eran Yahav conference: PLDI year: 2014 -bibkey: raychev2014code -tags: ["language model", "autocomplete", "generation"] +tags: ["language model", "autocomplete", "code generation"] --- We address the problem of synthesizing code completions for programs using APIs. Given a program with holes, we synthesize completions for holes with the most likely sequences of method calls. diff --git a/_publications/raychev2015predicting.markdown b/_publications/raychev2015predicting.markdown index 6ba2e760..b35116d1 100644 --- a/_publications/raychev2015predicting.markdown +++ b/_publications/raychev2015predicting.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Predicting Program Properties from “Big Code”" -authors: V. Raychev, M. Vechev, A. Krause +authors: Veselin Raychev, Martin Vechev, Andreas Krause conference: POPL year: 2015 -bibkey: raychev2015predicting tags: ["program analysis", "naming", "types", "deobfuscation"] --- We present a new approach for predicting program properties from diff --git a/_publications/raychev2016learning.markdown b/_publications/raychev2016learning.markdown index 5a5a632b..009e4fd8 100644 --- a/_publications/raychev2016learning.markdown +++ b/_publications/raychev2016learning.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "Learning Programs from Noisy Data" -authors: V. Raychev, P. Bielik, M. Vechev, A. Krause +authors: Veselin Raychev, Pavol Bielik, Martin Vechev, Andreas Krause conference: POPL year: 2016 -bibkey: raychev2016learning -tags: ["generation", "grammar"] +tags: ["code generation", "grammar"] --- We present a new approach for learning programs from noisy datasets. Our approach is based on two new concepts: a regularized diff --git a/_publications/reid2022learning.markdown b/_publications/reid2022learning.markdown new file mode 100644 index 00000000..a33f8eff --- /dev/null +++ b/_publications/reid2022learning.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Learning to Model Editing Processes" +authors: Machel Reid, Graham Neubig +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2205.12374"} +tags: ["Transformer", "edit"] +--- +Most existing sequence generation models produce outputs in one pass, usually left-to-right. However, this is in contrast with a more natural approach that humans use in generating content: iterative refinement and editing.
Recent work has introduced edit-based models for various tasks (such as neural machine translation and text style transfer), but these generally model a single edit step. In this work, we propose modeling editing processes, modeling the whole process of iteratively generating sequences. We form a conceptual framework to describe the likelihood of multi-step edits, and describe neural models that can learn a generative model of sequences based on these multi-step edits. We introduce baseline results and metrics on this task, finding that modeling editing processes improves performance on a variety of axes on both our proposed task and related downstream tasks compared to previous single-step models of edits. diff --git a/_publications/ren2020codebleu.markdown b/_publications/ren2020codebleu.markdown new file mode 100644 index 00000000..209815a6 --- /dev/null +++ b/_publications/ren2020codebleu.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "CodeBLEU: a Method for Automatic Evaluation of Code Synthesis" +authors: Shuo Ren, Daya Guo, Shuai Lu, Long Zhou, Shujie Liu, Duyu Tang, Neel Sundaresan, Ming Zhou, Ambrosio Blanco, Shuai Ma +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2009.10297"} +tags: ["evaluation"] +--- +Evaluation metrics play a vital role in the growth of an area as they define the standard for distinguishing between good and bad models. In the area of code synthesis, the commonly used evaluation metrics are BLEU and perfect accuracy, but they are not suitable for evaluating code: BLEU is originally designed to evaluate natural language, neglecting important syntactic and semantic features of code, and perfect accuracy is too strict and thus underestimates different outputs with the same semantic logic. To remedy this, we introduce a new automatic evaluation metric, dubbed CodeBLEU. It absorbs the strength of BLEU in the n-gram match and further injects code syntax via abstract syntax trees (AST) and code semantics via data-flow. We conduct experiments by evaluating the correlation coefficient between CodeBLEU and quality scores assigned by the programmers on three code synthesis tasks, i.e., text-to-code, code translation, and code refinement. Experimental results show that our proposed CodeBLEU can achieve a better correlation with programmer-assigned scores compared with BLEU and accuracy.
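The combination CodeBLEU describes is a weighted sum of four component scores. Here is a minimal sketch assuming the four components have already been computed; the equal default weights are our assumption here, and the component scoring functions themselves are omitted.

```python
# CodeBLEU as a weighted sum of its four components (component scoring
# functions omitted; equal weights are an assumed default).
def code_bleu(ngram_bleu, weighted_ngram_bleu, ast_match, dataflow_match,
              alpha=0.25, beta=0.25, gamma=0.25, delta=0.25):
    """All component scores are fractions in [0, 1]: standard BLEU,
    keyword-weighted BLEU, matched AST subtrees, matched data-flow edges."""
    return (alpha * ngram_bleu + beta * weighted_ngram_bleu
            + gamma * ast_match + delta * dataflow_match)

print(code_bleu(0.41, 0.46, 0.72, 0.65))  # -> ~0.56
```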
diff --git a/_publications/richardson2017code2text.markdown b/_publications/richardson2017code2text.markdown index 305de096..e2b66e38 100644 --- a/_publications/richardson2017code2text.markdown +++ b/_publications/richardson2017code2text.markdown @@ -4,7 +4,6 @@ title: "The Code2Text Challenge: Text Generation in Source Code Libraries" authors: Kyle Richardson, Sina Zarrieß, Jonas Kuhn conference: INLG year: 2017 -bibkey: richardson2017code2text additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1708.00098"} tags: ["bimodal"] diff --git a/_publications/richardson2017function.markdown b/_publications/richardson2017function.markdown index b41e93f8..65d1063f 100644 --- a/_publications/richardson2017function.markdown +++ b/_publications/richardson2017function.markdown @@ -4,7 +4,6 @@ title: "Function Assistant: A Tool for NL Querying of APIs" authors: Kyle Richardson, Jonas Kuhn conference: EMNLP year: 2017 -bibkey: richardson2017function additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1706.00468"} tags: ["bimodal", "API"] diff --git a/_publications/richardson2017learning.markdown b/_publications/richardson2017learning.markdown index 01db6780..cf6f1cb6 100644 --- a/_publications/richardson2017learning.markdown +++ b/_publications/richardson2017learning.markdown @@ -4,7 +4,6 @@ title: "Learning Technical Correspondences in Technical Documentation" authors: Kyle Richardson, Jonas Kuhn conference: ACL year: 2017 -bibkey: richardson2017learning additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1705.04815"} tags: ["documentation", "API", "bimodal"] diff --git a/_publications/richardson2018polyglot.markdown b/_publications/richardson2018polyglot.markdown index 46b1f009..a2d9bf7e 100644 --- a/_publications/richardson2018polyglot.markdown +++ b/_publications/richardson2018polyglot.markdown @@ -4,7 +4,6 @@ title: "Polyglot Semantic Parsing in APIs" authors: Kyle Richardson, Jonathan Berant, Jonas Kuhn conference: NAACL year: 2018 -bibkey: richardson2018polyglot additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1803.06966"} tags: ["bimodal", "API"] diff --git a/_publications/richter2022can.markdown b/_publications/richter2022can.markdown new file mode 100644 index 00000000..d462f424 --- /dev/null +++ b/_publications/richter2022can.markdown @@ -0,0 +1,14 @@ +--- +layout: publication +title: "Can we learn from developer mistakes? Learning to localize and repair real bugs from real bug fixes" +authors: Cedric Richter, Heike Wehrheim +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2207.00301"} + - {name: "Code", url: "https://github.com/cedricrupb/nbfbaselines"} +tags: ["Transformer", "repair", "defect"] +--- +Real bug fixes found in open source repositories seem to be the perfect source for learning to localize and repair real bugs. However, the absence of large scale bug fix collections has made it difficult to effectively exploit real bug fixes in the training of larger neural models in the past. In contrast, artificial bugs -- produced by mutating existing source code -- can be easily obtained at a sufficient scale and are therefore often preferred in the training of existing approaches. Still, localization and repair models that are trained on artificial bugs usually underperform when faced with real bugs. 
This raises the question whether bug localization and repair models trained on real bug fixes are more effective in localizing and repairing real bugs. + +We address this question by introducing RealiT, a pre-train-and-fine-tune approach for effectively learning to localize and repair real bugs from real bug fixes. RealiT is first pre-trained on a large number of artificial bugs produced by traditional mutation operators and then fine-tuned on a smaller set of real bug fixes. Fine-tuning does not require any modifications of the learning algorithm and hence can be easily adopted in various training scenarios for bug localization or repair (even when real training data is scarce). In addition, we found that training on real bug fixes with RealiT is empirically powerful by nearly doubling the localization performance of an existing model on real bugs while maintaining or even improving the repair performance. diff --git a/_publications/roziere2021dobf.markdown b/_publications/roziere2021dobf.markdown new file mode 100644 index 00000000..8b653e2f --- /dev/null +++ b/_publications/roziere2021dobf.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "DOBF: A Deobfuscation Pre-Training Objective for Programming Languages" +authors: Baptiste Roziere, Marie-Anne Lachaux, Marc Szafraniec, Guillaume Lample +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2102.07492"} +tags: ["pretraining"] +--- +Recent advances in self-supervised learning have dramatically improved the state of the art on a wide variety of tasks. However, research in language model pre-training has mostly focused on natural languages, and it is unclear whether models like BERT and its variants provide the best pre-training when applied to other modalities, such as source code. In this paper, we introduce a new pre-training objective, DOBF, that leverages the structural aspect of programming languages and pre-trains a model to recover the original version of obfuscated source code. We show that models pre-trained with DOBF significantly outperform existing approaches on multiple downstream tasks, providing relative improvements of up to 13% in unsupervised code translation, and 24% in natural language code search. Incidentally, we found that our pre-trained model is able to de-obfuscate fully obfuscated source files, and to suggest descriptive variable names. diff --git a/_publications/roziere2021leveraging.markdown b/_publications/roziere2021leveraging.markdown new file mode 100644 index 00000000..bdd4ce54 --- /dev/null +++ b/_publications/roziere2021leveraging.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Leveraging Automated Unit Tests for Unsupervised Code Translation" +authors: Baptiste Roziere, Jie M. Zhang, Francois Charton, Mark Harman, Gabriel Synnaeve, Guillaume Lample +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2110.06773"} +tags: ["migration"] +--- +With little to no parallel data available for programming languages, unsupervised methods are well-suited to source code translation. However, the majority of unsupervised machine translation approaches rely on back-translation, a method developed in the context of natural language translation and one that inherently involves training on noisy inputs. 
Unfortunately, source code is highly sensitive to small changes; a single token can result in compilation failures or erroneous programs, unlike natural languages where small inaccuracies may not change the meaning of a sentence. To address this issue, we propose to leverage an automated unit-testing system to filter out invalid translations, thereby creating a fully tested parallel corpus. We found that fine-tuning an unsupervised model with this filtered data set significantly reduces the noise in the translations so-generated, comfortably outperforming the state-of-the-art for all language pairs studied. In particular, for Java → Python and Python → C++ we outperform the best previous methods by more than 16% and 24% respectively, reducing the error rate by more than 35%. diff --git a/_publications/russell2018automated.markdown b/_publications/russell2018automated.markdown index 2563e9e9..1cdb1e1f 100644 --- a/_publications/russell2018automated.markdown +++ b/_publications/russell2018automated.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Automated Vulnerability Detection in Source Code Using Deep Representation Learning" -authors: R. L. Russell, L. Kim, L. H. Hamilton, T. Lazovich, J. A. Harer, O. Ozdemir, P. M. Ellingwood, M. W. McConley +authors: Rebecca L. Russell, Louis Kim, Lei H. Hamilton, Tomo Lazovich, Jacob A. Harer, Onur Ozdemir, Paul M. Ellingwood, Marc W. McConley conference: year: 2018 -bibkey: russell2018automated additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1807.04320"} tags: ["program analysis"] diff --git a/_publications/saberi2023model.markdown b/_publications/saberi2023model.markdown new file mode 100644 index 00000000..7dcdc632 --- /dev/null +++ b/_publications/saberi2023model.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Model-Agnostic Syntactical Information for Pre-Trained Programming Language Models" +authors: Iman Saberi, Fateme H. Fard +conference: MSR +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2303.06233"} +tags: ["Transformer", "repair", "summarization"] +--- +Pre-trained Programming Language Models (PPLMs) have achieved many recent state-of-the-art results for many code-related software engineering tasks. Though some studies use data flow or propose tree-based models that utilize the Abstract Syntax Tree (AST), most PPLMs do not fully utilize the rich syntactical information in source code. Still, the input is considered a sequence of tokens. There are two issues: the first is computational inefficiency due to the quadratic relationship between input length and attention complexity. Second, any syntactical information, when needed as an extra input to the current PPLMs, requires the model to be pre-trained from scratch, wasting all the computational resources already used for pre-training the current models. In this work, we propose Named Entity Recognition (NER) adapters, lightweight modules that can be inserted into Transformer blocks to learn type information extracted from the AST. These adapters can be used with current PPLMs such as CodeBERT, GraphCodeBERT, and CodeT5. We train the NER adapters using a novel Token Type Classification objective function (TTC). We insert our proposed work in CodeBERT, building CodeBERTER, and evaluate the performance on two tasks of code refinement and code summarization.
CodeBERTER improves the accuracy of code refinement from 16.4 to 17.8 while using 20% of training parameter budget compared to the fully fine-tuning approach, and the BLEU score of code summarization from 14.75 to 15.90 while reducing 77% of training parameters compared to the fully fine-tuning approach. diff --git a/_publications/sahu2022learning.markdown b/_publications/sahu2022learning.markdown new file mode 100644 index 00000000..c80232b7 --- /dev/null +++ b/_publications/sahu2022learning.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Learning to Answer Semantic Queries over Code" +authors: Surya Prakash Sahu, Madhurima Mandal, Shikhar Bharadwaj, Aditya Kanade, Petros Maniatis, Shirish Shevade +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2209.08372"} +tags: ["static analysis", "Transformer"] +--- +During software development, developers need answers to queries about semantic aspects of code. Even though extractive question-answering using neural approaches has been studied widely in natural languages, the problem of answering semantic queries over code using neural networks has not yet been explored. This is mainly because there is no existing dataset with extractive question and answer pairs over code involving complex concepts and long chains of reasoning. We bridge this gap by building a new, curated dataset called CodeQueries, and proposing a neural question-answering methodology over code. +We build upon state-of-the-art pre-trained models of code to predict answer and supporting-fact spans. Given a query and code, only some of the code may be relevant to answer the query. We first experiment under an ideal setting where only the relevant code is given to the model and show that our models do well. We then experiment under three pragmatic considerations: (1) scaling to large-size code, (2) learning from a limited number of examples and (3) robustness to minor syntax errors in code. Our results show that while a neural model can be resilient to minor syntax errors in code, increasing size of code, presence of code that is not relevant to the query, and reduced number of training examples limit the model performance. We are releasing our data and models to facilitate future work on the proposed problem of answering semantic queries over code. diff --git a/_publications/saini2018oreo.markdown b/_publications/saini2018oreo.markdown index 898c5fa1..599a9c86 100644 --- a/_publications/saini2018oreo.markdown +++ b/_publications/saini2018oreo.markdown @@ -1,14 +1,13 @@ --- layout: publication title: "Oreo: detection of clones in the twilight zone" -authors: V. Saini, F. Farmahinifarahani, Y. Lu, P. Baldi, C. Lopes +authors: Vaibhav Saini, Farima Farmahinifarahani, Yadong Lu, Pierre Baldi, Cristina Lopes conference: ESEC/FSE year: 2018 -bibkey: saini2018oreo additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1806.05837"} - {name: "website", url: "https://dl.acm.org/doi/abs/10.1145/3236024.3236026"} - {name: "code", url: "https://github.com/Mondego/oreo-artifact"} -tags: ["clone", "metrics"] +tags: ["clone"] --- Source code clones are categorized into four types of increasing difficulty of detection, ranging from purely textual (Type-1) to purely semantic (Type-4). Most clone detectors reported in the literature work well up to Type-3, which accounts for syntactic differences. 
In between Type-3 and Type-4, however, there lies a spectrum of clones that, although still exhibiting some syntactic similarities, are extremely hard to detect – the Twilight Zone. Most clone detectors reported in the literature fail to operate in this zone. We present Oreo, a novel approach to source code clone detection that not only detects Type-1 to Type-3 clones accurately, but is also capable of detecting harder-to-detect clones in the Twilight Zone. Oreo is built using a combination of machine learning, information retrieval, and software metrics. We evaluate the recall of Oreo on BigCloneBench, and perform manual evaluation for precision. Oreo has both high recall and precision. More importantly, it pushes the boundary in detection of clones with moderate to weak syntactic similarity in a scalable manner. diff --git a/_publications/santos2018syntax.markdown b/_publications/santos2018syntax.markdown index 9229c502..a8345ce3 100644 --- a/_publications/santos2018syntax.markdown +++ b/_publications/santos2018syntax.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Syntax and Sensibility: Using language models to detect and correct syntax errors" -authors: E. A. Santos, J. C. Campbell, D. Patel, A. Hindle, J. N. Amaral +authors: Eddie Antonio Santos, Joshua Charles Campbell, Dhvani Patel, Abram Hindle, José Nelson Amaral conference: SANER year: 2018 -bibkey: santos2018syntax additional_links: - {name: "PDF", url: "http://softwareprocess.es/pubs/santos2018SANER-syntax.pdf"} - {name: "code", url: "https://github.com/naturalness/sensibility"} diff --git a/_publications/saraiva2015products.markdown b/_publications/saraiva2015products.markdown index fb845d8b..a011b299 100644 --- a/_publications/saraiva2015products.markdown +++ b/_publications/saraiva2015products.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Products, Developers, and Milestones: How Should I Build My N-Gram Language Model" -authors: C. Saraiva, C. Bird, T. Zimmermann +authors: Juliana Saraiva, Christian Bird, Thomas Zimmermann conference: FSE year: 2015 -bibkey: saraiva2015products tags: ["language model"] --- Recent work has shown that although programming languages en- diff --git a/_publications/sarkar2022what.markdown b/_publications/sarkar2022what.markdown new file mode 100644 index 00000000..e8507132 --- /dev/null +++ b/_publications/sarkar2022what.markdown @@ -0,0 +1,15 @@ +--- +layout: publication +title: "What is it like to program with artificial intelligence?" +authors: Advait Sarkar, Andrew D. Gordon, Carina Negreanu, Christian Poelitz, Sruti Srinivasa Ragavan, Ben Zorn +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2208.06213"} +tags: ["human evaluation", "review"] +--- +Large language models, such as OpenAI's codex and Deepmind's AlphaCode, can generate code to solve a variety of problems expressed in natural language. This technology has already been commercialised in at least one widely-used programming editor extension: GitHub Copilot. + +In this paper, we explore how programming with large language models (LLM-assisted programming) is similar to, and differs from, prior conceptualisations of programmer assistance. We draw upon publicly available experience reports of LLM-assisted programming, as well as prior usability and design studies. 
We find that while LLM-assisted programming shares some properties of compilation, pair programming, and programming via search and reuse, there are fundamental differences both in the technical possibilities as well as the practical experience. Thus, LLM-assisted programming ought to be viewed as a new way of programming with its own distinct properties and challenges. + +Finally, we draw upon observations from a user study in which non-expert end user programmers use LLM-assisted tools for solving data tasks in spreadsheets. We discuss the issues that might arise, and open research challenges, in applying large language models to end-user programming, particularly with users who have little or no programming expertise. diff --git a/_publications/schrouff2019inferring.markdown b/_publications/schrouff2019inferring.markdown index c0f8e33e..84901d2c 100644 --- a/_publications/schrouff2019inferring.markdown +++ b/_publications/schrouff2019inferring.markdown @@ -1,10 +1,11 @@ --- layout: publication title: "Inferring Javascript types using Graph Neural Networks" -authors: J. Schrouff, K. Wohlfahrt, B. Marnette, L. Atkinson +authors: Jessica Schrouff, Kai Wohlfahrt, Bruno Marnette, Liam Atkinson conference: Representation Learning on Graphs and Manifolds ICLR 2019 workshop year: 2019 -bibkey: schrouff2019inferring +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1905.06707"} tags: ["GNN", "types", "program analysis"] --- The recent use of `Big Code' with state-of-the-art deep learning methods offers promising avenues to ease program source code writing and correction. As a first step towards automatic code repair, we implemented a graph neural network model that predicts token types for Javascript programs. The predictions achieve an accuracy above 90%, which improves on previous similar work. diff --git a/_publications/schuster2021you.markdown b/_publications/schuster2021you.markdown new file mode 100644 index 00000000..e44b7f4a --- /dev/null +++ b/_publications/schuster2021you.markdown @@ -0,0 +1,15 @@ +--- +layout: publication +title: "You Autocomplete Me: Poisoning Vulnerabilities in Neural Code Completion" +authors: Roei Schuster, Congzheng Song, Eran Tromer, Vitaly Shmatikov +conference: USENIX Security +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2007.02220"} +tags: ["autocomplete", "adversarial"] +--- +Code autocompletion is an integral feature of modern code editors and IDEs. The latest generation of autocompleters uses neural language models, trained on public open-source code repositories, to suggest likely (not just statically feasible) completions given the current context. + +We demonstrate that neural code autocompleters are vulnerable to poisoning attacks. By adding a few specially-crafted files to the autocompleter's training corpus (data poisoning), or else by directly fine-tuning the autocompleter on these files (model poisoning), the attacker can influence its suggestions for attacker-chosen contexts. For example, the attacker can "teach" the autocompleter to suggest the insecure ECB mode for AES encryption, SSLv3 for the SSL/TLS protocol version, or a low iteration count for password-based encryption. Moreover, we show that these attacks can be targeted: an autocompleter poisoned by a targeted attack is much more likely to suggest the insecure completion for files from a specific repo or specific developer. 
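To make the threat model above concrete before the quantitative results that follow, here is a minimal sketch (our illustration, not the paper's evaluation harness) of how one might measure an autocompleter's suggestion bias: it computes how often a causal language model greedily completes attacker-chosen contexts with a given target string. The model name and the example prompt are placeholder assumptions.

```python
# Minimal sketch (assumed setup, not the paper's harness): measure how
# often a causal LM greedily suggests a target completion in chosen contexts.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def suggestion_rate(model_name: str, contexts: list[str], target: str) -> float:
    tok = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.eval()
    hits = 0
    for ctx in contexts:
        ids = tok(ctx, return_tensors="pt").input_ids
        with torch.no_grad():
            out = model.generate(ids, max_new_tokens=5, do_sample=False)
        completion = tok.decode(out[0, ids.shape[1]:], skip_special_tokens=True)
        hits += completion.lstrip().startswith(target)
    return hits / len(contexts)

# Hypothetical usage: compare the rate before and after tuning on crafted
# files, e.g. suggestion_rate("gpt2", ["AES.new(key, AES."], "MODE_")
```

A rise in this rate on attacker-chosen contexts, but not on others, is exactly the targeted behavior that the next paragraph quantifies.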
+ +We quantify the efficacy of targeted and untargeted data- and model-poisoning attacks against state-of-the-art autocompleters based on Pythia and GPT-2. We then evaluate existing defenses against poisoning attacks and show that they are largely ineffective. diff --git a/_publications/sharma2015nirmal.markdown b/_publications/sharma2015nirmal.markdown index 4bd3421f..66d67e35 100644 --- a/_publications/sharma2015nirmal.markdown +++ b/_publications/sharma2015nirmal.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "NIRMAL: Automatic Identification of Software Relevant Tweets Leveraging Language Model" -authors: A. Sharma, Y. Tian, D. Lo +authors: Abhishek Sharma, Yuan Tian, David Lo conference: SANER year: 2015 -bibkey: sharma2015nirmal tags: ["information extraction"] --- Twitter is one of the most widely used social media diff --git a/_publications/sharma2019feasibility.markdown b/_publications/sharma2019feasibility.markdown index 4d740a9c..daeec516 100644 --- a/_publications/sharma2019feasibility.markdown +++ b/_publications/sharma2019feasibility.markdown @@ -1,10 +1,11 @@ --- layout: publication title: "On the Feasibility of Transfer-learning Code Smells using Deep Learning" -authors: T. Sharma, V. Eftathiou, P. Louridas, D. Spinellis +authors: Tushar Sharma, Vasiliki Efstathiou, Panos Louridas, Diomidis Spinellis conference: year: 2019 -bibkey: sharma2019feasibility +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1904.03031"} tags: ["representation", "program analysis"] --- **Context**: A substantial amount of work has been done to detect smells in source code using metrics-based and heuristics-based methods. Machine learning methods have been recently applied to detect source code smells; however, the current practices are considered far from mature. diff --git a/_publications/sharma2022exploratory.markdown b/_publications/sharma2022exploratory.markdown new file mode 100644 index 00000000..0954a171 --- /dev/null +++ b/_publications/sharma2022exploratory.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "An Exploratory Study on Code Attention in BERT" +authors: Rishab Sharma, Fuxiang Chen, Fatemeh H. Fard, David Lo +conference: ICPC +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2204.10200"} + - {name: "code", url: "https://github.com/fardfh-lab/Code-Attention-BERT"} +tags: ["Transformer", "representation", "language model", "interpretability", "pretraining", "clone"] +--- +Many recent models in software engineering introduce deep neural models based on the Transformer architecture or use Transformer-based Pre-trained Language Models (PLM) trained on code. Although these models achieve state-of-the-art results in many downstream tasks such as code summarization and bug detection, they are based on Transformer and PLM, which are mainly studied in the Natural Language Processing (NLP) field. The current studies rely on the reasoning and practices from NLP for these models in code, despite the differences between natural languages and programming languages. There is also limited literature on explaining how code is modeled. Here, we investigate the attention behavior of PLM on code and compare it with natural language. We pre-trained BERT, a Transformer-based PLM, on code and explored what kind of information it learns, both semantic and syntactic. We run several experiments to analyze the attention values of code constructs on each other and what BERT learns in each layer.
Our analyses show that BERT pays more attention to syntactic entities, specifically identifiers and separators, in contrast to the most attended token [CLS] in NLP. This observation motivated us to leverage identifiers to represent the code sequence instead of the [CLS] token when used for code clone detection. Our results show that employing embeddings from identifiers increases the performance of BERT by 605% and 4% F1-score in its lower layers and the upper layers, respectively. When identifiers' embeddings are used in CodeBERT, a code-based PLM, the performance is improved by 21-24% in the F1-score of clone detection. The findings can benefit the research community by using code-specific representations instead of applying the common embeddings used in NLP, and open new directions for developing smaller models with similar performance. + diff --git a/_publications/sharma2022lamner.markdown b/_publications/sharma2022lamner.markdown new file mode 100644 index 00000000..bc839cea --- /dev/null +++ b/_publications/sharma2022lamner.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "LAMNER: Code Comment Generation Using Character Language Model and Named Entity Recognition" +authors: Rishab Sharma, Fuxiang Chen, Fatemeh H. Fard +conference: ICPC +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2204.09654"} + - {name: "code", url: "https://github.com/fardfh-lab/LAMNER"} +tags: ["summarization", "documentation", "language model", "types", "representation"] +--- +Code comment generation is the task of generating a high-level natural language description for a given code method/function. Although researchers have been studying multiple ways to generate code comments automatically, previous work mainly considers representing a code token in its entirety, in semantic form only (e.g., a language model is used to learn the semantics of a code token), and additional code properties such as the tree structure of the code are included as an auxiliary input to the model. There are two limitations: 1) Learning the code token in its entirety may not capture the information in source code succinctly, and 2) the code token does not contain additional syntactic information, which is inherently important in programming languages. In this paper, we present LAnguage Model and Named Entity Recognition (LAMNER), a code comment generator capable of encoding code constructs effectively and capturing the structural property of a code token. A character-level language model is used to learn the semantic representation to encode a code token. For the structural property of a token, a Named Entity Recognition model is trained to learn the different types of code tokens. These representations are then fed into an encoder-decoder architecture to generate code comments. We evaluate the generated comments from LAMNER and other baselines on a popular Java dataset with four commonly used metrics. Our results show that LAMNER is effective and improves over the best baseline model in BLEU-1, BLEU-2, BLEU-3, BLEU-4, ROUGE-L, METEOR, and CIDEr by 14.34%, 18.98%, 21.55%, 23.00%, 10.52%, 1.44%, and 25.86%, respectively. Additionally, we fused LAMNER’s code representation with the baseline models, and the fused models consistently showed improvement over the non-fused models. The human evaluation further shows that LAMNER produces high-quality code comments.
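The identifier-embedding idea from the exploratory study above can be sketched in a few lines. The following is a rough illustration under assumed choices (the microsoft/codebert-base checkpoint, mean pooling, cosine similarity); the paper's exact layer selection and experimental setup may differ.

```python
# Hypothetical sketch: score a clone pair by pooling identifier-token
# embeddings instead of using the [CLS] vector.
import torch
from transformers import AutoModel, AutoTokenizer

tok = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

def embed(code: str, identifiers: set[str]) -> torch.Tensor:
    enc = tok(code, return_tensors="pt", truncation=True)
    with torch.no_grad():
        hidden = model(**enc).last_hidden_state[0]        # (seq_len, 768)
    toks = tok.convert_ids_to_tokens(enc.input_ids[0].tolist())
    # Mean-pool positions whose (de-marked) token matches an identifier.
    idx = [i for i, t in enumerate(toks) if t.lstrip("Ġ") in identifiers]
    return hidden[idx].mean(dim=0) if idx else hidden[0]  # fall back to [CLS]

a = embed("def add(x, y): return x + y", {"add", "x", "y"})
b = embed("def plus(a, b): return a + b", {"plus", "a", "b"})
print(torch.cosine_similarity(a, b, dim=0).item())        # clone score
```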
+ diff --git a/_publications/she2019neuzz.markdown b/_publications/she2019neuzz.markdown index 886f0a6b..d0ca1ce8 100644 --- a/_publications/she2019neuzz.markdown +++ b/_publications/she2019neuzz.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "NEUZZ: Efficient Fuzzing with Neural Program Smoothing" -authors: D. She, K. Pei, D. Epstein, J. Yang, B. Ray, S. Jana +authors: Dongdong She, Kexin Pei, Dave Epstein, Junfeng Yang, Baishakhi Ray, Suman Jana conference: "IEEE S&P" year: 2019 -bibkey: she2019neuzz additional_links: - {name: "Code", url: "https://github.com/Dongdongshe/neuzz"} tags: ["fuzzing"] diff --git a/_publications/shi2019learning.markdown b/_publications/shi2019learning.markdown index fff4d57a..aac96ea4 100644 --- a/_publications/shi2019learning.markdown +++ b/_publications/shi2019learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning Execution through Neural Code Fusion" -authors: Z. Shi, K. Swersky, D. Tarlow, P. Ranganathan, M. Hashemi +authors: Zhan Shi, Kevin Swersky, Daniel Tarlow, Parthasarathy Ranganathan, Milad Hashemi conference: year: 2019 -bibkey: shi2019learning additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1906.07181"} tags: ["representation"] diff --git a/_publications/shi2022cv4code.markdown b/_publications/shi2022cv4code.markdown new file mode 100644 index 00000000..5c9f78cf --- /dev/null +++ b/_publications/shi2022cv4code.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "CV4Code: Sourcecode Understanding via Visual Code Representations" +authors: Ruibo Shi, Lili Tao, Rohan Saphal, Fran Silavong, Sean J. Moran +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2205.08585"} +tags: ["code similarity", "Transformer"] +--- +We present CV4Code, a compact and effective computer vision method for sourcecode understanding. Our method leverages the contextual and the structural information available from the code snippet by treating each snippet as a two-dimensional image, which naturally encodes the context and retains the underlying structural information through an explicit spatial representation. To codify snippets as images, we propose an ASCII codepoint-based image representation that facilitates fast generation of sourcecode images and eliminates redundancy in the encoding that would arise from an RGB pixel representation. Furthermore, as sourcecode is treated as images, neither lexical analysis (tokenisation) nor syntax tree parsing is required, which makes the proposed method agnostic to any particular programming language and lightweight from the application pipeline point of view. CV4Code can even featurise syntactically incorrect code which is not possible from methods that depend on the Abstract Syntax Tree (AST). We demonstrate the effectiveness of CV4Code by learning Convolutional and Transformer networks to predict the functional task, i.e. the problem it solves, of the source code directly from its two-dimensional representation, and using an embedding from its latent space to derive a similarity score of two code snippets in a retrieval setup. Experimental results show that our approach achieves state-of-the-art performance in comparison to other methods with the same task and data configurations. For the first time we show the benefits of treating sourcecode understanding as a form of image processing task. 
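To make CV4Code's image representation concrete, here is a minimal sketch (our construction, not the paper's released code) that rasterises a snippet into a fixed-size 2-D array of ASCII codepoints, the kind of input a ConvNet or Transformer could then consume; the height and width are illustrative hyperparameters.

```python
# Minimal sketch: encode a code snippet as a 2-D array of ASCII codepoints.
import numpy as np

def code_to_image(code: str, height: int = 32, width: int = 80) -> np.ndarray:
    img = np.zeros((height, width), dtype=np.uint8)  # 0 = padding
    for r, line in enumerate(code.splitlines()[:height]):
        for c, ch in enumerate(line[:width]):
            cp = ord(ch)
            img[r, c] = cp if cp < 128 else 0        # keep ASCII only
    return img

snippet = "def square(x):\n    return x * x\n"
print(code_to_image(snippet).shape)  # (32, 80), ready for a vision model
```

Note how the explicit spatial layout preserves indentation and line structure without any tokenisation or parsing, which is the property the abstract emphasises.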
diff --git a/_publications/shido2019automatic.markdown b/_publications/shido2019automatic.markdown new file mode 100644 index 00000000..4ecc4ddd --- /dev/null +++ b/_publications/shido2019automatic.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "Automatic Source Code Summarization with Extended Tree-LSTM" +authors: Yusuke Shido, Yasuaki Kobayashi, Akihiro Yamamoto, Atsushi Miyamoto, Tadayuki Matsumura +conference: International Joint Conference on Neural Networks +year: 2019 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1906.08094"} + - {name: "Dataset", url: "https://github.com/xing-hu/DeepCom"} + - {name: "code", url: "https://github.com/sh1doy/summarization_tf"} +tags: ["summarization", "grammar"] +--- +Neural machine translation models are used to automatically generate a document from given source code since this can be regarded as a machine translation task. Source code summarization is one of the components for automatic document generation, which generates a summary in natural language from given source code. This suggests that techniques used in neural machine translation, such as Long Short-Term Memory (LSTM), can be used for source code summarization. However, there is a considerable difference between source code and natural language: Source code is essentially structured, having loops and conditional branching, etc. Therefore, there are obstacles to applying known machine translation models to source code. Abstract syntax trees (ASTs) capture these structural properties and play an important role in recent machine learning studies on source code. Tree-LSTM is proposed as a generalization of LSTMs for tree-structured data. However, there is a critical issue when applying it to ASTs: it cannot simultaneously handle nodes that have an arbitrary number of children and preserve the order of those children, and ASTs generally contain such nodes. To address this issue, we propose an extension of Tree-LSTM, which we call Multi-way Tree-LSTM, and apply it to source code summarization. As a result of computational experiments, our proposal achieved better results when compared with several state-of-the-art techniques. diff --git a/_publications/shirani2018evaluation.markdown b/_publications/shirani2018evaluation.markdown index 5de0092c..acff0ea8 100644 --- a/_publications/shirani2018evaluation.markdown +++ b/_publications/shirani2018evaluation.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Evaluation of Type Inference with Textual Cues" -authors: A. Shirani, A. P. Lopez-Monroy, F. Gonzalez, T. Solorio, M.A. Alipour +authors: Amirreza A. Shirani, A. Pastor Lopez-Monroy, Fabio Gonzalez, Thamar Solorio, Mohammad Amin Alipour conference: NLSE year: 2018 -bibkey: shirani2018evaluation additional_links: - {name: "PDF", url: "https://alipourm.github.io/pub/nl4se18.pdf"} tags: ["information extraction"] diff --git a/_publications/shrivastava2020on-the-fly.markdown b/_publications/shrivastava2020on-the-fly.markdown index aa544398..699fb839 100644 --- a/_publications/shrivastava2020on-the-fly.markdown +++ b/_publications/shrivastava2020on-the-fly.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "On-the-Fly Adaptation of Source Code Models using Meta-Learning" -authors: D. Shrivastava, H. Larochelle, D.
Tarlow +authors: Disha Shrivastava, Hugo Larochelle, Daniel Tarlow conference: year: 2020 -bibkey: shrivastava2020on-the-fly additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2003.11768"} - {name: "Code", url: "https://github.com/shrivastavadisha/meta_learn_source_code"} diff --git a/_publications/shrivastava2020repository.markdown b/_publications/shrivastava2020repository.markdown new file mode 100644 index 00000000..5af6a384 --- /dev/null +++ b/_publications/shrivastava2020repository.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Repository-Level Prompt Generation for Large Language Models of Code" +authors: Disha Shrivastava, Hugo Larochelle, Daniel Tarlow +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2206.12839"} +tags: ["Transformer", "code completion"] +--- +With the success of large language models (LLMs) of code and their use as code assistants (e.g. Codex used in GitHub Copilot), techniques for introducing domain-specific knowledge in the prompt design process become important. In this work, we propose a framework called Repo-Level Prompt Generator that learns to generate example-specific prompts using a set of rules. These rules take context from the entire repository, thereby incorporating both the structure of the repository and the context from other relevant files (e.g. imports, parent class files). Our technique doesn't require any access to the weights of the LLM, making it applicable in cases where we only have black-box access to the LLM. We conduct experiments on the task of single-line code-autocompletion using code repositories taken from Google Code archives. We demonstrate that an oracle constructed from our proposed rules gives up to 36% relative improvement over Codex, showing the quality of the rules. Further, we show that when we train a model to select the best rule, we can achieve significant performance gains over Codex. The code for our work can be found at: https://github.com/shrivastavadisha/repo_level_prompt_generation . diff --git a/_publications/shrivastava2023repofusion.markdown b/_publications/shrivastava2023repofusion.markdown new file mode 100644 index 00000000..e450ec90 --- /dev/null +++ b/_publications/shrivastava2023repofusion.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "RepoFusion: Training Code Models to Understand Your Repository" +authors: Disha Shrivastava, Denis Kocetkov, Harm de Vries, Dzmitry Bahdanau, Torsten Scholak +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2306.10998"} +tags: ["completion"] +--- +Despite the huge success of Large Language Models (LLMs) in coding assistants like GitHub Copilot, these models struggle to understand the context present in the repository (e.g., imports, parent classes, files with similar names, etc.), thereby producing inaccurate code completions. This effect is more pronounced when using these assistants for repositories that the model has not seen during training, such as proprietary software or work-in-progress code projects. Recent work has shown the promise of using context from the repository during inference. In this work, we extend this idea and propose RepoFusion, a framework to train models to incorporate relevant repository context. 
Experiments on single-line code completion show that our models trained with repository context significantly outperform much larger code models such as CodeGen-16B-multi (~73× larger) and closely match the performance of the ~70× larger StarCoderBase model that was trained with the Fill-in-the-Middle objective. We find these results to be a novel and compelling demonstration of the gains that training with repository context can bring. We carry out extensive ablation studies to investigate the impact of design choices such as context type, number of contexts, context length, and initialization within our framework. Lastly, we release Stack-Repo, a dataset of 200 Java repositories with permissive licenses and near-deduplicated files that are augmented with three types of repository contexts. Additionally, we are making available the code and trained checkpoints for our work. Our released resources can be found at https://huggingface.co/RepoFusion. diff --git a/_publications/shuai2020improving.markdown b/_publications/shuai2020improving.markdown new file mode 100644 index 00000000..cada5da0 --- /dev/null +++ b/_publications/shuai2020improving.markdown @@ -0,0 +1,14 @@ +--- +layout: publication +title: "Improving Code Search with Co-Attentive Representation Learning" +authors: Jianhang Shuai, Ling Xu, Chao Liu, Meng Yan, Xin Xia, Yan Lei +conference: ICPC +year: 2020 +additional_links: + - { name: "ACM", url: "https://dl.acm.org/doi/abs/10.1145/3387904.3389269" } +tags: ["search"] +--- + +Searching and reusing existing code from a large-scale codebase, e.g., GitHub, can help developers complete a programming task efficiently. Recently, Gu et al. proposed a deep learning-based model (i.e., DeepCS), which significantly outperformed prior models. The DeepCS embedded codebase and natural language queries into vectors by two LSTM (long short-term memory) models separately, and returned to developers the code with higher similarity to a code search query. However, such an embedding method learned two isolated representations for code and query but ignored their internal semantic correlations. As a result, the learned isolated representations of code and query may limit the effectiveness of code search. + +To address the aforementioned issue, we propose a co-attentive representation learning model, i.e., Co-Attentive Representation Learning Code Search-CNN (CARLCS-CNN). CARLCS-CNN learns interdependent representations for the embedded code and query with a co-attention mechanism. Generally, such a mechanism learns a correlation matrix between embedded code and query, and co-attends their semantic relationship via row/column-wise max-pooling. In this way, the semantic correlation between code and query can directly affect their individual representations. We evaluate the effectiveness of CARLCS-CNN on Gu et al.'s dataset with 10k queries. Experimental results show that the proposed CARLCS-CNN model significantly outperforms DeepCS by 26.72% in terms of MRR (mean reciprocal rank). Additionally, CARLCS-CNN is five times faster than DeepCS in model training and four times in testing. diff --git a/_publications/si2018learning.markdown b/_publications/si2018learning.markdown index 288f430a..99a01b33 100644 --- a/_publications/si2018learning.markdown +++ b/_publications/si2018learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning Loop Invariants for Program Verification" -authors: X. Si, H. Dai, M. Raghothaman, M. Naik, L.
Song -conference: NIPS +authors: Xujie Si, Hanjun Dai, Mukund Raghothaman, Mayur Naik, Le Song +conference: NeurIPS year: 2018 -bibkey: si2018learning additional_links: - {name: "Preprint", url: "https://www.cis.upenn.edu/~mhnaik/papers/nips18.pdf"} tags: ["program analysis", "verification"] diff --git a/_publications/silavong2022senatus.markdown b/_publications/silavong2022senatus.markdown new file mode 100644 index 00000000..40067f6f --- /dev/null +++ b/_publications/silavong2022senatus.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Senatus - A Fast and Accurate Code-to-Code Recommendation Engine" +authors: Fran Silavong, Sean Moran, Antonios Georgiadis, Rohan Saphal, Robert Otter +conference: MSR +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2111.04473"} +tags: ["code similarity", "search"] +--- +Machine learning on source code (MLOnCode) is a popular research field that has been driven by the availability of large-scale code repositories and the development of powerful probabilistic and deep learning models for mining source code. Code-to-code recommendation is a task in MLOnCode that aims to recommend relevant, diverse and concise code snippets that usefully extend the code currently being written by a developer in their development environment (IDE). Code-to-code recommendation engines hold the promise of increasing developer productivity by reducing context switching from the IDE and increasing code-reuse. Existing code-to-code recommendation engines do not scale gracefully to large codebases, exhibiting a linear growth in query time as the code repository increases in size. In addition, existing code-to-code recommendation engines fail to account for the global statistics of code repositories in the ranking function, such as the distribution of code snippet lengths, leading to sub-optimal retrieval results. We address both of these weaknesses with Senatus, a new code-to-code recommendation engine. At the core of Senatus is De-Skew LSH, a new locality-sensitive hashing (LSH) algorithm that indexes the data for fast (sub-linear time) retrieval while also counteracting the skewness in the snippet length distribution using novel abstract syntax tree-based feature scoring and selection algorithms. We evaluate Senatus and find the recommendations to be of higher quality than competing baselines, while achieving faster search. For example, on the CodeSearchNet dataset Senatus improves performance by 31.21% F1 and achieves 147.9x faster query time compared to Facebook Aroma. Senatus also outperforms standard MinHash LSH by 29.2% F1 with 51.02x faster query time. diff --git a/_publications/silva2023repairllama.markdown b/_publications/silva2023repairllama.markdown new file mode 100644 index 00000000..42df7795 --- /dev/null +++ b/_publications/silva2023repairllama.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "RepairLLaMA: Efficient Representations and Fine-Tuned Adapters for Program Repair" +authors: André Silva, Sen Fang, Martin Monperrus +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2312.15698"} +tags: ["repair"] +--- +Automated Program Repair (APR) has evolved significantly with the advent of Large Language Models (LLMs). Fine-tuning LLMs for program repair is a recent avenue of research, with many dimensions that have not yet been explored.
Existing work mostly fine-tunes LLMs with naive code representations and is fundamentally limited in its ability to fine-tune larger LLMs. To address this problem, we propose RepairLLaMA, a novel program repair approach that combines 1) code representations for APR and 2) the state-of-the-art parameter-efficient LLM fine-tuning technique called LoRA. This results in RepairLLaMA producing a highly effective 'program repair adapter' for fixing bugs with language models. Our experiments demonstrate the validity of both concepts. First, fine-tuning adapters with program-repair-specific code representations enables the model to use meaningful repair signals. Second, parameter-efficient fine-tuning helps fine-tuning to converge and contributes to the effectiveness of the repair adapter in fixing data points outside the fine-tuning data distribution. Overall, RepairLLaMA correctly fixes 125 Defects4J v2 and 82 HumanEval-Java bugs, outperforming all baselines. diff --git a/_publications/singh2016question.markdown b/_publications/singh2016question.markdown index f745c443..053a00ec 100644 --- a/_publications/singh2016question.markdown +++ b/_publications/singh2016question.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Question Independent Grading using Machine Learning: The Case of Computer Program Grading" -authors: G. Singh, S. Srikant, V. Aggarwal +authors: Gursimran Singh, Shashank Srikant, Varun Aggarwal conference: KDD year: 2016 -bibkey: singh2016question additional_links: - {name: "PDF", url: "https://dl.acm.org/citation.cfm?id=2939696"} - {name: "website", url: "http://research.aspiringminds.com/"} diff --git a/_publications/siow2019core.markdown b/_publications/siow2019core.markdown index 57b9e734..c3efdf28 100644 --- a/_publications/siow2019core.markdown +++ b/_publications/siow2019core.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "CORE: Automating Review Recommendation for Code Changes" -authors: J. Siow, C. Gao, L. Fan, S. Chen, Y. Liu +authors: JingKai Siow, Cuiyun Gao, Lingling Fan, Sen Chen, Yang Liu conference: SANER year: 2019 -bibkey: siow2019core additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1912.09652"} tags: ["review"] diff --git a/_publications/siow2022learning.markdown b/_publications/siow2022learning.markdown new file mode 100644 index 00000000..301f42dd --- /dev/null +++ b/_publications/siow2022learning.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "Learning Program Semantics with Code Representations: An Empirical Study" +authors: Jing Kai Siow, Shangqing Liu, Xiaofei Xie, Guozhu Meng, Yang Liu +conference: SANER +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2203.11790"} +tags: ["representation"] +--- +Program semantics learning is core and fundamental to various code intelligence tasks, e.g., vulnerability detection and clone detection. A considerable number of existing works propose diverse approaches to learn program semantics for different tasks, and these works have achieved state-of-the-art performance. However, currently, a comprehensive and systematic study on evaluating different program representation techniques across diverse tasks is still missing. + +From this starting point, in this paper, we conduct an empirical study to evaluate different program representation techniques.
Specifically, we categorize current mainstream code representation techniques into four categories, i.e., Feature-based, Sequence-based, Tree-based, and Graph-based program representation techniques, and evaluate their performance on three diverse and popular code intelligence tasks, i.e., Code Classification, Vulnerability Detection, and Clone Detection, on publicly released benchmarks. We further design three research questions (RQs) and conduct a comprehensive analysis to investigate the performance. Based on the extensive experimental results, we conclude that (1) The graph-based representation is superior to the other selected techniques across these tasks. (2) Compared with the node type information used in tree-based and graph-based representations, the node textual information is more critical to learning the program semantics. (3) Different tasks require task-specific semantics to achieve their highest performance; however, combining various program semantics from different dimensions, such as control dependency and data dependency, can still produce promising results. diff --git a/_publications/sivaraman2021mining.markdown b/_publications/sivaraman2021mining.markdown new file mode 100644 index 00000000..c2aacc0e --- /dev/null +++ b/_publications/sivaraman2021mining.markdown @@ -0,0 +1,15 @@ +--- +layout: publication +title: "Mining Idioms in the Wild" +authors: Aishwarya Sivaraman, Rui Abreu, Andrew Scott, Tobi Akomolede, Satish Chandra +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2107.06402"} +tags: ["pattern mining", "refactoring"] +--- +Existing code repositories contain numerous instances of code patterns that are idiomatic ways of accomplishing a particular programming task. Sometimes, the programming language in use supports specific operators or APIs that can express the same idiomatic imperative code much more succinctly. However, those code patterns linger in repositories because the developers may be unaware of the new APIs or have not gotten around to them. Detection of idiomatic code can also point to the need for new APIs. + +We share our experiences in mining idiomatic patterns from the Hack repo at Facebook. We found that existing techniques either cannot identify meaningful patterns from syntax trees or require test-suite-based dynamic analysis to incorporate semantic properties to mine useful patterns. The key insight of the approach proposed in this paper, Jezero, is that semantic idioms from a large codebase can be learned from canonicalized dataflow trees. We propose a scalable, lightweight static analysis-based approach to construct such a tree that is well suited to mining semantic idioms using nonparametric Bayesian methods. + +Our experiments with Jezero on Hack code show a clear advantage of adding canonicalized dataflow information to ASTs: Jezero was significantly more effective than a baseline that did not have the dataflow augmentation in being able to effectively find refactoring opportunities from unannotated legacy code.
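Jezero itself operates on canonicalized dataflow trees for Hack; as a rough, language-shifted illustration of the canonicalization step only (our sketch using Python's ast module, not anything from the paper), alpha-renaming variables makes structurally identical idioms compare equal:

```python
# Illustrative sketch (not Jezero): canonicalize variable names in a
# Python AST so that structurally identical idioms compare equal.
import ast

class Canonicalizer(ast.NodeTransformer):
    def __init__(self):
        self.names: dict[str, str] = {}

    def visit_Name(self, node: ast.Name) -> ast.Name:
        # Map each distinct variable to VAR_0, VAR_1, ... by first use.
        if node.id not in self.names:
            self.names[node.id] = f"VAR_{len(self.names)}"
        node.id = self.names[node.id]
        return node

def canonical_form(code: str) -> str:
    return ast.unparse(Canonicalizer().visit(ast.parse(code)))

# Both snippets reduce to the same canonical idiom:
print(canonical_form("total = total + item"))  # VAR_0 = VAR_0 + VAR_1
print(canonical_form("acc = acc + x"))         # VAR_0 = VAR_0 + VAR_1
```

Extending such canonicalization from names to dataflow (so that unrelated statements between uses do not break a pattern) is the direction the paper's dataflow trees take.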
diff --git a/_publications/souza2023lexecutor.markdown b/_publications/souza2023lexecutor.markdown new file mode 100644 index 00000000..1ad8eb1b --- /dev/null +++ b/_publications/souza2023lexecutor.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "LExecutor: Learning-Guided Execution" +authors: Beatriz Souza, Michael Pradel +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2302.02343"} + - {name: "Code", url: "https://github.com/michaelpradel/LExecutor"} +tags: ["execution"] +--- +Executing code is essential for various program analysis tasks, e.g., to detect bugs that manifest through exceptions or to obtain execution traces for further dynamic analysis. However, executing an arbitrary piece of code is often difficult in practice, e.g., because of missing variable definitions, missing user inputs, and missing third-party dependencies. This paper presents LExecutor, a learning-guided approach for executing arbitrary code snippets in an underconstrained way. The key idea is to let a neural model predict missing values that otherwise would cause the program to get stuck, and to inject these values into the execution. For example, LExecutor injects likely values for otherwise undefined variables and likely return values of calls to otherwise missing functions. We evaluate the approach on Python code from popular open-source projects and on code snippets extracted from Stack Overflow. The neural model predicts realistic values with an accuracy between 80.1% and 94.2%, allowing LExecutor to closely mimic real executions. As a result, the approach successfully executes significantly more code than any available technique, such as simply executing the code as-is. For example, executing the open-source code snippets as-is covers only 4.1% of all lines, because the code crashes early on, whereas LExecutor achieves a coverage of 50.1%. + diff --git a/_publications/spirin2021psiminer.markdown b/_publications/spirin2021psiminer.markdown new file mode 100644 index 00000000..b64df54f --- /dev/null +++ b/_publications/spirin2021psiminer.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "PSIMiner: A Tool for Mining Rich Abstract Syntax Trees from Code" +authors: Egor Spirin, Egor Bogomolov, Vladimir Kovalenko, Timofey Bryksin +conference: MSR +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2103.12778"} + - {name: "website", url: "https://research.jetbrains.org/groups/ml_methods/publications/"} + - {name: "code", url: "https://github.com/JetBrains-Research/psiminer"} +tags: ["tool"] +--- +The application of machine learning algorithms to source code has grown in recent years. Since these algorithms are quite sensitive to input data, it is not surprising that researchers experiment with input representations. Nowadays, a popular starting point to represent code is abstract syntax trees (ASTs). Abstract syntax trees have been used for a long time in various software engineering domains, and in particular in IDEs. The API of modern IDEs allows one to manipulate and traverse ASTs, resolve references between code elements, etc. Such algorithms can enrich ASTs with new data and therefore may be useful in ML-based code analysis. In this work, we present PSIMiner, a tool for processing PSI trees from the IntelliJ Platform.
PSI trees contain code syntax trees as well as functions to work with them, and therefore can be used to enrich code representation using static analysis algorithms of modern IDEs. To showcase this idea, we use our tool to infer types of identifiers in Java ASTs and extend the code2seq model for the method name prediction problem. diff --git a/_publications/srikant2014system.markdown b/_publications/srikant2014system.markdown index c51f06e4..15dd7d08 100644 --- a/_publications/srikant2014system.markdown +++ b/_publications/srikant2014system.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "A system to grade computer programming skills using machine learning" -authors: S. Srikant, V. Aggarwal +authors: Shashank Srikant, Varun Aggarwal conference: KDD year: 2014 -bibkey: srikant2014system additional_links: - {name: "PDF", url: "https://dl.acm.org/citation.cfm?id=2623377"} - {name: "website", url: "http://research.aspiringminds.com/"} diff --git a/_publications/sun2019grammar.markdown b/_publications/sun2019grammar.markdown index bd4a669a..48f61ddc 100644 --- a/_publications/sun2019grammar.markdown +++ b/_publications/sun2019grammar.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "A Grammar-Based Structural CNN Decoder for Code Generation" -authors: Z. Sun, Q. Zhu, L. Mou, Y. Xiong, G. Li, L. Zhang +authors: Zeyu Sun, Qihao Zhu, Lili Mou, Yingfei Xiong, Ge Li, Lu Zhang conference: AAAI year: 2019 -bibkey: sun2019grammar -tags: ["generation", "grammar"] +tags: ["code generation", "grammar"] --- Code generation maps a program description to executable source code in a programming language. Existing approaches diff --git a/_publications/sun2020pscs.markdown b/_publications/sun2020pscs.markdown new file mode 100644 index 00000000..87048135 --- /dev/null +++ b/_publications/sun2020pscs.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "PSCS: A Path-based Neural Model for Semantic Code Search" +authors: Zhensu Sun, Yan Liu, Chen Yang, Yu Qian +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2008.03042"} +tags: ["grammar", "search"] +--- +To obtain code snippets for reuse, programmers prefer to search for related documents, e.g., blogs or Q&A, instead of code itself. The major reason is the semantic diversity and mismatch between queries and code snippets. Deep learning models have been proposed to address this challenge. Compared with approaches using information retrieval techniques, deep learning models do not suffer from the information loss caused by refining user intention into keywords. However, the performance of previous works is not satisfactory because they ignore the importance of code structure. When the semantics of code (e.g., identifier names, APIs) are ambiguous, code structure may be the only feature for the model to utilize. In that case, previous works relearn the structural information from lexical tokens of code, which is extremely difficult for a model without any domain knowledge. In this work, we propose PSCS, a path-based neural model for semantic code search. Our model encodes both the semantics and structures of code represented by AST paths. We train and evaluate our model over 330k and 19k query-function pairs, respectively. The evaluation results demonstrate that PSCS achieves a SuccessRate of 47.6% and a Mean Reciprocal Rank (MRR) of 30.4% when considering the top-10 results with a match.
The proposed approach significantly outperforms both DeepCS, the first approach that applies deep learning to the code search task, and CARLCS, a state-of-the-art approach that introduces a co-attentive representation learning model on the basis of DeepCS. The importance of code structure is demonstrated with an ablation study on code features, which enlightens model design for further studies. diff --git a/_publications/svyatkovskiy2019pythia.markdown b/_publications/svyatkovskiy2019pythia.markdown index 839261fc..0a891339 100644 --- a/_publications/svyatkovskiy2019pythia.markdown +++ b/_publications/svyatkovskiy2019pythia.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Pythia: AI-assisted Code Completion System" -authors: A. Svyatkovskiy, Y. Zhao, S. Fu, Neel Sundaresan +authors: Alexey Svyatkovskiy, Ying Zhao, Shengyu Fu, Neel Sundaresan conference: KDD year: 2019 -bibkey: svyatkovskiy2019pythia tags: ["autocomplete", "language model"] --- diff --git a/_publications/svyatkovskiy2020fast.markdown b/_publications/svyatkovskiy2020fast.markdown index eabfd40b..43b2ba38 100644 --- a/_publications/svyatkovskiy2020fast.markdown +++ b/_publications/svyatkovskiy2020fast.markdown @@ -4,10 +4,9 @@ title: "Fast and Memory-Efficient Neural Code Completion" authors: Alexey Svyatkovskiy, Sebastian Lee, Anna Hadjitofi, Maik Riechert, Juliana Franco, Miltiadis Allamanis conference: year: 2020 -bibkey: svyatkovskiy2020fast additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2004.13651"} -tags: ["autocompletion"] +tags: ["autocomplete"] --- Code completion is one of the most widely used features of modern integrated development environments (IDEs). Deep learning has recently made significant progress in the statistical prediction of source code. However, state-of-the-art neural network models consume prohibitively large amounts of memory, causing computational burden to the development environment, especially when deployed in lightweight client devices. diff --git a/_publications/svyatkovskiy2020intellicode.markdown b/_publications/svyatkovskiy2020intellicode.markdown index 21130ed9..5428fdcd 100644 --- a/_publications/svyatkovskiy2020intellicode.markdown +++ b/_publications/svyatkovskiy2020intellicode.markdown @@ -2,14 +2,11 @@ layout: publication title: "IntelliCode Compose: Code Generation Using Transformer" authors: Alexey Svyatkovskiy, Shao Kun Deng, Shengyu Fu, Neel Sundaresan -conference: year: 2020 -bibkey: svyatkovskiy2020intellicode additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2005.08025"} -tags: ["autocompletion"] +tags: ["autocomplete", "code generation", "synthesis", "language model", "pretraining"] --- In software development through integrated development environments (IDEs), code completion is one of the most widely used features. Nevertheless, the majority of integrated development environments only support completion of methods and APIs, or arguments. - In this paper, we introduce IntelliCode Compose, a general-purpose multilingual code completion tool which is capable of predicting sequences of code tokens of arbitrary types, generating up to entire lines of syntactically correct code. It leverages a state-of-the-art generative transformer model trained on 1.2 billion lines of source code in Python, C#, JavaScript and TypeScript programming languages. IntelliCode Compose is deployed as a cloud-based web service.
It makes use of client-side tree-based caching, efficient parallel implementation of the beam search decoder, and compute graph optimizations to meet edit-time completion suggestion requirements in the Visual Studio Code IDE and Azure Notebook. -Our best model yields an average edit similarity of 86.7% and a perplexity of 1.82 for Python programming language. +Our best model yields an average edit similarity of 86.7% and a perplexity of 1.82 for Python programming language. diff --git a/_publications/szafraniec2022code.markdown b/_publications/szafraniec2022code.markdown new file mode 100644 index 00000000..2f5c4072 --- /dev/null +++ b/_publications/szafraniec2022code.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Code Translation with Compiler Representations" +authors: Marc Szafraniec, Baptiste Roziere, Hugh Leather, Francois Charton, Patrick Labatut, Gabriel Synnaeve +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2207.03578"} +tags: ["Transformer", "migration", "decompilation"] +--- +In this paper, we leverage low-level compiler intermediate representations (IR) to improve code translation. Traditional transpilers rely on syntactic information and handcrafted rules, which limits their applicability and produces unnatural-looking code. Applying neural machine translation (NMT) approaches to code has successfully broadened the set of programs on which one can get a natural-looking translation. However, they treat the code as sequences of text tokens, and still do not differentiate well enough between similar pieces of code which have different semantics in different languages. The consequence is low quality translation, reducing the practicality of NMT, and stressing the need for an approach significantly increasing its accuracy. Here we propose to augment code translation with IRs, specifically LLVM IR, with results on the C++, Java, Rust, and Go languages. Our method improves upon the state of the art for unsupervised code translation, increasing the number of correct translations by 11% on average, and up to 79% for the Java - Rust pair. We extend previous test sets for code translation, by adding hundreds of Go and Rust functions. Additionally, we train models with high performance on the problem of IR decompilation, generating programming source code from IR, and study using IRs as intermediary pivot for translation. diff --git a/_publications/tabassum2020code.markdown b/_publications/tabassum2020code.markdown index 4326e230..2e88a6d4 100644 --- a/_publications/tabassum2020code.markdown +++ b/_publications/tabassum2020code.markdown @@ -4,9 +4,9 @@ title: "Code and Named Entity Recognition in StackOverflow" authors: Jeniya Tabassum, Mounica Maddela, Wei Xu, Alan Ritter conference: ACL year: 2020 -bibkey: tabassum2020code additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2005.01634"} + - {name: "Code", url: "https://github.com/jeniyat/StackOverflowNER/"} tags: ["dataset", "information extraction"] --- -There is an increasing interest in studying natural language and computer code together, as large corpora of programming texts become readily available on the Internet. For example, StackOverflow currently has over 15 million programming related questions written by 8.5 million users. Meanwhile, there is still a lack of fundamental NLP techniques for identifying code tokens or software-related named entities that appear within natural language sentences. 
In this paper, we introduce a new named entity recognition (NER) corpus for the computer programming domain, consisting of 15,372 sentences annotated with 20 fine-grained entity types. We also present the SoftNER model that combines contextual information with domain specific knowledge using an attention network. The code token recognizer combined with an entity segmentation model we proposed, consistently improves the performance of the named entity tagger. Our proposed SoftNER tagger outperforms the BiLSTM-CRF model with an absolute increase of +9.73 F-1 score on StackOverflow data. +There is an increasing interest in studying natural language and computer code together, as large corpora of programming texts become readily available on the Internet. For example, StackOverflow currently has over 15 million programming related questions written by 8.5 million users. Meanwhile, there is still a lack of fundamental NLP techniques for identifying code tokens or software-related named entities that appear within natural language sentences. In this paper, we introduce a new named entity recognition (NER) corpus for the computer programming domain, consisting of 15,372 sentences annotated with 20 fine-grained entity types. We trained in-domain BERT representations (BERTOverflow) on 152 million sentences from StackOverflow, which lead to an absolute increase of +10 F-1 score over off-the-shelf BERT. We also present the SoftNER model which achieves an overall 79.10 F1 score for code and named entity recognition on StackOverflow data. Our SoftNER model incorporates a context-independent code token classifier with corpus-level features to improve the BERT-based tagging model. diff --git a/_publications/tan2024llm4decompile.markdown b/_publications/tan2024llm4decompile.markdown new file mode 100644 index 00000000..8ea0b686 --- /dev/null +++ b/_publications/tan2024llm4decompile.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "LLM4Decompile: Decompiling Binary Code with Large Language Models" +authors: Hanzhuo Tan, Qi Luo, Jing Li, Yuqun Zhang +conference: +year: 2024 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2403.05286"} + - {name: "code", url: "https://github.com/albertan017/LLM4Decompile"} +tags: ["decompilation", "translation", "evaluation", "large language models", "LLM"] +--- +Decompilation aims to restore compiled code to human-readable source code, but struggles with details like names and structure. Large language models (LLMs) show promise for programming tasks, motivating their application to decompilation. However, there does not exist any open-source LLM for decompilation. Moreover, existing decompilation evaluation systems mainly consider token-level accuracy and largely ignore code executability, which is the most important feature of any program. Therefore, we release the first open-access decompilation LLMs ranging from 1B to 33B pre-trained on 4 billion tokens of C source code and the corresponding assembly code. The open-source LLMs can serve as baselines for further development in the field. To ensure practical program evaluation, we introduce Decompile-Eval, the first dataset that considers re-compilability and re-executability for decompilation. The benchmark emphasizes the importance of evaluating the decompilation model from the perspective of program semantics. 
Experiments indicate that LLM4Decompile can accurately decompile 21% of the assembly code, a 50% improvement over GPT-4. Our code, dataset, and models are released at [github.com/albertan017/LLM4Decompile](https://github.com/albertan017/LLM4Decompile) diff --git a/_publications/tarlow2019learning.markdown b/_publications/tarlow2019learning.markdown index 802083fe..bb4b956b 100644 --- a/_publications/tarlow2019learning.markdown +++ b/_publications/tarlow2019learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning to Fix Build Errors with Graph2Diff Neural Networks" -authors: D. Tarlow, S. Moitra, A. Rcie, Z. Chen, P.A. Manzagol, C. Sutton, E. Aftandilian +authors: Daniel Tarlow, Subhodeep Moitra, Andrew Rice, Zimin Chen, Pierre-Antoine Manzagol, Charles Sutton, Edward Aftandilian conference: year: 2019 -bibkey: tarlow2019learning additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1911.01205"} - {name: "preprint", url: "http://www.cs.toronto.edu/~dtarlow/papers/graph2diff_preprint.pdf"} diff --git a/_publications/template b/_publications/template index c2e362fd..8e8f760a 100644 --- a/_publications/template +++ b/_publications/template @@ -1,13 +1,12 @@ --- layout: publication title: "Add title here" -authors: F. LastName, F. LastName -conference: Optional +authors: FirstName LastName, FirstName LastName +conference: Optional # OR journal year: 2000 -bibkey: sameAsFilename additional_links: - - {name: "ArXiV", url: "https://arxiv.org/abs/xxxx.xxxxxx"} - - {name: "Dataset", url: "https://blah/blah"} + - {name: "ArXiV", url: "https://arxiv.org/abs/xxxx.xxxxxx"} + - {name: "Dataset", url: "https://blah/blah"} tags: ["dataset"] --- Abstract here diff --git a/_publications/theeten2019import2vec.markdown b/_publications/theeten2019import2vec.markdown index 6dc85d41..11b38e26 100644 --- a/_publications/theeten2019import2vec.markdown +++ b/_publications/theeten2019import2vec.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Import2vec - Learning Embeddings for Software Libraries" -authors: B. Theeten, F. Vandeputte, T.Van Cutsem +authors: Bart Theeten, Frederik Vandeputte, Tom Van Cutsem conference: MSR year: 2019 -bibkey: theeten2019import2vec tags: ["representation"] --- We consider the problem of developing suitable learning representations (embeddings) for library packages that capture semantic similarity among libraries. Such representations are known to improve the performance of downstream learning tasks (e.g. classification) or applications such as contextual search and analogical reasoning. diff --git a/_publications/tian2020evaluating.markdown b/_publications/tian2020evaluating.markdown new file mode 100644 index 00000000..a9d418f8 --- /dev/null +++ b/_publications/tian2020evaluating.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Evaluating Representation Learning of Code Changes for Predicting Patch Correctness in Program Repair" +authors: Haoye Tian, Kui Liu, Abdoul Kader Kaboreé, Anil Koyuncu, Li Li, Jacques Klein, Tegawendé F. Bissyandé +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2008.02944"} +tags: ["repair", "Transformer"] +--- +A large body of the automated program repair literature develops approaches where patches are generated to be validated against an oracle (e.g., a test suite).
Because such an oracle can be imperfect, the generated patches, although validated by the oracle, may actually be incorrect. While the state of the art explores research directions that require dynamic information or rely on manually-crafted heuristics, we study the benefit of learning code representations to derive deep features that may encode the properties of patch correctness. Our work mainly investigates different representation learning approaches for code changes to derive embeddings that are amenable to similarity computations. We report on findings based on embeddings produced by pre-trained and re-trained neural networks. Experimental results demonstrate the potential of embeddings to empower learning algorithms in reasoning about patch correctness: a machine learning predictor with BERT transformer-based embeddings associated with logistic regression yielded an AUC value of about 0.8 in predicting patch correctness on a deduplicated dataset of 1000 labeled patches. Our study shows that learned representations can lead to reasonable performance when comparing against the state-of-the-art, PATCH-SIM, which relies on dynamic information. These representations may further be complementary to features that were carefully (manually) engineered in the literature. diff --git a/_publications/tian2024debugbench.markdown b/_publications/tian2024debugbench.markdown new file mode 100644 index 00000000..10dd79a9 --- /dev/null +++ b/_publications/tian2024debugbench.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "DebugBench: Evaluating Debugging Capability of Large Language Models" +authors: Runchu Tian, Yining Ye, Yujia Qin, Xin Cong, Yankai Lin, Yinxu Pan, Yesai Wu, Zhiyuan Liu, Maosong Sun +conference: +year: 2024 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2401.04621"} +tags: ["repair"] +--- +Large Language Models (LLMs) have demonstrated exceptional coding capability. However, as another critical component of programming proficiency, the debugging capability of LLMs remains relatively unexplored. Previous evaluations of LLMs' debugging ability are significantly limited by the risk of data leakage, the scale of the dataset, and the variety of tested bugs. To overcome these deficiencies, we introduce `DebugBench', an LLM debugging benchmark consisting of 4,253 instances. It covers four major bug categories and 18 minor types in C++, Java, and Python. To construct DebugBench, we collect code snippets from the LeetCode community, implant bugs into source data with GPT-4, and ensure rigorous quality checks. We evaluate two commercial and three open-source models in a zero-shot scenario. We find that (1) while closed-source models like GPT-4 exhibit inferior debugging performance compared to humans, open-source models such as Code Llama fail to attain any pass rate scores; (2) the complexity of debugging notably fluctuates depending on the bug category; (3) incorporating runtime feedback has a clear impact on debugging performance, which is not always helpful. As an extension, we also compare LLM debugging and code generation, revealing a strong correlation between them for closed-source models. These findings will benefit the development of LLMs in debugging.
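To see what a zero-shot debugging evaluation of this shape involves, here is a minimal, hypothetical harness (our sketch; field names such as buggy_code and the ask_model callable are assumptions, not DebugBench's actual interface): a model proposes a fix for a buggy snippet, and the fix is scored by hidden tests.

```python
# Hypothetical sketch of a pass-rate harness for a DebugBench-style
# instance: {"buggy_code": ..., "tests": [(input_args, expected), ...]}.
def run_candidate(src: str, tests: list[tuple[tuple, object]]) -> bool:
    scope: dict = {}
    try:
        # In practice, run untrusted model output in a sandbox.
        exec(src, scope)  # candidate must define solve()
        return all(scope["solve"](*args) == want for args, want in tests)
    except Exception:
        return False

def pass_rate(instances: list[dict], ask_model) -> float:
    # `ask_model` stands in for any LLM call that returns repaired code.
    fixed = sum(
        run_candidate(ask_model(inst["buggy_code"]), inst["tests"])
        for inst in instances
    )
    return fixed / len(instances)

example = {
    "buggy_code": "def solve(x):\n    return x - 1  # bug: should add",
    "tests": [((1,), 2), ((5,), 6)],
}
print(pass_rate([example], ask_model=lambda code: code.replace("- 1", "+ 1")))
```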
diff --git a/_publications/tomczak2019simulating.markdown b/_publications/tomczak2019simulating.markdown index a1ad0b29..0f5c90f8 100644 --- a/_publications/tomczak2019simulating.markdown +++ b/_publications/tomczak2019simulating.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Simulating Execution Time of Tensor Programs using Graph Neural Networks" -authors: J. M. Tomczak, R. Lepert, A. Wiggers +authors: Jakub M. Tomczak, Romain Lepert, Auke Wiggers conference: Representation Learning on Graphs and Manifolds at ICLR year: 2019 -bibkey: tomczak2019simulating additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1904.11876"} tags: ["GNN"] diff --git a/_publications/tran2019recovering.markdown b/_publications/tran2019recovering.markdown index 5511569e..0366fcc5 100644 --- a/_publications/tran2019recovering.markdown +++ b/_publications/tran2019recovering.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Recovering Variable Names for Minified Code with Usage Contexts" -authors: H. Tran, N. Tran, S. Nguyen, H. Nguyen, T. Nguyen +authors: Hieu Tran, Ngoc Tran, Son Nguyen, Hoan Nguyen, Tien N. Nguyen conference: ICSE year: 2019 -bibkey: tran2019recovering tags: ["naming", "deobfuscation"] --- In modern Web technology, JavaScript (JS) code plays an important role. To avoid the exposure of original source code, the variable names in JS code deployed in the wild are often replaced by short, meaningless names, thus making the code extremely difficult to manually understand and analyze. This paper presents JSNeat, an information retrieval (IR)-based approach to recover the variable names in minified JS code. JSNeat follows a data-driven approach to recover names by searching for them in a large corpus of open-source JS code. We use three types of contexts to match a variable in given minified code against the corpus including the context of properties and roles of the variable, the context of that variable and relations with other variables under recovery, and the context of the task of the function to which the variable contributes. We performed several empirical experiments to evaluate JSNeat on the dataset of more than 322K JS files with 1M functions, and 3.5M variables with 176K unique variable names. We found that JSNeat achieves a high accuracy of 69.1%, which represents relative improvements of 66.1% and 43% over two state-of-the-art approaches JSNice and JSNaughty, respectively. Recovering names for a file or for a variable with JSNeat is twice as fast as with JSNice and 4x as fast as with JSNaughty. \ No newline at end of file diff --git a/_publications/tu2014localness.markdown b/_publications/tu2014localness.markdown index 4cbfae31..af7dbda6 100644 --- a/_publications/tu2014localness.markdown +++ b/_publications/tu2014localness.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "On the Localness of Software" -authors: Z. Tu, Z. Su, P. Devanbu +authors: Zhaopeng Tu, Zhendong Su, Premkumar Devanbu conference: FSE year: 2014 -bibkey: tu2014localness tags: ["language model"] --- The n-gram language model, which has its roots in statistical natural diff --git a/_publications/tufano2018deep.markdown b/_publications/tufano2018deep.markdown index 7dbcc391..ee69587b 100644 --- a/_publications/tufano2018deep.markdown +++ b/_publications/tufano2018deep.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Deep Learning Similarities from Different Representations of Source Code" -authors: M. Tufano, C. Watson, G. Bavota, M. Di Penta, M.
White, D. Poshyvanyk +authors: Michele Tufano, Cody Watson, Gabriele Bavota, Massimiliano Di Penta, Martin White, Denys Poshyvanyk conference: MSR year: 2018 -bibkey: tufano2018deep tags: ["representation", "clone"] --- Assessing the similarity between code components plays a pivotal diff --git a/_publications/tufano2018empirical.markdown b/_publications/tufano2018empirical.markdown index 631816e5..a8fd9cdc 100644 --- a/_publications/tufano2018empirical.markdown +++ b/_publications/tufano2018empirical.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "An Empirical Study on Learning Bug-Fixing Patches in the Wild via Neural Machine Translation" -authors: M. Tufano, C. Watson, G. Bavota, M. Di Penta, M. White, D. Poshyvanyk +authors: Michele Tufano, Cody Watson, Gabriele Bavota, Massimiliano Di Penta, Martin White, Denys Poshyvanyk conference: year: 2018 -bibkey: tufano2018empirical tags: ["repair"] --- Millions of open-source projects with numerous bug fixes are available in code repositories. This proliferation of software development histories can be leveraged to learn how to fix common programming bugs. To explore such a potential, we perform an empirical study to assess the feasibility of using Neural Machine Translation techniques for learning bug-fixing patches for real defects. First, we mine millions of bug-fixes from the change histories of projects hosted on GitHub, in order to extract meaningful examples of such bug-fixes. Next, we abstract the buggy and corresponding fixed code, and use them to train an Encoder-Decoder model able to translate buggy code into its fixed version. In our empirical investigation we found that such a model is able to fix thousands of unique buggy methods in the wild. Overall, this model is capable of predicting fixed patches generated by developers in 9-50% of the cases, depending on the number of candidate patches we allow it to generate. Also, the model is able to emulate a variety of different Abstract Syntax Tree operations and generate candidate patches in a split second. diff --git a/_publications/tufano2018learning.markdown b/_publications/tufano2018learning.markdown index f6cceb60..5f0761df 100644 --- a/_publications/tufano2018learning.markdown +++ b/_publications/tufano2018learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning How to Mutate Source Code from Bug-Fixes" -authors: M. Tufano, C. Watson, G. Bavota, M. Di Penta, M. White, D. Poshyvanyk +authors: Michele Tufano, Cody Watson, Gabriele Bavota, Massimiliano Di Penta, Martin White, Denys Poshyvanyk conference: year: 2018 -bibkey: tufano2018learning tags: ["repair", "edit"] --- Mutation testing has been widely accepted as an approach to guide test case generation or to assess the effectiveness of test suites. Empirical studies have shown that mutants are representative of real faults; yet they also indicated a clear need for better, possibly customized, mutation operators and strategies. While some recent papers have tried to devise domain-specific or general purpose mutator operators by manually analyzing real faults, such an activity is effort- (and error-) prone and does not deal with an important practical question as to how to really mutate a given source code element. We propose a novel approach to automatically learn mutants from faults in real programs. First, our approach processes bug fixing changes using fine-grained differencing, code abstraction, and change clustering. Then, it learns mutation models using a deep learning strategy. 
We have trained and evaluated our technique on a set of ~787k bugs mined from GitHub. Starting from code fixed by developers in the context of a bug-fix, our empirical evaluation showed that our models are able to predict mutants that resemble original fixed bugs in between 9% and 45% of the cases (depending on the model). Moreover, over 98% of the automatically generated mutants are lexically and syntactically correct. diff --git a/_publications/tufano2019learning.markdown b/_publications/tufano2019learning.markdown index c6dba557..79f7c352 100644 --- a/_publications/tufano2019learning.markdown +++ b/_publications/tufano2019learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "On Learning Meaningful Code Changes via Neural Machine Translation" -authors: M. Tufano, C. Watson, G. Bavota, M. Di Penta, M. White, D. Poshyvanyk +authors: Michele Tufano, Cody Watson, Gabriele Bavota, Massimiliano Di Penta, Martin White, Denys Poshyvanyk conference: ICSE year: 2019 -bibkey: tufano2019learning tags: ["repair", "edit"] --- Recent years have seen the rise of Deep Learning (DL) techniques applied to source code. Researchers have exploited DL to automate several development and maintenance tasks, such as writing commit messages, generating comments and detecting vulnerabilities among others. One of the long-lasting dreams of applying DL to code is the possibility to automate non-trivial coding activities. While some steps in this direction have been taken (e.g., learning how to fix bugs), there is still a lack of empirical evidence on the types of code changes that can be learned and automatically applied by DL. Our goal is to make this first step by quantitatively and qualitatively investigating the ability of a Neural Machine Translation (NMT) model to learn how to automatically apply code changes implemented by developers during pull requests. We train and experiment with the NMT model on a set of 236k pairs of code components before and after the implementation of the changes provided in the pull requests. We show that, when applied in a narrow enough context (i.e., small/medium-sized pairs of methods before/after the pull request changes), NMT can automatically replicate the changes implemented by developers during pull requests in up to 36% of the cases. Moreover, our qualitative analysis shows that the model is capable of learning and replicating a wide variety of meaningful code changes, especially refactorings and bug-fixing activities. Our results pave the way for novel research in the area of DL on code, such as the automatic learning and application of refactorings. diff --git a/_publications/tufano2020generating.markdown b/_publications/tufano2020generating.markdown new file mode 100644 index 00000000..2d73625a --- /dev/null +++ b/_publications/tufano2020generating.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Generating Accurate Assert Statements for Unit Test Cases using Pretrained Transformers" +authors: Michele Tufano, Dawn Drain, Alexey Svyatkovskiy, Shao Kun Deng, Neel Sundaresan +conference: ICSE +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2009.05634"} +tags: ["code generation", "synthesis", "test generation"] +--- +Unit testing represents the foundational basis of the software testing pyramid, beneath integration and end-to-end testing. Automated software testing researchers have proposed a variety of techniques to assist developers in this time-consuming task.
In this paper we present an approach to support developers in writing unit test cases by generating accurate and useful assert statements. Our approach is based on a state-of-the-art transformer model initially pretrained on an English textual corpus. This semantically rich model is then trained in a semi-supervised fashion on a large corpus of source code. Finally, we finetune this model on the task of generating assert statements for unit tests. The resulting model is able to generate accurate assert statements for a given method under test. In our empirical evaluation, the model was able to predict the exact assert statements written by developers in 62% of the cases on the first attempt. The results show 80% relative improvement for top-1 accuracy over the previous RNN-based approach in the literature. We also show the substantial impact of the pretraining process on the performance of our model, and compare it with the assert auto-completion task. Finally, we demonstrate how our approach can be used to augment EvoSuite test cases, with additional asserts leading to improved test coverage. diff --git a/_publications/tufano2020unit.markdown b/_publications/tufano2020unit.markdown new file mode 100644 index 00000000..fc3fd110 --- /dev/null +++ b/_publications/tufano2020unit.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Unit Test Case Generation with Transformers" +authors: Michele Tufano, Dawn Drain, Alexey Svyatkovskiy, Shao Kun Deng, Neel Sundaresan +conference: ICSE +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2009.05617"} +tags: ["code generation", "synthesis", "test generation"] +--- +Automated Unit Test Case generation has been the focus of extensive literature within the research community. Existing approaches are usually guided by the test coverage criteria, generating synthetic test cases that are often difficult to read or understand for developers. In this paper we propose AthenaTest, an approach that aims at generating unit test cases by learning from real-world, developer-written test cases. Our approach relies on a state-of-the-art sequence-to-sequence transformer model which is able to write useful test cases for a given method under test (i.e., focal method). We also introduce methods2test - the largest publicly available supervised parallel corpus of unit test case methods and corresponding focal methods in Java, which comprises 630k test cases mined from 70k open-source repositories hosted on GitHub. We use this dataset to train a transformer model to translate focal methods into the corresponding test cases. We evaluate the ability of our model in generating test cases using natural language processing as well as code-specific criteria. First, we assess the quality of the translation compared to the target test case, then we analyze properties of the test case such as syntactic correctness and number and variety of testing APIs (e.g., asserts). We execute the test cases, collect test coverage information, and compare them with test cases generated by EvoSuite and GPT-3. Finally, we survey professional developers on their preference in terms of readability, understandability, and testing effectiveness of the generated test cases.
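The translate-focal-method-to-test-case framing can be sketched with any off-the-shelf encoder-decoder checkpoint. The snippet below uses a public CodeT5 checkpoint purely as a placeholder: it is not the authors' released model, and without fine-tuning on focal-method/test pairs (e.g., methods2test) it will not produce meaningful tests.

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Placeholder checkpoint: a generic pretrained seq2seq code model.
checkpoint = "Salesforce/codet5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Toy focal method (the method under test).
focal_method = "public static int add(int a, int b) { return a + b; }"

# Encode the focal method and decode a candidate unit test.
inputs = tokenizer(focal_method, return_tensors="pt", truncation=True)
outputs = model.generate(**inputs, max_new_tokens=128, num_beams=5)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```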
diff --git a/_publications/vaithilingam2022expectation.markdown b/_publications/vaithilingam2022expectation.markdown new file mode 100644 index 00000000..4852cb5e --- /dev/null +++ b/_publications/vaithilingam2022expectation.markdown @@ -0,0 +1,24 @@ +--- +layout: publication +title: "Expectation vs. Experience: Evaluating the Usability of Code Generation Tools Powered by Large Language Models" +authors: Priyan Vaithilingam, Tianyi Zhang, Elena Glassman +conference: CHI +year: 2022 +additional_links: + - {name: "Preprint", url: "https://tianyi-zhang.github.io/files/chi2022-lbw-copilot.pdf"} +tags: ["human evaluation", "code generation", "language model"] +--- +Recent advances in Large Language Models (LLM) have made automatic code generation possible for real-world programming tasks in +general-purpose programming languages such as Python. However, +there are few human studies on the usability of these tools and how +they fit the programming workflow. In this work, we conducted +a within-subjects user study with 24 participants to understand +how programmers use and perceive Copilot, a LLM-based code +generation tool. We found that, while Copilot did not necessarily +improve the task completion time or success rate, most participants preferred to use Copilot in daily programming tasks, since +Copilot often provided a useful starting point and saved the effort +of searching online. However, participants did face difficulties in +understanding, editing, and debugging code snippets generated +by Copilot, which significantly hindered their task-solving effectiveness. Finally, we highlighted several promising directions for +improving the design of Copilot based on our observations and +participants’ feedback. diff --git a/_publications/vasic2019neural.markdown b/_publications/vasic2019neural.markdown index ca52e1a1..69424536 100644 --- a/_publications/vasic2019neural.markdown +++ b/_publications/vasic2019neural.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Neural Program Repair by Jointly Learning to Localize and Repair" -authors: M. Vasic, A. Kanade, P. Maniatis, D. Bieber, R. Singh +authors: Marko Vasic, Aditya Kanade, Petros Maniatis, David Bieber, Rishabh Singh conference: ICLR year: 2019 -bibkey: vasic2019neural tags: ["repair", "program analysis", "variable misuse"] --- Due to its potential to improve programmer productivity and software quality, automated program repair has been an active topic of research. Newer techniques harness neural networks to learn directly from examples of buggy programs and their fixes. In this work, we consider a recently identified class of bugs called variable-misuse bugs. The state-of-the-art solution for variable misuse enumerates potential fixes for all possible bug locations in a program, before selecting the best prediction. We show that it is beneficial to train a model that jointly and directly localizes and repairs variable-misuse bugs. We present multi-headed pointer networks for this purpose, with one head each for localization and repair. The experimental results show that the joint model significantly outperforms an enumerative solution that uses a pointer based model for repair alone. 
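The joint localize-and-repair idea can be made concrete with a small sketch: a shared encoder over the program tokens feeds two pointer heads, one producing a distribution over token positions for the bug location and one for the repair source. The dimensions and the BiLSTM encoder below are illustrative choices, not the paper's exact architecture.

```python
import torch
import torch.nn as nn

class MultiHeadedPointer(nn.Module):
    """Shared encoder with one pointer head per subtask (localize, repair)."""
    def __init__(self, vocab_size: int, embed_dim: int = 128, hidden: int = 256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.encoder = nn.LSTM(embed_dim, hidden, batch_first=True,
                               bidirectional=True)
        self.loc_head = nn.Linear(2 * hidden, 1)     # points at the buggy token
        self.repair_head = nn.Linear(2 * hidden, 1)  # points at the repair token

    def forward(self, tokens: torch.Tensor):
        states, _ = self.encoder(self.embed(tokens))  # (batch, seq, 2*hidden)
        loc = self.loc_head(states).squeeze(-1).softmax(-1)
        repair = self.repair_head(states).squeeze(-1).softmax(-1)
        return loc, repair  # two distributions over token positions

model = MultiHeadedPointer(vocab_size=10_000)
loc, repair = model(torch.randint(0, 10_000, (1, 50)))  # toy 50-token program
```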
diff --git a/_publications/vasilescu2017recovering.markdown b/_publications/vasilescu2017recovering.markdown index e103e690..cbec38ba 100644 --- a/_publications/vasilescu2017recovering.markdown +++ b/_publications/vasilescu2017recovering.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Recovering Clear, Natural Identifiers from Obfuscated JS Names" -authors: B. Vasilescu, C. Casalnuovo, P. Devanbu +authors: Bogdan Vasilescu, Casey Casalnuovo, Premkumar Devanbu conference: FSE year: 2017 -bibkey: vasilescu2017recovering tags: ["deobfuscation", "naming"] --- Well-chosen variable names are critical to source code readability, reusability, and maintainability. Unfortunately, in deployed JavaScript code (which is ubiquitous on the web) the identifier names are frequently minified and overloaded. This is done both for efficiency and also to protect potentially proprietary intellectual property. In this paper, we describe an approach based on statistical machine translation (SMT) that recovers some of the original names from the JavaScript programs minified by the very popular UglifyJS. This simple tool, Autonym, performs comparably to the best currently available deobfuscator for JavaScript, JSNice, which uses sophisticated static analysis. In fact, Autonym is quite complementary to JSNice, performing well when it does not, and vice versa. We also introduce a new tool, JSNaughty, which blends Autonym and JSNice, and significantly outperforms both at identifier name recovery, while remaining just as easy to use as JSNice. JSNaughty is available online at http://jsnaughty.org. diff --git a/_publications/villmow2021contest.markdown b/_publications/villmow2021contest.markdown new file mode 100644 index 00000000..4e5e976c --- /dev/null +++ b/_publications/villmow2021contest.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "ConTest: A Unit Test Completion Benchmark featuring Context" +authors: Johannes Villmow, Jonas Depoix, Adrian Ulges +conference: NLP4Prog +year: 2021 +additional_links: + - {name: "PDF", url: "https://aclanthology.org/2021.nlp4prog-1.2.pdf"} +tags: ["benchmark", "dataset", "verification", "Transformer"] +--- +We introduce CONTEST, a benchmark for NLP-based unit test completion, the task of predicting a test’s assert statements given its setup and focal method, i.e. the method to be tested. ConTest is large-scale (with 365k datapoints). Besides the test code and tested code, it also features context code called by either. We found context to be crucial for accurately predicting assertions. We also introduce baselines based on transformer encoder-decoders, and study the effects of including syntactic information and context. Overall, our models achieve a BLEU score of 38.2, while only generating unparsable code in 1.92% of cases. diff --git a/_publications/wan2018improving.markdown b/_publications/wan2018improving.markdown index e66a2e53..232a4f63 100644 --- a/_publications/wan2018improving.markdown +++ b/_publications/wan2018improving.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Improving Automatic Source Code Summarization via Deep Reinforcement Learning" -authors: Y. Wan, Z. Zhao, M. Yang, G. Xu, H. Ying, J. Wu, P.S. Yu +authors: Yao Wan, Zhou Zhao, Min Yang, Guandong Xu, Haochao Ying, Jian Wu, Philip S. 
Yu conference: ASE year: 2018 -bibkey: wan2018improving additional_links: - {name: "ACM", url: "https://dl.acm.org/citation.cfm?id=3238206"} tags: ["summarization", "documentation"] diff --git a/_publications/wan2019multimodal.markdown b/_publications/wan2019multimodal.markdown index a17e0be6..88d486a0 100644 --- a/_publications/wan2019multimodal.markdown +++ b/_publications/wan2019multimodal.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Multi-Modal Attention Network Learning for Semantic Source Code Retrieval" -authors: Y. Wan, J. Shu, Y. Sui, G. Xu, Z. Zhao, J. Wu, P. S. Yu +authors: Yao Wan, Jingdong Shu, Yulei Sui, Guandong Xu, Zhou Zhao, Jian Wu, Philip S. Yu conference: year: 2019 -bibkey: wan2019multimodal additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1909.13516"} tags: ["search"] diff --git a/_publications/wan2020naturalcc.markdown b/_publications/wan2020naturalcc.markdown new file mode 100644 index 00000000..ae4639ff --- /dev/null +++ b/_publications/wan2020naturalcc.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "NaturalCC: A Toolkit to Naturalize the Source Code Corpus" +authors: Yao Wan, Yang He, Jian-Guo Zhang, Yulei Sui, Hai Jin, Guandong Xu, Caiming Xiong, Philip S. Yu +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2012.03225"} + - {name: "website", url: "https://xcodemind.github.io/"} + - {name: "code", url: "https://github.com/CGCL-codes/naturalcc"} +tags: ["documentation", "search", "summarization"] +--- +We present NaturalCC, an efficient and extensible toolkit to bridge the gap between natural language and programming language, and facilitate the research on big code analysis. Using NaturalCC, researchers from both the natural language and programming language communities can quickly and easily reproduce the state-of-the-art baselines and implement their approach. NaturalCC is built upon Fairseq and PyTorch, providing (1) an efficient computation with multi-GPU and mixed-precision data processing for fast model training, (2) a modular and extensible framework that makes it easy to reproduce or implement an approach for big code analysis, and (3) a command line interface and a graphical user interface to demonstrate each model's performance. Currently, we have included several state-of-the-art baselines across different tasks (e.g., code completion, code comment generation, and code retrieval) for demonstration. The video of this demo is available at https://www.youtube.com/watch?v=q4W5VSI-u3E&t=25s. diff --git a/_publications/wan2022what.markdown b/_publications/wan2022what.markdown new file mode 100644 index 00000000..5c8be571 --- /dev/null +++ b/_publications/wan2022what.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "What Do They Capture? -- A Structural Analysis of Pre-Trained Language Models for Source Code" +authors: Yao Wan, Wei Zhao, Hongyu Zhang, Yulei Sui, Guandong Xu, Hai Jin +conference: ICSE +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2202.06840"} + - {name: "Code", url: "https://github.com/CGCL-codes/naturalcc"} +tags: ["Transformer", "pretraining", "program analysis"] +--- +Recently, many pre-trained language models for source code have been proposed to model the context of code and serve as a basis for downstream code intelligence tasks such as code completion, code search, and code summarization.
These models leverage masked pre-training and Transformer and have achieved promising results. However, currently there is still little progress regarding interpretability of existing pre-trained code models. It is not clear why these models work and what feature correlations they can capture. In this paper, we conduct a thorough structural analysis aiming to provide an interpretation of pre-trained language models for source code (e.g., CodeBERT and GraphCodeBERT) from three distinctive perspectives: (1) attention analysis, (2) probing on the word embedding, and (3) syntax tree induction. Through comprehensive analysis, this paper reveals several insightful findings that may inspire future studies: (1) Attention aligns strongly with the syntax structure of code. (2) Pre-training language models of code can preserve the syntax structure of code in the intermediate representations of each Transformer layer. (3) The pre-trained models of code have the ability to induce syntax trees of code. These findings suggest that it may be helpful to incorporate the syntax structure of code into the process of pre-training for better code representations. \ No newline at end of file diff --git a/_publications/wang2016automatically.markdown b/_publications/wang2016automatically.markdown index 95712819..20a03a7a 100644 --- a/_publications/wang2016automatically.markdown +++ b/_publications/wang2016automatically.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Automatically Learning Semantic Features for Defect Prediction" -authors: S. Wang, T. Liu, L. Tan +authors: Song Wang, Taiyue Liu, Lin Tan conference: ICSE year: 2016 -bibkey: wang2016automatically tags: ["defect", "representation"] --- Software defect prediction, which predicts defective code regions, can help developers find bugs and prioritize their testing efforts. To build accurate prediction models, previous diff --git a/_publications/wang2016bugram.markdown b/_publications/wang2016bugram.markdown index 5bcb72f4..34fd759b 100644 --- a/_publications/wang2016bugram.markdown +++ b/_publications/wang2016bugram.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Bugram: bug detection with n-gram language models" -authors: S. Wang, D. Chollak, D. Movshovitz-Attias, L. Tan +authors: Song Wang, Devin Chollak, Dana Movshovitz-Attias, Lin Tan conference: ASE year: 2016 -bibkey: wang2016bugram tags: ["defect", "representation"] --- diff --git a/_publications/wang2016neural.markdown b/_publications/wang2016neural.markdown index 9c8be0b7..3c82e8e0 100644 --- a/_publications/wang2016neural.markdown +++ b/_publications/wang2016neural.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Neural Code Completion" -authors: C. Liu, X. Wang, R. Shin, J.E. Gonzalez, D. Song +authors: Chang Liu, Xin Wang, Richard Shin, Joseph E. Gonzalez, Dawn Song conference: year: 2016 -bibkey: wang2016neural tags: ["autocomplete"] --- Code completion, an essential part of modern software development, yet can be diff --git a/_publications/wang2019learning.markdown b/_publications/wang2019learning.markdown index 14bfbce8..de57b30d 100644 --- a/_publications/wang2019learning.markdown +++ b/_publications/wang2019learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning Scalable and Precise Representation of Program Semantics" -authors: K.
Wang +authors: Ke Wang conference: year: 2019 -bibkey: wang2019learning tags: ["representation", "dynamic"] --- Neural program embedding has shown potential in aiding the analysis of large-scale, complicated software. Newly proposed deep neural architectures pride themselves on learning program semantics rather than superficial syntactic features. However, by considering the source code only, the vast majority of neural networks do not capture a deep, precise representation of program semantics. In this paper, we present DYPRO, a novel deep neural network that learns from program execution traces. Compared to the prior dynamic models, not only is DYPRO capable of generalizing across multiple executions for learning a program's dynamic semantics in its entirety, but DYPRO is also more efficient when dealing with programs yielding long execution traces. For evaluation, we task DYPRO with semantic classification (i.e. categorizing programs based on their semantics) and compare it against two prominent static models: Gated Graph Neural Network and TreeLSTM. We find that DYPRO achieves the highest prediction accuracy among all models. To further reveal the capacity of all aforementioned deep neural architectures, we examine if the models can learn to detect deeper semantic properties of a program. In particular, given a task of recognizing loop invariants, we show DYPRO beats all static models by a wide margin. diff --git a/_publications/wang2020blended.markdown b/_publications/wang2020blended.markdown new file mode 100644 index 00000000..fa57dff5 --- /dev/null +++ b/_publications/wang2020blended.markdown @@ -0,0 +1,9 @@ +--- +layout: publication +title: "Blended, precise semantic program embeddings" +authors: Ke Wang, Zhendong Su +conference: PLDI +year: 2020 +tags: ["dynamic"] +--- +Learning neural program embeddings is key to utilizing deep neural networks in programming languages research --- precise and efficient program representations enable the application of deep models to a wide range of program analysis tasks. Existing approaches predominately learn to embed programs from their source code, and, as a result, they do not capture deep, precise program semantics. On the other hand, models learned from runtime information critically depend on the quality of program executions, thus leading to trained models with highly variant quality. This paper tackles these inherent weaknesses of prior approaches by introducing a new deep neural network, Liger, which learns program representations from a mixture of symbolic and concrete execution traces. We have evaluated Liger on two tasks: method name prediction and semantics classification. Results show that Liger is significantly more accurate than the state-of-the-art static model code2seq in predicting method names, and requires on average around 10x fewer executions covering nearly 4x fewer paths than the state-of-the-art dynamic model DYPRO in both tasks. Liger offers a new, interesting design point in the space of neural program embeddings and opens up this new direction for exploration.
diff --git a/_publications/wang2020cocogum.markdown b/_publications/wang2020cocogum.markdown index d933331a..061f7c11 100644 --- a/_publications/wang2020cocogum.markdown +++ b/_publications/wang2020cocogum.markdown @@ -4,7 +4,6 @@ title: "CoCoGUM: Contextual Code Summarization with Multi-Relational GNN on UMLs authors: Yanlin Wang, Lun Du, Ensheng Shi, Yuxuan Hu, Shi Han, Dongmei Zhang conference: year: 2020 -bibkey: wang2020cocogum additional_links: - {name: "TR", url: "https://www.microsoft.com/en-us/research/publication/cocogum-contextual-code-summarization-with-multi-relational-gnn-on-umls/"} tags: ["summarization"] diff --git a/_publications/wang2020detecting.markdown b/_publications/wang2020detecting.markdown new file mode 100644 index 00000000..9b164e70 --- /dev/null +++ b/_publications/wang2020detecting.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: Detecting Code Clones with Graph Neural Network and Flow-Augmented Abstract Syntax Tree +authors: Wenhan Wang, Ge Li, Bo Ma, Xin Xia, Zhi Jin +conference: IEEE International Conference on Software Analysis, Evolution, and Reengineering +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2002.08653"} +tags: ["clone", "GNN"] +--- + +Code clones are pairs of semantically similar code fragments that can be syntactically similar or different. Detection of code clones can help to reduce the cost of software maintenance and prevent bugs. Numerous approaches for detecting code clones have been proposed previously, but most of them focus on detecting syntactic clones and do not work well on semantic clones with different syntactic features. To detect semantic clones, researchers have tried to adopt deep learning for code clone detection to automatically learn latent semantic features from data. Especially, to leverage grammar information, several approaches used abstract syntax trees (AST) as input and achieved significant progress on code clone benchmarks in various programming languages. However, these AST-based approaches still cannot fully leverage the structural information of code fragments, especially semantic information such as control flow and data flow. To leverage control and data flow information, in this paper, we build a graph representation of programs called flow-augmented abstract syntax tree (FA-AST). We construct FA-AST by augmenting original ASTs with explicit control and data flow edges. Then we apply two different types of graph neural networks (GNN) on FA-AST to measure the similarity of code pairs. To the best of our knowledge, we are the first to apply graph neural networks to code clone detection. We apply our FA-AST and graph neural networks on two Java datasets: Google Code Jam and BigCloneBench. Our approach outperforms the state-of-the-art approaches on both Google Code Jam and BigCloneBench tasks.
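A minimal sketch helps make the pair-similarity setup concrete: a shared graph network encodes each flow-augmented AST, node states are pooled into one graph vector, and the pair is scored by cosine similarity. The single GRU-style message-passing layer below is a deliberate simplification of the GGNN/GMN variants the paper uses.

```python
import torch
import torch.nn as nn

class TinyGNN(nn.Module):
    """A few rounds of message passing over FA-AST edges, then mean pooling."""
    def __init__(self, dim: int = 64, steps: int = 3):
        super().__init__()
        self.steps = steps
        self.update = nn.GRUCell(dim, dim)

    def forward(self, x: torch.Tensor, adj: torch.Tensor) -> torch.Tensor:
        # x: (nodes, dim) node features; adj: (nodes, nodes) AST + flow edges
        for _ in range(self.steps):
            x = self.update(adj @ x, x)  # aggregate neighbors, GRU-style update
        return x.mean(dim=0)             # pool node states to one graph vector

gnn = TinyGNN()

def clone_score(graph_a, graph_b):
    """Each graph is a (node_features, adjacency) pair; higher = more similar."""
    return torch.cosine_similarity(gnn(*graph_a), gnn(*graph_b), dim=0)
```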
diff --git a/_publications/wang2020learning.markdown b/_publications/wang2020learning.markdown index 16245ac0..01863571 100644 --- a/_publications/wang2020learning.markdown +++ b/_publications/wang2020learning.markdown @@ -4,7 +4,6 @@ title: "Learning Semantic Program Embeddings with Graph Interval Neural Network" authors: Yu Wang, Fengjuan Gao, Linzhang Wang, Ke Wang conference: year: 2020 -bibkey: wang2020learning additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2005.09997"} tags: ["GNN", "defect"] diff --git a/_publications/wang2020learning2.markdown b/_publications/wang2020learning2.markdown new file mode 100644 index 00000000..4817270e --- /dev/null +++ b/_publications/wang2020learning2.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "Learning to Represent Programs with Heterogeneous Graphs" +authors: Wenhan Wang, Kechi Zhang, Ge Li, Zhi Jin +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2012.04188"} +tags: ["GNN", "summarization"] +--- +Program source code contains complex structure information, which can be represented in structured data forms like trees or graphs. To acquire the structural information in source code, most existing research uses abstract syntax trees (AST). A group of works add additional edges to ASTs to convert source code into graphs and use graph neural networks to learn representations for program graphs. Although these works provide additional control or data flow information to ASTs for downstream tasks, they neglect an important aspect of structure information in AST itself: the different types of nodes and edges. In ASTs, different nodes contain different kinds of information like variables or control flow, and the relation between a node and all its children can also be different. + +To capture the information of node and edge types, we bring the idea of heterogeneous graphs to learning on source code and present a new formulation for building heterogeneous program graphs from ASTs with additional type information for nodes and edges. We use the ASDL grammar of the programming language to define the node and edge types of program graphs. Then we use heterogeneous graph neural networks to learn on these graphs. We evaluate our approach on two tasks: code comment generation and method naming. Both tasks require reasoning on the semantics of complete code snippets. Experiment results show that our approach outperforms baseline models, including homogeneous graph-based models, showing that leveraging the type information of nodes and edges in program graphs can help in learning program semantics. diff --git a/_publications/wang2020modular.markdown b/_publications/wang2020modular.markdown new file mode 100644 index 00000000..96bd32bd --- /dev/null +++ b/_publications/wang2020modular.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Modular Tree Network for Source Code Representation Learning" +authors: Wenhan Wang, Ge Li, Sijie Shen, Xin Xia, Zhi Jin +conference: TOSEM +year: 2020 +additional_links: + - {name: "ACM", url: "https://dl.acm.org/doi/10.1145/3409331"} +tags: ["grammar", "representation"] +--- +Learning representation for source code is a foundation of many program analysis tasks. In recent years, neural networks have already shown success in this area, but most existing models did not make full use of the unique structural information of programs.
Although abstract syntax tree (AST)-based neural models can handle the tree structure in the source code, they cannot capture the richness of different types of substructure in programs. In this article, we propose a modular tree network that dynamically composes different neural network units into tree structures based on the input AST. Different from previous tree-structured neural network models, a modular tree network can capture the semantic differences between types of AST substructures. We evaluate our model on two tasks: program classification and code clone detection. Our model achieves the best performance compared with state-of-the-art approaches in both tasks, showing the advantage of leveraging more elaborate structure information of the source code. diff --git a/_publications/wang2020trans.markdown b/_publications/wang2020trans.markdown index ca1ed0e6..49f05064 100644 --- a/_publications/wang2020trans.markdown +++ b/_publications/wang2020trans.markdown @@ -1,12 +1,11 @@ --- layout: publication title: "TranS^3: A Transformer-based Framework for Unifying Code Summarization and Code Search" -authors: W. Wang, Y. Zhang, Z. Zeng, G. Xu +authors: Wenhua Wang, Yuqun Zhang, Zhengran Zeng, Guandong Xu conference: year: 2020 -bibkey: wang2020trans additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2003.03238"} -tags: ["retrieval", "documentation"] +tags: ["search", "documentation"] --- Code summarization and code search have been widely adopted in software development and maintenance. However, few studies have explored the efficacy of unifying them. In this paper, we propose TranS^3, a transformer-based framework to integrate code summarization with code search. Specifically, for code summarization, TranS^3 enables an actor-critic network, where in the actor network, we encode the collected code snippets via transformer- and tree-transformer-based encoders and decode the given code snippet to generate its comment. Meanwhile, we iteratively tune the actor network via the feedback from the critic network for enhancing the quality of the generated comments. Furthermore, we import the generated comments to code search for enhancing its accuracy. To evaluate the effectiveness of TranS^3, we conduct a set of experimental studies and case studies where the experimental results suggest that TranS^3 can significantly outperform multiple state-of-the-art approaches in both code summarization and code search and the study results further strengthen the efficacy of TranS^3 from the developers' points of view. diff --git a/_publications/wang2021codet5.markdown b/_publications/wang2021codet5.markdown new file mode 100644 index 00000000..9b1ba6d4 --- /dev/null +++ b/_publications/wang2021codet5.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation" +authors: Yue Wang, Weishi Wang, Shafiq Joty, Steven C.H. Hoi +conference: EMNLP +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2109.00859"} + - {name: "Code & Model", url: "https://github.com/salesforce/CodeT5"} +tags: ["Transformer"] +--- +Pre-trained models for Natural Languages (NL) like BERT and GPT have been recently shown to transfer well to Programming Languages (PL) and largely benefit a broad set of code-related tasks. Despite their success, most current methods either rely on an encoder-only (or decoder-only) pre-training that is suboptimal for generation (resp.
understanding) tasks or process the code snippet in the same way as NL, neglecting the special characteristics of PL such as token types. We present CodeT5, a unified pre-trained encoder-decoder Transformer model that better leverages the code semantics conveyed from the developer-assigned identifiers. Our model employs a unified framework to seamlessly support both code understanding and generation tasks and allows for multi-task learning. In addition, we propose a novel identifier-aware pre-training task that enables the model to distinguish which code tokens are identifiers and to recover them when they are masked. Furthermore, we propose to exploit the user-written code comments with a bimodal dual generation task for better NL-PL alignment. Comprehensive experiments show that CodeT5 significantly outperforms prior methods on understanding tasks such as code defect detection and clone detection, and generation tasks across various directions including PL-NL, NL-PL, and PL-PL. Further analysis reveals that our model can better capture semantic information from code. Our code and pre-trained models are released at https://github.com/salesforce/CodeT5. diff --git a/_publications/wang2021syncobert.markdown b/_publications/wang2021syncobert.markdown new file mode 100644 index 00000000..1478c85f --- /dev/null +++ b/_publications/wang2021syncobert.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "SynCoBERT: Syntax-Guided Multi-Modal Contrastive Pre-Training for Code Representation" +authors: Xin Wang, Yasheng Wang, Fei Mi, Pingyi Zhou, Yao Wan, Xiao Liu, Li Li, Hao Wu, Jin Liu, Xin Jiang +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2108.04556"} +tags: ["pretraining"] +--- +Code representation learning, which aims to encode the semantics of source code into distributed vectors, plays an important role in recent deep-learning-based models for code intelligence. Recently, many pre-trained language models for source code (e.g., CuBERT and CodeBERT) have been proposed to model the context of code and serve as a basis for downstream code intelligence tasks such as code search, code clone detection, and program translation. Current approaches typically consider the source code as a plain sequence of tokens, or inject the structure information (e.g., AST and data-flow) into the sequential model pre-training. To further explore the properties of programming languages, this paper proposes SynCoBERT, a syntax-guided multi-modal contrastive pre-training approach for better code representations. Specifically, we design two novel pre-training objectives originating from the symbolic and syntactic properties of source code, i.e., Identifier Prediction (IP) and AST Edge Prediction (TEP), which are designed to predict identifiers, and edges between two nodes of AST, respectively. Meanwhile, to exploit the complementary information in semantically equivalent modalities (i.e., code, comment, AST) of the code, we propose a multi-modal contrastive learning strategy to maximize the mutual information among different modalities. Extensive experiments on four downstream tasks related to code intelligence show that SynCoBERT advances the state-of-the-art with the same pre-training corpus and model size.
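The multi-modal contrastive strategy can be illustrated with a generic InfoNCE objective: embeddings of two views of the same snippet (say, its token sequence and a serialized AST) are pulled together while the rest of the batch serves as negatives. This is a sketch of the general recipe, not SynCoBERT's exact loss.

```python
import torch
import torch.nn.functional as F

def info_nce(view_a: torch.Tensor, view_b: torch.Tensor, tau: float = 0.07):
    """Contrastive loss over a batch: matching rows are positive pairs."""
    a = F.normalize(view_a, dim=-1)    # (batch, dim) embeddings of modality A
    b = F.normalize(view_b, dim=-1)    # (batch, dim) embeddings of modality B
    logits = a @ b.t() / tau           # pairwise similarities
    targets = torch.arange(a.size(0))  # positives sit on the diagonal
    return F.cross_entropy(logits, targets)

loss = info_nce(torch.randn(8, 256), torch.randn(8, 256))  # toy batch
```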
\ No newline at end of file diff --git a/_publications/wang2023codet5.markdown b/_publications/wang2023codet5.markdown new file mode 100644 index 00000000..a75b04a2 --- /dev/null +++ b/_publications/wang2023codet5.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "CodeT5+: Open Code Large Language Models for Code Understanding and Generation" +authors: Yue Wang, Hung Le, Akhilesh Deepak Gotmare, Nghi D. Q. Bui, Junnan Li, Steven C. H. Hoi +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2305.07922"} +tags: ["Transformer"] +--- +Large language models (LLMs) pretrained on vast source code have achieved prominent progress in code intelligence. However, existing code LLMs have two main limitations in terms of architecture and pretraining tasks. First, they often adopt a specific architecture (encoder-only or decoder-only) or rely on a unified encoder-decoder network for different downstream tasks. The former paradigm is limited by inflexibility in applications while in the latter, the model is treated as a single system for all tasks, leading to suboptimal performance on a subset of tasks. Secondly, they often employ a limited set of pretraining objectives which might not be relevant to some downstream tasks and hence result in substantial performance degradation. To address these limitations, we propose "CodeT5+", a family of encoder-decoder LLMs for code in which component modules can be flexibly combined to suit a wide range of downstream code tasks. Such flexibility is enabled by our proposed mixture of pretraining objectives to mitigate the pretrain-finetune discrepancy. These objectives cover span denoising, contrastive learning, text-code matching, and causal LM pretraining tasks, on both unimodal and bimodal multilingual code corpora. Furthermore, we propose to initialize CodeT5+ with frozen off-the-shelf LLMs without training from scratch to efficiently scale up our models, and explore instruction-tuning to align with natural language instructions. We extensively evaluate CodeT5+ on over 20 code-related benchmarks in different settings, including zero-shot, finetuning, and instruction-tuning. We observe state-of-the-art (SoTA) model performance on various code-related tasks, such as code generation and completion, math programming, and text-to-code retrieval tasks. Particularly, our instruction-tuned CodeT5+ 16B achieves new SoTA results on the HumanEval code generation task against other open code LLMs. diff --git a/_publications/wang2023deepvd.markdown b/_publications/wang2023deepvd.markdown new file mode 100644 index 00000000..5e797eaf --- /dev/null +++ b/_publications/wang2023deepvd.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "DeepVD: Toward Class-Separation Features for Neural Network Vulnerability Detection" +authors: Wenbo Wang, Tien N. Nguyen, Shaohua Wang, Yi Li, Jiyuan Zhang, Aashish Yadavally +conference: ICSE +year: 2023 +additional_links: + - {name: "website", url: "https://aashishyadavally.github.io/publication/C4"} + - {name: "code", url: "https://github.com/deepvd2022/deepvd2022"} +tags: ["vulnerability"] +--- +The advances of machine learning (ML) including deep learning (DL) have enabled several approaches to implicitly learn vulnerable code patterns to automatically detect software vulnerabilities.
A recent study showed that despite successes, the existing ML/DL-based vulnerability detection (VD) models are limited in the ability to distinguish between the two classes of vulnerable and benign code. We propose DeepVD, a graph-based neural network VD model that emphasizes class-separation features between vulnerable and benign code. DeepVD leverages three types of class-separation features at different levels of abstraction: statement types (similar to Part-of-Speech tagging), Post-Dominator Tree (covering regular flows of execution), and Exception Flow Graph (covering the exception and error-handling flows). We conducted several experiments to evaluate DeepVD in a real-world vulnerability dataset of 303 projects with 13,130 vulnerable methods. Our results show that DeepVD relatively improves over the state-of-the-art ML/DL-based VD approaches by 13%–29.6% in precision, 15.6%–28.9% in recall, and 16.4%–25.8% in F-score. Our ablation study confirms that our designed features and components help DeepVD achieve high class-separability between vulnerable and benign code. diff --git a/_publications/watson2021systematic.markdown b/_publications/watson2021systematic.markdown new file mode 100644 index 00000000..01067564 --- /dev/null +++ b/_publications/watson2021systematic.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: A Systematic Literature Review on the Use of Deep Learning in Software Engineering Research +authors: Cody Watson, Nathan Cooper, David Nader Palacio, Kevin Moran, Denys Poshyvanyk +conference: TSE +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2009.06520"} + - {name: "website", url: "https://wm-semeru.github.io/dl4se/"} + - {name: "code", url: "https://github.com/WM-SEMERU/dl4se"} +tags: ["survey"] +--- +An increasingly popular set of techniques adopted by software engineering (SE) researchers to automate development tasks are those rooted in the concept of Deep Learning (DL). The popularity of such techniques largely stems from their automated feature engineering capabilities, which aid in modeling software artifacts. However, due to the rapid pace at which DL techniques have been adopted, it is difficult to distill the current successes, failures, and opportunities of the current research landscape. In an effort to bring clarity to this crosscutting area of work, from its modern inception to the present, this paper presents a systematic literature review of research at the intersection of SE & DL. The review canvases work appearing in the most prominent SE and DL conferences and journals and spans 128 papers across 23 unique SE tasks. We center our analysis around the components of learning, a set of principles that govern the application of machine learning techniques (ML) to a given problem domain, discussing several aspects of the surveyed work at a granular level. The end result of our analysis is a research roadmap that both delineates the foundations of DL techniques applied to SE research, and highlights likely areas of fertile exploration for the future. diff --git a/_publications/waunakh2019evaluating.markdown b/_publications/waunakh2019evaluating.markdown deleted file mode 100644 index aacea92d..00000000 --- a/_publications/waunakh2019evaluating.markdown +++ /dev/null @@ -1,12 +0,0 @@ ---- -layout: publication -title: "Evaluating Semantic Representations of Source Code" -authors: Y. Wainakh, M. Rauf, M.
Pradel -conference: -year: 2019 -bibkey: waunakh2019evaluating -additional_links: - - {name: "ArXiV", url: "https://arxiv.org/abs/1910.05177"} -tags: ["representation"] ---- -Learned representations of source code enable various software developer tools, e.g., to detect bugs or to predict program properties. At the core of code representations often are word embeddings of identifier names in source code, because identifiers account for the majority of source code vocabulary and convey important semantic information. Unfortunately, there currently is no generally accepted way of evaluating the quality of word embeddings of identifiers, and current evaluations are biased toward specific downstream tasks. This paper presents IdBench, the first benchmark for evaluating to what extent word embeddings of identifiers represent semantic relatedness and similarity. The benchmark is based on thousands of ratings gathered by surveying 500 software developers. We use IdBench to evaluate state-of-the-art embedding techniques proposed for natural language, an embedding technique specifically designed for source code, and lexical string distance functions, as these are often used in current developer tools. Our results show that the effectiveness of embeddings varies significantly across different embedding techniques and that the best available embeddings successfully represent semantic relatedness. On the downside, no existing embedding provides a satisfactory representation of semantic similarities, e.g., because embeddings consider identifiers with opposing meanings as similar, which may lead to fatal mistakes in downstream developer tools. IdBench provides a gold standard to guide the development of novel embeddings that address the current limitations. diff --git a/_publications/waunakh2019idbench.markdown b/_publications/waunakh2019idbench.markdown new file mode 100644 index 00000000..4bc73df9 --- /dev/null +++ b/_publications/waunakh2019idbench.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "IdBench: Evaluating Semantic Representations of Identifier Names in Source Code" +authors: Yaza Wainakh, Moiz Rauf, Michael Pradel +conference: ICSE +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1910.05177"} +tags: ["representation"] +--- +Identifier names convey useful information about the intended semantics of code. Name-based program analyses use this information, e.g., to detect bugs, to predict types, and to improve the readability of code. At the core of name-based analyses are semantic representations of identifiers, e.g., in the form of learned embeddings. The high-level goal of such a representation is to encode whether two identifiers, e.g., len and size, are semantically similar. Unfortunately, it is currently unclear to what extent semantic representations match the semantic relatedness and similarity perceived by developers. This paper presents IdBench, the first benchmark for evaluating semantic representations against a ground truth created from thousands of ratings by 500 software developers. We use IdBench to study state-of-the-art embedding techniques proposed for natural language, an embedding technique specifically designed for source code, and lexical string distance functions. Our results show that the effectiveness of semantic representations varies significantly and that the best available embeddings successfully represent semantic relatedness.
On the downside, no existing technique provides a satisfactory representation of semantic similarities, among other reasons because identifiers with opposing meanings are incorrectly considered to be similar, which may lead to fatal mistakes, e.g., in a refactoring tool. Studying the strengths and weaknesses of the different techniques shows that they complement each other. As a first step toward exploiting this complementarity, we present an ensemble model that combines existing techniques and that clearly outperforms the best available semantic representation. diff --git a/_publications/wei2019code.markdown b/_publications/wei2019code.markdown index 27202b72..e83ac638 100644 --- a/_publications/wei2019code.markdown +++ b/_publications/wei2019code.markdown @@ -1,12 +1,11 @@ --- layout: publication title: "Code Generation as a Dual Task of Code Summarization" -authors: B. Wei, G. Li, X. Xia, Z. Fu, Z. Jin +authors: Bolin Wei, Ge Li, Xin Xia, Zhiyi Fu, Zhi Jin conference: NeurIPS year: 2019 -bibkey: wei2019code additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1910.05923"} -tags: ["generation", "summarization"] +tags: ["code generation", "summarization"] --- Code summarization (CS) and code generation (CG) are two crucial tasks in the field of automatic software development. Various neural network-based approaches are proposed to solve these two tasks separately. However, there exists a specific intuitive correlation between CS and CG, which has not been exploited in previous work. In this paper, we apply the relations between two tasks to improve the performance of both tasks. In other words, exploiting the duality between the two tasks, we propose a dual training framework to train the two tasks simultaneously. In this framework, we consider the dualities on probability and attention weights, and design corresponding regularization terms to constrain the duality. We evaluate our approach on two datasets collected from GitHub, and experimental results show that our dual framework can improve the performance of CS and CG tasks over baselines. diff --git a/_publications/wei2020lambdanet.markdown b/_publications/wei2020lambdanet.markdown index cc7c99ed..12de7074 100644 --- a/_publications/wei2020lambdanet.markdown +++ b/_publications/wei2020lambdanet.markdown @@ -1,12 +1,12 @@ --- layout: publication title: "LambdaNet: Probabilistic Type Inference using Graph Neural Networks" -authors: J. Wei, M. Goyal, G. Durrett, I. Dillig +authors: Jiayi Wei, Maruth Goyal, Greg Durrett, Isil Dillig conference: ICLR year: 2020 -bibkey: wei2020lambdanet additional_links: - {name: "OpenReview", url: "https://openreview.net/forum?id=Hkx6hANtwH&noteId=Hkx6hANtwH"} + - {name: "ArXiV", url: "https://arxiv.org/abs/2005.02161"} - {name: "Code", url: "https://github.com/MrVPlusOne/LambdaNet"} tags: ["GNN", "types"] --- diff --git a/_publications/wei2023typet5.markdown b/_publications/wei2023typet5.markdown new file mode 100644 index 00000000..03b7262a --- /dev/null +++ b/_publications/wei2023typet5.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "TypeT5: Seq2seq Type Inference using Static Analysis" +authors: Jiayi Wei, Greg Durrett, Isil Dillig +conference: ICLR +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2303.09564"} +tags: ["types", "Transformer"] +--- +There has been growing interest in automatically predicting missing type annotations in programs written in Python and JavaScript.
While prior methods have achieved impressive accuracy when predicting the most common types, they often perform poorly on rare or complex types. In this paper, we present a new type inference method that treats type prediction as a code infilling task by leveraging CodeT5, a state-of-the-art seq2seq pre-trained language model for code. Our method uses static analysis to construct dynamic contexts for each code element whose type signature is to be predicted by the model. We also propose an iterative decoding scheme that incorporates previous type predictions in the model's input context, allowing information exchange between related code elements. Our evaluation shows that the proposed approach, TypeT5, not only achieves a higher overall accuracy (particularly on rare and complex types) but also produces more coherent results with fewer type errors -- while enabling easy user intervention. diff --git a/_publications/white2015toward.markdown b/_publications/white2015toward.markdown index 2c8e2793..0c4406d7 100644 --- a/_publications/white2015toward.markdown +++ b/_publications/white2015toward.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Toward Deep Learning Software Repositories" -authors: M. White, C. Vendome, M. Linares-Vásquez, D. Poshyvanyk +authors: Martin White, Christopher Vendome, Mario Linares-Vásquez, Denys Poshyvanyk conference: MSR year: 2015 -bibkey: white2015toward tags: ["representation"] --- Deep learning subsumes algorithms that automatically learn compositional representations. The ability of these diff --git a/_publications/white2016deep.markdown b/_publications/white2016deep.markdown index d19800a3..b31d11da 100644 --- a/_publications/white2016deep.markdown +++ b/_publications/white2016deep.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Deep Learning Code Fragments for Code Clone Detection" -authors: M. White, M. Tufano, C. Vendome, D. Poshyvanyk +authors: Martin White, Michele Tufano, Christopher Vendome, Denys Poshyvanyk conference: ASE year: 2016 -bibkey: white2016deep tags: ["clone"] --- Code clone detection is an important problem for software diff --git a/_publications/white2017sorting.markdown b/_publications/white2017sorting.markdown index 4a8e2341..f4653c43 100644 --- a/_publications/white2017sorting.markdown +++ b/_publications/white2017sorting.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Sorting and Transforming Program Repair Ingredients via Deep Learning Code Similarities" -authors: M. White, M. Tufano, M. Martínez, M. Monperrus, D. Poshyvanyk -conference: +authors: Martin White, Michele Tufano, Matias Martinez, Martin Monperrus, Denys Poshyvanyk +conference: SANER year: 2017 -bibkey: white2017sorting tags: ["repair"] --- In the field of automated program repair, the redundancy assumption claims large programs contain the seeds diff --git a/_publications/wong2021leveraging.markdown b/_publications/wong2021leveraging.markdown new file mode 100644 index 00000000..414b7031 --- /dev/null +++ b/_publications/wong2021leveraging.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Leveraging Language to Learn Program Abstractions and Search Heuristics" +authors: Catherine Wong, Kevin Ellis, Joshua B.
Tenenbaum, Jacob Andreas +conference: ICML +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2106.11053"} + - {name: "Poster", url: "https://icml.cc/Conferences/2021/ScheduleMultitrack?event=10372"} +tags: ["synthesis", "search"] +--- +Inductive program synthesis, or inferring programs from examples of desired behavior, offers a general paradigm for building interpretable, robust, and generalizable machine learning systems. Effective program synthesis depends on two key ingredients: a strong library of functions from which to build programs, and an efficient search strategy for finding programs that solve a given task. We introduce LAPS (Language for Abstraction and Program Search), a technique for using natural language annotations to guide joint learning of libraries and neurally-guided search models for synthesis. When integrated into a state-of-the-art library learning system (DreamCoder), LAPS produces higher-quality libraries and improves search efficiency and generalization on three domains -- string editing, image composition, and abstract reasoning about scenes -- even when no natural language hints are available at test time. \ No newline at end of file diff --git a/_publications/wu2021prototransformer.markdown b/_publications/wu2021prototransformer.markdown new file mode 100644 index 00000000..802b2466 --- /dev/null +++ b/_publications/wu2021prototransformer.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "ProtoTransformer: A Meta-Learning Approach to Providing Student Feedback" +authors: Mike Wu, Noah D. Goodman, Chris Piech, Chelsea Finn +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2107.14035"} +tags: ["Transformer", "education"] +--- +High-quality computer science education is limited by the difficulty of providing instructor feedback to students at scale. While this feedback could in principle be automated, supervised approaches to predicting the correct feedback are bottlenecked by the intractability of annotating large quantities of student code. In this paper, we instead frame the problem of providing feedback as few-shot classification, where a meta-learner adapts to give feedback to student code on a new programming question from just a few examples annotated by instructors. Because data for meta-training is limited, we propose a number of amendments to the typical few-shot learning framework, including task augmentation to create synthetic tasks, and additional side information to build stronger priors about each task. These additions are combined with a transformer architecture to embed discrete sequences (e.g. code) to a prototypical representation of a feedback class label. On a suite of few-shot natural language processing tasks, we match or outperform state-of-the-art performance. Then, on a collection of student solutions to exam questions from an introductory university course, we show that our approach reaches an average precision of 88% on unseen questions, surpassing the 82% precision of teaching assistants. Our approach was successfully deployed to deliver feedback on 16,000 student exam solutions in a programming course offered by a tier 1 university. This is, to the best of our knowledge, the first successful deployment of machine learning-based feedback on open-ended student code.
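
The core prototypical-network idea behind ProtoTransformer is simple enough to sketch. Below is a minimal illustration (not the authors' code): each feedback label is represented by the mean, or "prototype", of the embeddings of its few annotated examples, and new student code is labeled by its nearest prototype. The random embeddings stand in for the output of a code encoder, and the label names are hypothetical.

```python
# A minimal sketch of prototypical classification, assuming an upstream code
# encoder has already produced embeddings; random vectors stand in for those here.
import numpy as np

def prototypes(support_embs: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
    """Average each feedback class's support embeddings into one prototype."""
    return {label: embs.mean(axis=0) for label, embs in support_embs.items()}

def classify(query: np.ndarray, protos: dict[str, np.ndarray]) -> str:
    """Assign the feedback label whose prototype is nearest in Euclidean distance."""
    return min(protos, key=lambda label: np.linalg.norm(query - protos[label]))

rng = np.random.default_rng(0)
support = {  # a few instructor-annotated examples per (hypothetical) feedback class
    "off_by_one": rng.normal(size=(4, 16)),
    "missing_base_case": rng.normal(size=(4, 16)),
}
print(classify(rng.normal(size=16), prototypes(support)))
```

Because prototypes are just averages, adapting to a new exam question only requires embedding a handful of annotated examples; no gradient updates are needed at deployment time.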
diff --git a/_publications/xia2023universal.markdown b/_publications/xia2023universal.markdown new file mode 100644 index 00000000..0f20b845 --- /dev/null +++ b/_publications/xia2023universal.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Universal Fuzzing via Large Language Models" +authors: Chunqiu Steven Xia, Matteo Paltenghi, Jia Le Tian, Michael Pradel, Lingming Zhang +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2308.04748"} +tags: ["fuzzing"] +--- +Fuzzing has achieved tremendous success in discovering bugs and vulnerabilities in various software systems. Systems under test (SUTs) that take in programming or formal language as inputs, e.g., compilers, runtime engines, constraint solvers, and software libraries with accessible APIs, are especially important as they are fundamental building blocks of software development. However, existing fuzzers for such systems often target a specific language, and thus cannot be easily applied to other languages or even other versions of the same language. Moreover, the inputs generated by existing fuzzers are often limited to specific features of the input language, and thus can hardly reveal bugs related to other or new features. This paper presents Fuzz4All, the first fuzzer that is universal in the sense that it can target many different input languages and many different features of these languages. The key idea behind Fuzz4All is to leverage large language models (LLMs) as an input generation and mutation engine, which enables the approach to produce diverse and realistic inputs for any practically relevant language. To realize this potential, we present a novel autoprompting technique, which creates LLM prompts that are well-suited for fuzzing, and a novel LLM-powered fuzzing loop, which iteratively updates the prompt to create new fuzzing inputs. We evaluate Fuzz4All on nine systems under test that take in six different languages (C, C++, Go, SMT2, Java and Python) as inputs. The evaluation shows, across all six languages, that universal fuzzing achieves higher coverage than existing, language-specific fuzzers. Furthermore, Fuzz4All has identified 76 bugs in widely used systems, such as GCC, Clang, Z3, CVC5, OpenJDK, and the Qiskit quantum computing platform, with 47 bugs already confirmed by developers as previously unknown. diff --git a/_publications/xu2019commit.markdown b/_publications/xu2019commit.markdown index 96e1a918..ca64886e 100644 --- a/_publications/xu2019commit.markdown +++ b/_publications/xu2019commit.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Commit Message Generation for Source Code Changes" -authors: S. Xu, Y. Yao, F. Xu, T. Gu, H. Tong, J. Lu +authors: Shengbin Xu, Yuan Yao, Feng Xu, Tianxiao Gu, Hanghang Tong, Jian Lu conference: IJCAI year: 2019 -bibkey: xu2019commit tags: ["edit", "summarization"] --- Commit messages, which summarize the source diff --git a/_publications/xu2019method.markdown b/_publications/xu2019method.markdown index d5358080..970a8d76 100644 --- a/_publications/xu2019method.markdown +++ b/_publications/xu2019method.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Method name suggestion with hierarchical attention networks" -authors: S. Xu, S. Zhang, W. Wang, X. Cao, C. Guo, J. Xu +authors: Sihan Xu, Sen Zhang, Weijing Wang, Xinya Cao, Chenkai Guo, Jing Xu
conference: PEPM year: 2019 -bibkey: xu2019method tags: ["naming"] --- Method renaming is a widely used refactoring operation that improves program comprehension and maintenance. Descriptive method names that summarize functionalities of source code can facilitate program comprehension. Much research has been done to suggest method names through source code summarization. However, unlike natural language, a code snippet consists of basic blocks organized by complicated structures. In this work, we observe a hierarchical structure --- tokens form basic blocks and basic blocks form a code snippet. Based on this observation, we exploit a hierarchical attention network to learn the representation of methods. Specifically, we apply a two-level attention mechanism to learn the importance of each token in a basic block and that of a basic block in a method respectively. We evaluated our approach on 10 open source repositories and compared it against three state-of-the-art approaches. The results on these open-source data show the superiority of our hierarchical attention networks in terms of effectiveness. diff --git a/_publications/xu2020incorporating.markdown b/_publications/xu2020incorporating.markdown index d00a5ceb..f4eeb528 100644 --- a/_publications/xu2020incorporating.markdown +++ b/_publications/xu2020incorporating.markdown @@ -4,10 +4,9 @@ title: "Incorporating External Knowledge through Pre-training for Natural Langua authors: Frank F. Xu, Zhengbao Jiang, Pengcheng Yin, Bogdan Vasilescu, Graham Neubig conference: ACL year: 2020 -bibkey: xu2020incorporating additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2004.09015"} - {name: "Code", url: "https://github.com/neulab/external-knowledge-codegen"} -tags: ["bimodal", "generation"] +tags: ["bimodal", "code generation"] --- Open-domain code generation aims to generate code in a general-purpose programming language (such as Python) from natural language (NL) intents. Motivated by the intuition that developers usually retrieve resources on the web when writing code, we explore the effectiveness of incorporating two varieties of external knowledge into NL-to-code generation: automatically mined NL-code pairs from the online programming QA forum StackOverflow and programming language API documentation. Our evaluations show that combining the two sources with data augmentation and retrieval-based data re-sampling improves the current state-of-the-art by up to 2.2% absolute BLEU score on the code generation testbed CoNaLa. The code and resources are available at [https://github.com/neulab/external-knowledge-codegen](https://github.com/neulab/external-knowledge-codegen).
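
The retrieval-based re-sampling mentioned in the abstract above can be illustrated with a small sketch. The following is a hedged approximation, not the paper's exact setup: mined NL-code pairs are re-weighted by their textual similarity to a target distribution so that in-domain-looking pairs are sampled more often; the TF-IDF representation, the intent strings, and the temperature are all illustrative assumptions.

```python
# A minimal sketch of similarity-weighted re-sampling of mined training pairs.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

mined_intents = ["how to sort a dict by value", "read json file python", "regex match digits"]
target_intents = ["sort list of tuples by second element", "parse a json string"]  # hypothetical

vec = TfidfVectorizer().fit(mined_intents + target_intents)
# For each mined pair, keep its best similarity to any target-domain example.
sim = cosine_similarity(vec.transform(mined_intents), vec.transform(target_intents)).max(axis=1)

temperature = 0.1  # lower temperature -> sharper preference for in-domain pairs
weights = np.exp(sim / temperature)
probs = weights / weights.sum()
resampled = np.random.default_rng(0).choice(len(mined_intents), size=5, p=probs)
print(probs.round(3), resampled)  # indices into the mined corpus, with repetition
```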
diff --git a/_publications/xu2021capturing.markdown b/_publications/xu2021capturing.markdown new file mode 100644 index 00000000..db3498ac --- /dev/null +++ b/_publications/xu2021capturing.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Capturing Structural Locality in Non-parametric Language Models" +authors: Frank F. Xu, Junxian He, Graham Neubig, Vincent J. Hellendoorn +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2110.02870"} +tags: ["language model"] +--- +Structural locality is a ubiquitous feature of real-world datasets, wherein data points are organized into local hierarchies. Some examples include topical clusters in text or project hierarchies in source code repositories. In this paper, we explore utilizing this structural locality within non-parametric language models, which generate sequences that reference retrieved examples from an external source. We propose a simple yet effective approach for adding locality information into such models by adding learned parameters that improve the likelihood of retrieving examples from local neighborhoods. Experiments on two different domains, Java source code and Wikipedia text, demonstrate that locality features improve model efficacy over models without access to these features, with interesting differences. We also perform an analysis of how and where locality features contribute to improved performance and why the traditionally used contextual similarity metrics alone are not enough to grasp the locality structure. diff --git a/_publications/xu2022systematic.markdown b/_publications/xu2022systematic.markdown new file mode 100644 index 00000000..fc1885f2 --- /dev/null +++ b/_publications/xu2022systematic.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "A Systematic Evaluation of Large Language Models of Code" +authors: Frank F. Xu, Uri Alon, Graham Neubig, Vincent J. Hellendoorn +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2202.13169"} +tags: ["Transformer", "language model"] +--- +Large language models (LMs) of code have recently shown tremendous promise in completing code and synthesizing code from natural language descriptions. However, the current state-of-the-art code LMs (e.g., Codex (Chen et al., 2021)) are not publicly available, leaving many questions about their model and data design decisions. We aim to fill in some of these blanks through a systematic evaluation of the largest existing models: Codex, GPT-J, GPT-Neo, GPT-NeoX-20B, and CodeParrot, across various programming languages. Although Codex itself is not open-source, we find that existing open-source models do achieve close results in some programming languages, although they are targeted mainly at natural language modeling. We further identify an important missing piece in the form of a large open-source model trained exclusively on a multi-lingual corpus of code. We release a new model, PolyCoder, with 2.7B parameters based on the GPT-2 architecture, which was trained on 249GB of code across 12 programming languages on a single machine. In the C programming language, PolyCoder outperforms all models including Codex. Our trained models are open-source and publicly available, which enables future research and application in this area.
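
Evaluations like the one above typically score causal LMs by perplexity on held-out code. Below is a minimal sketch of that measurement; "gpt2" is only a stand-in checkpoint, and the assumption is that any open-source code LM published in Hugging Face format (e.g., a PolyCoder release) would follow the same pattern.

```python
# A hedged sketch of per-file perplexity measurement for a causal LM.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")          # stand-in checkpoint
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

code = "int add(int a, int b) {\n    return a + b;\n}\n"
ids = tok(code, return_tensors="pt").input_ids
with torch.no_grad():
    # With labels == input_ids, the model returns the mean token cross-entropy.
    loss = model(ids, labels=ids).loss
print("perplexity:", torch.exp(loss).item())
```

Averaging this quantity over a per-language corpus gives the kind of cross-language comparison the study reports.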
diff --git a/_publications/yadavally2023partial.markdown b/_publications/yadavally2023partial.markdown new file mode 100644 index 00000000..46ab23b5 --- /dev/null +++ b/_publications/yadavally2023partial.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "(Partial) Program Dependence Learning" +authors: Aashish Yadavally, Wenbo Wang, Shaohua Wang, Tien N. Nguyen +conference: ICSE +year: 2023 +additional_links: + - {name: "website", url: "https://aashishyadavally.github.io/publication/C5"} + - {name: "code", url: "https://github.com/aashishyadavally/NeuralPDA"} +tags: ["large language models", "program analysis", "static analysis", "tool"] +--- +Code fragments from developer forums often migrate to applications due to the code reuse practice. Owing to the incomplete nature of such programs, analyzing them early to determine the presence of potential vulnerabilities is challenging. In this work, we introduce NeuralPDA, a neural network-based program dependence analysis tool for both complete and partial programs. Our tool efficiently incorporates intra-statement and inter-statement contextual features into statement representations, thereby modeling program dependence analysis as a statement-pair dependence decoding task. In the empirical evaluation, we report that NeuralPDA predicts the CFG and PDG edges in complete Java and C/C++ code with combined F-scores of 94.29% and 92.46%, respectively. The F-score values for partial Java and C/C++ code range from 94.29%–97.17% and 92.46%–96.01%, respectively. We also test the usefulness of the PDGs predicted by NeuralPDA (i.e., PDG*) on the downstream task of method-level vulnerability detection. We discover that the performance of the vulnerability detection tool utilizing PDG* is only 1.1% less than that utilizing the PDGs generated by a program analysis tool. We also report the detection of 14 real-world vulnerable code snippets from StackOverflow by a machine learning-based vulnerability detection tool that employs the PDGs predicted by NeuralPDA for these code snippets. diff --git a/_publications/yadavally2024learning.markdown b/_publications/yadavally2024learning.markdown new file mode 100644 index 00000000..3a46067e --- /dev/null +++ b/_publications/yadavally2024learning.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "A Learning-Based Approach to Static Program Slicing" +authors: Aashish Yadavally, Yi Li, Shaohua Wang, Tien N. Nguyen +conference: OOPSLA +year: 2024 +additional_links: + - {name: "website", url: "https://aashishyadavally.github.io/assets/pdf/pub-oopsla2024.pdf"} + - {name: "code", url: "https://github.com/aashishyadavally/ns-slicer"} +tags: ["large language models", "program analysis", "static analysis", "tool"] +--- +Traditional program slicing techniques are crucial for early bug detection and manual/automated debugging of online code snippets. Nevertheless, their inability to handle incomplete code hinders their real-world applicability in such scenarios. To overcome these challenges, we present NS-Slicer, a novel learning-based approach that predicts static program slices for both complete and partial code. Our tool leverages a pre-trained language model to exploit its understanding of fine-grained variable-statement dependencies within source code. With this knowledge, given a variable at a specific location and a statement in a code snippet, NS-Slicer determines whether the statement belongs to the backward slice or forward slice, respectively.
We conducted a series of experiments to evaluate NS-Slicer’s performance. On complete code, it predicts the backward and forward slices with an F1-score of 97.41% and 95.82%, respectively, while achieving an overall F1-score of 96.77%. Notably, in 85.20% of the cases, the static program slices predicted by NS-Slicer exactly match entire slices from the oracle. For partial programs, it achieved an F1-score of 96.77%–97.49% for backward slicing, 92.14%–95.40% for forward slicing, and an overall F1-score of 94.66%–96.62%. Furthermore, we demonstrate NS-Slicer’s utility in vulnerability detection (VD), integrating its predicted slices into an automated VD tool. In this setup, the tool detected vulnerabilities in Java code with a high F1-score of 73.38%. We also include analyses of NS-Slicer’s performance and limitations, providing insights into its understanding of intrinsic code properties, such as variable aliasing, that lead to better slicing. diff --git a/_publications/yadavally2024predictive.markdown b/_publications/yadavally2024predictive.markdown new file mode 100644 index 00000000..9f8930b1 --- /dev/null +++ b/_publications/yadavally2024predictive.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Predictive Program Slicing via Execution Knowledge-Guided Dynamic Dependence Learning" +authors: Aashish Yadavally, Yi Li, Tien N. Nguyen +conference: FSE +year: 2024 +additional_links: + - {name: "website", url: "https://aashishyadavally.github.io/assets/pdf/pub-fse2024.pdf"} + - {name: "code", url: "https://github.com/aashishyadavally/nd-slicer"} +tags: ["large language models", "program analysis", "dynamic", "tool"] +--- +Program slicing, the process of extracting program statements that influence values at a designated location (known as the slicing criterion), is helpful in both manual and automated debugging. However, such slicing techniques prove ineffective in scenarios where executing specific inputs is prohibitively expensive, or even impossible, as with partial code. In this paper, we introduce ND-Slicer, a predictive slicing methodology that caters to specific executions based on a particular input, overcoming the need for actual execution. We enable such a process by leveraging execution-aware pre-training to learn the dynamic program dependencies, including both dynamic data and control dependencies between variables in the slicing criterion and the remaining program statements. Such knowledge forms the cornerstone for constructing a predictive backward slice. Our empirical evaluation revealed a high accuracy in predicting program slices, achieving an exact-match accuracy of 81.3% and a ROUGE-LCS F1-score of 95.4% on Python programs. As an extrinsic evaluation, we illustrate ND-Slicer’s usefulness in crash detection, where it locates faults with an accuracy of 63.9%. Furthermore, we include an in-depth qualitative evaluation, assessing ND-Slicer’s understanding of branched structures such as if-else blocks and loops, as well as the control flow in inter-procedural calls. diff --git a/_publications/yadid2016extracting.markdown b/_publications/yadid2016extracting.markdown index ce16e05e..5e91e271 100644 --- a/_publications/yadid2016extracting.markdown +++ b/_publications/yadid2016extracting.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Extracting Code from Programming Tutorial Videos" -authors: S. Yadid, E. Yahav +authors: Shir Yadid, Eran Yahav conference: Onward!
year: 2016 -bibkey: yadid2016extracting tags: ["information extraction"] --- The number of programming tutorial videos on the web diff --git a/_publications/yan2020are.markdown b/_publications/yan2020are.markdown new file mode 100644 index 00000000..d8815855 --- /dev/null +++ b/_publications/yan2020are.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Are the Code Snippets What We Are Searching for? A Benchmark and an Empirical Study on Code Search with Natural-Language Queries" +authors: Shuhan Yan, Hang Yu, Yuting Chen, Beijun Shen, Lingxiao Jiang +conference: SANER +year: 2020 +additional_links: + - { name: "IEEE", url: "https://ieeexplore.ieee.org/document/9054840" } +tags: ["search"] +--- + +Code search methods, especially those that allow programmers to raise queries in a natural language, play an important role in software development. They help to improve programmers' productivity by returning sample code snippets from the Internet and/or source-code repositories for their natural-language queries. Meanwhile, there are many code search methods in the literature that support natural-language queries. Difficulties exist in recognizing the strengths and weaknesses of each method and choosing the right one for different usage scenarios, because (1) the implementations of those methods and the datasets for evaluating them are usually not publicly available, and (2) some methods leverage different training datasets or auxiliary data sources and thus their effectiveness cannot be fairly measured and may be negatively affected in practical uses. To build a common ground for measuring code search methods, this paper builds CosBench, a dataset that consists of 1000 projects, 52 code-independent natural-language queries with ground truths, and a set of scripts for calculating four metrics on code search results. We have evaluated four IR (Information Retrieval)-based and two DL (Deep Learning)-based code search methods on CosBench. The empirical evaluation results clearly show the usefulness of the CosBench dataset and various strengths of each code search method. We found that DL-based methods are more suitable for queries on reusing code, and IR-based ones for queries on resolving bugs and learning API uses. diff --git a/_publications/yang2017language.markdown b/_publications/yang2017language.markdown index 78a36aac..fccc44ba 100644 --- a/_publications/yang2017language.markdown +++ b/_publications/yang2017language.markdown @@ -1,10 +1,9 @@ --- layout: publication title: A Language Model for Statements of Software Code -authors: Y. Yang, Y. Jiang, M. Gu, J. Sun, J. Gao, H.
Liu +authors: Yixiao Yang, Yu Jiang, Ming Gu, Jiaguang Sun, Jian Gao, Han Liu conference: ASE year: 2017 -bibkey: yang2017language additional_links: - {name: "ACM", url: "https://dl.acm.org/citation.cfm?id=3155647"} tags: ["language model"] diff --git a/_publications/yang2020survey.markdown b/_publications/yang2020survey.markdown new file mode 100644 index 00000000..bfa17b11 --- /dev/null +++ b/_publications/yang2020survey.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "A Survey on Deep Learning for Software Engineering" +authors: Yanming Yang, Xin Xia, David Lo, John Grundy +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2011.14597"} +tags: ["survey"] +--- +In 2006, Geoffrey Hinton proposed the concept of training “Deep Neural Networks (DNNs)” and an improved model training method to break the bottleneck of neural network development. More recently, the introduction of AlphaGo in 2016 demonstrated the powerful learning ability of deep learning and its enormous potential. Deep learning has been increasingly used to develop state-of-the-art software engineering (SE) research tools due to its ability to boost performance for various SE tasks. There are many factors, e.g., deep learning model selection, internal structure differences, and model optimization techniques, that may have an impact on the performance of DNNs applied in SE. Few works to date focus on summarizing, classifying, and analyzing the application of deep learning techniques in SE. To fill this gap, we performed a survey to analyze the relevant studies published since 2006. We first provide an example to illustrate how deep learning techniques are used in SE. We then summarize and classify different deep learning techniques used in SE. We analyze key optimization technologies used in these deep learning models, and finally describe a range of key research topics using DNNs in SE. Based on our findings, we present a set of current challenges remaining to be investigated and outline a proposed research road map highlighting key opportunities for future work. diff --git a/_publications/yao2018staqc.markdown b/_publications/yao2018staqc.markdown index 5ba778a6..9d6fbc53 100644 --- a/_publications/yao2018staqc.markdown +++ b/_publications/yao2018staqc.markdown @@ -4,7 +4,6 @@ title: "StaQC: A Systematically Mined Question-Code Dataset from Stack Overflow" authors: Ziyu Yao, Daniel S. Weld, Wei-Peng Chen, Huan Sun conference: WWW 2018 year: 2018 -bibkey: yao2018staqc additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1803.09371"} - {name: "code", url: "https://github.com/LittleYUYU/StackOverflow-Question-Code-Dataset"} diff --git a/_publications/yao2019coacor.markdown b/_publications/yao2019coacor.markdown index 858e538d..0a67dfa1 100644 --- a/_publications/yao2019coacor.markdown +++ b/_publications/yao2019coacor.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "CoaCor: Code Annotation for Code Retrieval with Reinforcement Learning" -authors: Z Yao, JR Peddamail, H.
Sun +authors: Ziyu Yao, Jayavardhan Reddy Peddamail, Huan Sun conference: year: 2019 -bibkey: yao2019coacor tags: ["search"] --- To accelerate software development, much research has been performed diff --git a/_publications/yasunaga2020graph.markdown b/_publications/yasunaga2020graph.markdown index f50bc103..4f46a739 100644 --- a/_publications/yasunaga2020graph.markdown +++ b/_publications/yasunaga2020graph.markdown @@ -4,7 +4,6 @@ title: "Graph-based, Self-Supervised Program Repair from Diagnostic Feedback" authors: Michihiro Yasunaga, Percy Liang conference: year: 2020 -bibkey: yasunaga2020graph additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2005.10636"} tags: ["repair", "edit", "GNN"] diff --git a/_publications/ye2020leveraging.markdown b/_publications/ye2020leveraging.markdown new file mode 100644 index 00000000..d74a7bd3 --- /dev/null +++ b/_publications/ye2020leveraging.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Leveraging Code Generation to Improve Code Retrieval and Summarization via Dual Learning" +authors: Wei Ye, Rui Xie, Jinglei Zhang, Tianxiang Hu, Xiaoyin Wang, Shikun Zhang +conference: WWW +year: 2020 +additional_links: + - { name: "ArXiV", url: "https://arxiv.org/abs/2002.10198" } +tags: ["search", "summarization"] +--- + +Code summarization generates a brief natural language description given a source code snippet, while code retrieval fetches relevant source code given a natural language query. Since both tasks aim to model the association between natural language and programming language, recent studies have combined these two tasks to improve their performance. However, researchers have not yet been able to effectively leverage the intrinsic connection between the two tasks as they train these tasks in a separate or pipeline manner, which means their performance cannot be well balanced. In this paper, we propose a novel end-to-end model for the two tasks by introducing an additional code generation task. More specifically, we explicitly exploit the probabilistic correlation between code summarization and code generation with dual learning, and utilize the two encoders for code summarization and code generation to train the code retrieval task via multi-task learning. We have carried out extensive experiments on an existing dataset of SQL and Python, and results show that our model can significantly improve the results of the code retrieval task over state-of-the-art models, as well as achieve competitive performance in terms of BLEU score for the code summarization task. diff --git a/_publications/ye2020misim.markdown b/_publications/ye2020misim.markdown new file mode 100644 index 00000000..4bd0a8c3 --- /dev/null +++ b/_publications/ye2020misim.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "MISIM: An End-to-End Neural Code Similarity System" +authors: Fangke Ye, Shengtian Zhou, Anand Venkat, Ryan Marcus, Nesime Tatbul, Jesmin Jahan Tithi, Paul Petersen, Timothy Mattson, Tim Kraska, Pradeep Dubey, Vivek Sarkar, Justin Gottschlich +conference: +year: 2020 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2006.05265"} +tags: ["code similarity"] +--- +Code similarity systems are integral to a range of applications from code recommendation to automated construction of software tests and defect mitigation. In this paper, we present Machine Inferred Code Similarity (MISIM), a novel end-to-end code similarity system that consists of two core components.
First, MISIM uses a novel context-aware similarity structure, which is designed to aid in lifting semantic meaning from code syntax. Second, MISIM provides a neural-based code similarity scoring system, which can be implemented with various neural network algorithms and topologies with learned parameters. We compare MISIM to three other state-of-the-art code similarity systems: (i) code2vec, (ii) Neural Code Comprehension, and (iii) Aroma. In our experimental evaluation across 45,780 programs, MISIM consistently outperformed all three systems, often by a large factor (upwards of 40.6x). diff --git a/_publications/ye2021neural.markdown b/_publications/ye2021neural.markdown new file mode 100644 index 00000000..71bceb57 --- /dev/null +++ b/_publications/ye2021neural.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Neural Program Repair with Execution-based Backpropagation" +authors: He Ye, Matias Martinez, Martin Monperrus +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2105.04123"} +tags: ["repair"] +--- +Neural machine translation (NMT) architectures have achieved promising results for automatic program repair. Yet, they have the limitation of generating low-quality patches (e.g., patches that do not compile). This is because the existing works only optimize a purely syntactic loss function based on characters and tokens without incorporating program-specific information during neural net weight optimization. In this paper, we propose a novel program repair model called RewardRepair. The core novelty of RewardRepair is to improve NMT-based program repair with a loss function based on program compilation and test execution information, rewarding the network to produce patches that compile and that do not overfit. We conduct several experiments to evaluate RewardRepair showing that it is feasible and effective to use compilation and test execution results to optimize the underlying neural repair model. In total, RewardRepair correctly repairs 43 Defects4J bugs, including eight that are fixed for the first time. diff --git a/_publications/ye2022selfapr.markdown b/_publications/ye2022selfapr.markdown new file mode 100644 index 00000000..65b9d363 --- /dev/null +++ b/_publications/ye2022selfapr.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "SelfAPR: Self-supervised Program Repair with Test Execution Diagnostics" +authors: He Ye, Matias Martinez, Xiapu Luo, Tao Zhang, Martin Monperrus +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2203.12755"} +tags: ["repair", "execution"] +--- +Neural program repair has achieved good results in a recent series of papers. Yet, we observe that the related work fails to repair some bugs because of a lack of knowledge about 1) the program being repaired, and 2) the actual fault being repaired. In this paper, we solve both problems by changing the learning paradigm from supervised training to self-supervised training in an approach called SelfAPR. First, SelfAPR generates and constructs training samples by perturbing a previous version of the program being repaired, enforcing the neural model to capture project-specific knowledge. This is different from all the existing work based on past commits. Second, SelfAPR extracts and encodes test execution diagnostics into the input representation, steering the neural model to fix the specific kind of fault.
This is different from the existing studies that only consider static source code in the input. We implement SelfAPR and evaluate it in a systematic manner. We train SelfAPR with 253,411 training samples obtained by perturbing 17 open-source projects. We evaluate SelfAPR on 818 bugs from Defects4J, and SelfAPR correctly repairs 112 of them. diff --git a/_publications/yefet2019adversarial.markdown b/_publications/yefet2019adversarial.markdown index d2381371..02b76b43 100644 --- a/_publications/yefet2019adversarial.markdown +++ b/_publications/yefet2019adversarial.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Adversarial Examples for Models of Code" -authors: N. Yefet, U. Alon, E. Yahav +authors: Noam Yefet, Uri Alon, Eran Yahav conference: year: 2019 -bibkey: yefet2019adversarial additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1910.07517"} tags: ["adversarial"] diff --git a/_publications/yin2017syntactic.markdown b/_publications/yin2017syntactic.markdown index 108041a3..436f0926 100644 --- a/_publications/yin2017syntactic.markdown +++ b/_publications/yin2017syntactic.markdown @@ -1,11 +1,10 @@ --- layout: publication title: "A Syntactic Neural Model for General-Purpose Code Generation" -authors: P. Yin, G. Neubig +authors: Pengcheng Yin, Graham Neubig conference: ACL year: 2017 -bibkey: yin2017syntactic -tags: ["generation", "AST", "bimodal"] +tags: ["code generation", "grammar", "bimodal"] --- We consider the problem of parsing natural language descriptions into source code written in a general-purpose programming diff --git a/_publications/yin2018mining.markdown b/_publications/yin2018mining.markdown index b0b465f1..1c6e9513 100644 --- a/_publications/yin2018mining.markdown +++ b/_publications/yin2018mining.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning to Mine Aligned Code and Natural Language Pairs from Stack Overflow" -authors: P. Yin, B. Deng, E. Chen, B. Vasilescu, G. Neubig +authors: Pengcheng Yin, Bowen Deng, Edgar Chen, Bogdan Vasilescu, Graham Neubig conference: MSR year: 2018 -bibkey: yin2018mining additional_links: - {name: "data", url: "https://conala-corpus.github.io/"} tags: ["dataset"] diff --git a/_publications/yin2019learning.markdown b/_publications/yin2019learning.markdown index 9d89efbb..ddaa290e 100644 --- a/_publications/yin2019learning.markdown +++ b/_publications/yin2019learning.markdown @@ -1,11 +1,11 @@ --- layout: publication title: "Learning to Represent Edits" -authors: P. Yin, G. Neubig, M. Allamanis, M. Brockschmidt, A. L. Gaunt +authors: Pengcheng Yin, Graham Neubig, Miltiadis Allamanis, Marc Brockschmidt, Alexander L.
Gaunt conference: ICLR year: 2019 -bibkey: yin2019learning additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1810.13337"} - {name: "data extraction", url: "https://github.com/Microsoft/msrc-dpu-learning-to-represent-edits"} - {name: "code edit data", url: "http://www.cs.cmu.edu/~pengchey/githubedits.zip"} tags: ["edit"] diff --git a/_publications/yin2022natural.markdown b/_publications/yin2022natural.markdown new file mode 100644 index 00000000..da39d6cf --- /dev/null +++ b/_publications/yin2022natural.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Natural Language to Code Generation in Interactive Data Science Notebooks" +authors: Pengcheng Yin, Wen-Ding Li, Kefan Xiao, Abhishek Rao, Yeming Wen, Kensen Shi, Joshua Howland, Paige Bailey, Michele Catasta, Henryk Michalewski, Alex Polozov, Charles Sutton +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2212.09248"} +tags: ["notebook", "evaluation"] +--- +Computational notebooks, such as Jupyter notebooks, are interactive computing environments that are ubiquitous among data scientists to perform data wrangling and analytic tasks. To measure the performance of AI pair programmers that automatically synthesize programs for those tasks given natural language (NL) intents from users, we build ARCADE, a benchmark of 1082 code generation problems using the pandas data analysis framework in data science notebooks. ARCADE features multiple rounds of NL-to-code problems from the same notebook. It requires a model to understand rich multi-modal contexts, such as existing notebook cells and their execution states as well as previous turns of interaction. To establish a strong baseline on this challenging task, we develop PaChiNCo, a 62B code language model (LM) for Python computational notebooks, which significantly outperforms public code LMs. Finally, we explore few-shot prompting strategies to elicit better code with step-by-step decomposition and NL explanation, showing the potential to improve the diversity and explainability of model predictions. diff --git a/_publications/yonai2019mercem.markdown b/_publications/yonai2019mercem.markdown index 2a1c9984..005ede5e 100644 --- a/_publications/yonai2019mercem.markdown +++ b/_publications/yonai2019mercem.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Mercem: Method Name Recommendation Based on Call Graph Embedding" -authors: H. Yonai, Y. Hayase, H. Kitagawa +authors: Hiroshi Yonai, Yasuhiro Hayase, Hiroyuki Kitagawa conference: year: 2019 -bibkey: yonai2019mercem additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1907.05690"} tags: ["naming", "representation", "refactoring"] diff --git a/_publications/yuan2017abridging.markdown b/_publications/yuan2017abridging.markdown index 0cbda96b..0420e19b 100644 --- a/_publications/yuan2017abridging.markdown +++ b/_publications/yuan2017abridging.markdown @@ -1,10 +1,9 @@ --- layout: publication title: Abridging Source Code -authors: B. Yuan, V. Murali, C. 
Jermain +authors: Binhang Yuan, Vijayaraghavan Murali, Christopher Jermaine conference: OOPSLA year: 2017 -bibkey: yuan2017abridging additional_links: - {name: "ACM", url: "https://dl.acm.org/citation.cfm?id=3133882"} tags: ["summarization"] --- diff --git a/_publications/zaremba2014learning.markdown b/_publications/zaremba2014learning.markdown index 9d2f4961..a697ced4 100644 --- a/_publications/zaremba2014learning.markdown +++ b/_publications/zaremba2014learning.markdown @@ -1,10 +1,11 @@ --- layout: publication title: "Learning to Execute" -authors: W. Zaremba, I. Sutskever -conference: ArXiV 1410.4615 +authors: Wojciech Zaremba, Ilya Sutskever +conference: +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/1410.4615"} year: 2014 -bibkey: zaremba2014learning -tags: ["representation"] +tags: ["execution", "representation"] --- Recurrent Neural Networks (RNNs) with Long Short-Term Memory units (LSTM) are widely used because they are expressive and are easy to train. Our interest lies in empirically evaluating the expressiveness and the learnability of LSTMs in the sequence-to-sequence regime by training them to evaluate short computer programs, a domain that has traditionally been seen as too complex for neural networks. We consider a simple class of programs that can be evaluated with a single left-to-right pass using constant memory. Our main result is that LSTMs can learn to map the character-level representations of such programs to their correct outputs. Notably, it was necessary to use curriculum learning, and while conventional curriculum learning proved ineffective, we developed a new variant of curriculum learning that improved our networks' performance in all experimental conditions. The improved curriculum had a dramatic impact on an addition problem, making it possible to train an LSTM to add two 9-digit numbers with 99% accuracy. diff --git a/_publications/zeng2022extensive.markdown b/_publications/zeng2022extensive.markdown new file mode 100644 index 00000000..f9418aa2 --- /dev/null +++ b/_publications/zeng2022extensive.markdown @@ -0,0 +1,38 @@ +--- +layout: publication +title: "An Extensive Study on Pre-trained Models for Program Understanding and Generation" +authors: Zhengran Zeng, Hanzhuo Tan, Haotian Zhang, Jing Li, Yuqun Zhang, Lingming Zhang +conference: ISSTA +year: 2022 +additional_links: + - {name: "Author Version", url: "http://lingming.cs.illinois.edu/publications/issta2022.pdf"} +tags: ["Transformer", "evaluation"] +--- +Automatic program understanding and generation techniques could +significantly advance the productivity of programmers and have +been widely studied by academia and industry. Recently, the advent of the pre-trained paradigm enlightens researchers to develop +general-purpose pre-trained models which can be applied for a +broad range of program understanding and generation tasks. Such +pre-trained models, derived by self-supervised objectives on large +unlabelled corpora, can be fine-tuned in downstream tasks (such +as code search and code generation) with minimal adaptations. Although these pre-trained models claim superiority over the prior +techniques, they seldom follow equivalent evaluation protocols, e.g., +they are hardly evaluated on the identical benchmarks, tasks, or settings.
Consequently, there is a pressing need for a comprehensive +study of the pre-trained models on their effectiveness, versatility +as well as the limitations to provide implications and guidance for +the future development in this area. To this end, we first perform +an extensive study of eight open-access pre-trained models over +a large benchmark on seven representative code tasks to assess +their reproducibility. We further compare the pre-trained models +and domain-specific state-of-the-art techniques for validating pre-training effectiveness. At last, we investigate the robustness of the +pre-trained models by inspecting their performance variations under adversarial attacks. Through the study, we find that while we +can in general replicate the original performance of the pre-trained +models on their evaluated tasks and adopted benchmarks, subtle +performance fluctuations can refute the findings in their original +papers. Moreover, none of the existing pre-trained models can dominate over all other models. We also find that the pre-trained models +can significantly outperform non-pre-trained state-of-the-art techniques in program understanding tasks. Furthermore, we perform +the first study for natural language-programming language pre-trained model robustness via adversarial attacks and find that a +simple random attack approach can easily fool the state-of-the-art +pre-trained models and thus incur security issues. Finally, we also +provide multiple practical guidelines for advancing future research +on pre-trained models for program understanding and generation. diff --git a/_publications/zhang2019learning.markdown b/_publications/zhang2019learning.markdown index b428a24b..20b46dd2 100644 --- a/_publications/zhang2019learning.markdown +++ b/_publications/zhang2019learning.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Learning Uniform Semantic Features for Natural Language and Programming Language Globally, Locally and Sequentially" -authors: Y. Zhang, W. Zheng, M. Li +authors: Yudong Zhang, Wenhao Zheng, Ming Li conference: AAAI year: 2019 -bibkey: zhang2019learning tags: ["representation", "bimodal"] --- Semantic feature learning for natural language and programming language is a preliminary step in addressing many software mining tasks. Many existing methods leverage diff --git a/_publications/zhang2019novel.markdown b/_publications/zhang2019novel.markdown index b6f4924e..e4ae7613 100644 --- a/_publications/zhang2019novel.markdown +++ b/_publications/zhang2019novel.markdown @@ -1,12 +1,11 @@ --- layout: publication title: "A Novel Neural Source Code Representation based on Abstract Syntax Tree" -authors: J. Zhang, X. Wang, H. Zhang, H Sun, K. Wang, X. Liu +authors: Jian Zhang, Xu Wang, Hongyu Zhang, Hailong Sun, Kaixuan Wang, Xudong Liu conference: ICSE year: 2019 -bibkey: zhang2019novel additional_links: - {name: "PDF", url: "http://xuwang.tech/paper/astnn_icse2019.pdf"} -tags: ["representation", "AST"] +tags: ["representation", "grammar"] --- Exploiting machine learning techniques for analyzing programs has attracted much attention. One key problem is how to represent code fragments well for follow-up analysis. Traditional information retrieval based methods often treat programs as natural language texts, which could miss important semantic information of source code. Recently, state-of-the-art studies demonstrate that abstract syntax tree (AST) based neural models can better represent source code.
However, the sizes of ASTs are usually large and the existing models are prone to the long-term dependency problem. In this paper, we propose a novel AST-based Neural Network (ASTNN) for source code representation. Unlike existing models that work on entire ASTs, ASTNN splits each large AST into a sequence of small statement trees, and encodes the statement trees to vectors by capturing the lexical and syntactical knowledge of statements. Based on the sequence of statement vectors, a bidirectional RNN model is used to leverage the naturalness of statements and finally produce the vector representation of a code fragment. We have applied our neural network based source code representation method to two common program comprehension tasks: source code classification and code clone detection. Experimental results on the two tasks indicate that our model is superior to state-of-the-art approaches. diff --git a/_publications/zhang2020generating.markdown b/_publications/zhang2020generating.markdown index 2da15e35..ae9a1ea9 100644 --- a/_publications/zhang2020generating.markdown +++ b/_publications/zhang2020generating.markdown @@ -1,12 +1,11 @@ --- layout: publication title: "Generating Adversarial Examples for Holding Robustness of Source Code Processing Models" -authors: H. Zhang, Z. Li, G. Li, L. Ma, Y. Liu, Z. Jin +authors: Huangzhao Zhang, Zhuo Li, Ge Li, Lei Ma, Yang Liu, Zhi Jin conference: AAAI year: 2020 -bibkey: zhang2020generating additional_links: - - {name: "Proceedings", url: "https://www.aaai.org/Papers/AAAI/2020GB/AAAI-ZhangH.6730.pdf"} + - {name: "Proceedings", url: "https://ojs.aaai.org/index.php/AAAI/article/view/5469"} tags: ["adversarial"] --- Automated processing, analysis, and generation of source code are among the key activities diff --git a/_publications/zhang2021bag.markdown b/_publications/zhang2021bag.markdown new file mode 100644 index 00000000..2578e786 --- /dev/null +++ b/_publications/zhang2021bag.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Bag-of-Words Baselines for Semantic Code Search" +authors: Xinyu Zhang, Ji Xin, Andrew Yates, Jimmy Lin +conference: NLP4Prog +year: 2021 +additional_links: + - {name: "PDF", url: "https://aclanthology.org/2021.nlp4prog-1.10.pdf"} +tags: ["search"] +--- +The task of semantic code search is to retrieve code snippets from a source code corpus based on an information need expressed in natural language. The semantic gap between natural language and programming languages has long been regarded as one of the most significant obstacles to the effectiveness of keyword-based information retrieval (IR) methods. It is a common assumption that “traditional” bag-of-words IR methods are poorly suited for semantic code search: our work empirically investigates this assumption. Specifically, we examine the effectiveness of two traditional IR methods, namely BM25 and RM3, on the CodeSearchNet Corpus, which consists of natural language queries paired with relevant code snippets. We find that the two keyword-based methods outperform several pre-BERT neural models. We also compare several code-specific data pre-processing strategies and find that specialized tokenization improves effectiveness.
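
The bag-of-words baseline above is easy to reproduce in miniature. The sketch below is an illustration, not the paper's implementation: standard BM25 scoring combined with code-aware tokenization that splits camelCase and snake_case identifiers, the kind of specialized tokenization the paper finds helpful. The toy corpus stands in for CodeSearchNet.

```python
# A self-contained BM25 sketch with identifier-splitting tokenization.
import math
import re
from collections import Counter

def tokenize(code: str) -> list[str]:
    words = re.findall(r"[A-Za-z]+", code)  # also splits snake_case at underscores
    subtokens = []
    for w in words:
        subtokens += re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?![a-z])", w)  # camelCase split
    return [t.lower() for t in subtokens]

def bm25_scores(query: str, docs: list[str], k1: float = 1.2, b: float = 0.75) -> list[float]:
    doc_tokens = [tokenize(d) for d in docs]
    avgdl = sum(map(len, doc_tokens)) / len(doc_tokens)
    df = Counter(t for toks in doc_tokens for t in set(toks))  # document frequencies
    n = len(docs)
    scores = []
    for toks in doc_tokens:
        tf = Counter(toks)
        s = 0.0
        for t in tokenize(query):
            if t not in tf:
                continue
            idf = math.log(1 + (n - df[t] + 0.5) / (df[t] + 0.5))
            s += idf * tf[t] * (k1 + 1) / (tf[t] + k1 * (1 - b + b * len(toks) / avgdl))
        scores.append(s)
    return scores

docs = ["def readJsonFile(path): ...", "def sort_by_value(d): ...", "class HttpClient: ..."]
print(bm25_scores("read json file", docs))  # the first snippet should score highest
```

Without the identifier split, the query term "json" would never match the token "readJsonFile", which is exactly why tokenization matters so much for this baseline.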
diff --git a/_publications/zhang2021disentangled.md b/_publications/zhang2021disentangled.md new file mode 100644 index 00000000..318484b6 --- /dev/null +++ b/_publications/zhang2021disentangled.md @@ -0,0 +1,11 @@ +--- +layout: publication +title: Disentangled Code Representation Learning for Multiple Programming Languages +authors: Jingfeng Zhang, Haiwen Hong, Yin Zhang, Yao Wan, Ye Liu, Yulei Sui +conference: ACL +year: 2021 +additional_links: + - {name: "Proceedings", url: "https://aclanthology.org/2021.findings-acl.391/"} +tags: ["representation"] +--- +Developing effective distributed representations of source code is fundamental yet challenging for many software engineering tasks such as code clone detection, code search, code translation and transformation. However, current code embedding approaches that represent the semantic and syntax of code in a mixed way are less interpretable and the resulting embedding can not be easily generalized across programming languages. In this paper, we propose a disentangled code representation learning approach to separate the semantic from the syntax of source code under a multi-programming-language setting, obtaining better interpretability and generalizability. Specifically, we design three losses dedicated to the characteristics of source code to enforce the disentanglement effectively. We conduct comprehensive experiments on a real-world dataset composed of programming exercises implemented by multiple solutions that are semantically identical but grammatically distinct. The experimental results validate the superiority of our proposed disentangled code representation, compared to several baselines, across three types of downstream tasks, i.e., code clone detection, code translation, and code-to-code search. \ No newline at end of file diff --git a/_publications/zhang2022coditt5.markdown b/_publications/zhang2022coditt5.markdown new file mode 100644 index 00000000..99e60ac7 --- /dev/null +++ b/_publications/zhang2022coditt5.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "CoditT5: Pretraining for Source Code and Natural Language Editing" +authors: Jiyang Zhang, Sheena Panthaplackel, Pengyu Nie, Junyi Jessy Li, Milos Gligoric +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2208.05446"} +tags: ["Transformer", "edit"] +--- +Pretrained language models have been shown to be effective in many software-related generation tasks; however, they are not well-suited for editing tasks as they are not designed to reason about edits. To address this, we propose a novel pretraining objective which explicitly models edits and use it to build CoditT5, a large language model for software-related editing tasks that is pretrained on large amounts of source code and natural language comments. We fine-tune it on various downstream editing tasks, including comment updating, bug fixing, and automated code review. By outperforming pure generation-based models, we demonstrate the generalizability of our approach and its suitability for editing tasks. We also show how a pure generation model and our edit-based model can complement one another through simple reranking strategies, with which we achieve state-of-the-art performance for the three downstream editing tasks.
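
The "simple reranking" the abstract above refers to can be sketched generically. This is a hedged illustration under assumed inputs, not the paper's recipe: candidates are re-scored by interpolating length-normalized log-probabilities from a pure generation model and an edit-based model such as CoditT5; the scores, lengths, and interpolation weight below are all made up for the example.

```python
# A minimal sketch of two-model reranking over generated edit candidates.
def rerank(candidates, gen_logprobs, edit_logprobs, lengths, alpha=0.5):
    """Sort candidate indices by an interpolation of per-token model scores."""
    def score(i):
        gen = gen_logprobs[i] / lengths[i]    # length-normalize so long edits are not penalized
        edit = edit_logprobs[i] / lengths[i]
        return alpha * gen + (1 - alpha) * edit
    return sorted(range(len(candidates)), key=score, reverse=True)

cands = ["fix: check for null before dereference", "update comment", "remove dead code"]
order = rerank(cands, gen_logprobs=[-12.0, -9.5, -15.2],
               edit_logprobs=[-10.1, -11.8, -13.0], lengths=[7, 3, 4])
print([cands[i] for i in order])
```

The appeal of this design is that each model compensates for the other's blind spots: the generation model favors fluent outputs, while the edit model favors outputs consistent with the observed change.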
diff --git a/_publications/zhang2023repocoder.markdown b/_publications/zhang2023repocoder.markdown new file mode 100644 index 00000000..5de5ff42 --- /dev/null +++ b/_publications/zhang2023repocoder.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation" +authors: Fengji Zhang, Bei Chen, Yue Zhang, Jin Liu, Daoguang Zan, Yi Mao, Jian-Guang Lou, Weizhu Chen +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2303.12570"} + - {name: "Code", url: "https://github.com/microsoft/CodeT/tree/main/RepoCoder"} +tags: ["completion", "Transformer", "retrieval"] +--- +The task of repository-level code completion is to continue writing the unfinished code based on a broader context of the repository. However, it is difficult for automated code completion tools to utilize the useful information scattered across different files. We propose RepoCoder, a simple, generic, and effective framework to address the challenge. It streamlines the repository-level code completion process by incorporating a similarity-based retriever and a pre-trained code language model, which allows for the effective utilization of repository-level information for code completion and grants the ability to generate code at various levels of granularity. Furthermore, RepoCoder utilizes a novel iterative retrieval-generation paradigm that bridges the gap between retrieval context and the intended completion target. We also propose a new benchmark RepoEval, which consists of the latest and high-quality real-world repositories covering line, API invocation, and function body completion scenarios. We test the performance of RepoCoder by using various combinations of code retrievers and generators. Experimental results indicate that RepoCoder significantly improves the zero-shot code completion baseline by over 10% in all settings and consistently outperforms the vanilla retrieval-augmented code completion approach. Furthermore, we validate the effectiveness of RepoCoder through comprehensive analysis, providing valuable insights for future research. diff --git a/_publications/zhao2018neural.markdown b/_publications/zhao2018neural.markdown index b8e42d65..91e84a63 100644 --- a/_publications/zhao2018neural.markdown +++ b/_publications/zhao2018neural.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Neural-Augmented Static Analysis of Android Communication" -authors: J. Zhao, A. Albarghouthi, V. Rastogi, S. Jha, D. Octeau +authors: Jinman Zhao, Aws Albarghouthi, Vaibhav Rastogi, Somesh Jha, Damien Octeau conference: FSE year: 2018 -bibkey: zhao2018neural additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/1809.04059"} tags: ["program analysis"] diff --git a/_publications/zhao2019neural.markdown b/_publications/zhao2019neural.markdown index 258d4cee..36c8bea9 100644 --- a/_publications/zhao2019neural.markdown +++ b/_publications/zhao2019neural.markdown @@ -1,10 +1,9 @@ --- layout: publication title: "Neural Networks for Modeling Source Code Edits" -authors: R. Zhao, D. Bieber, K. Swersky, D.
Tarlow +authors: Rui Zhao, David Bieber, Kevin Swersky, Daniel Tarlow conference: year: 2019 -bibkey: zhao2019neural additional_links: - {name: "OpenReview", url: "https://openreview.net/forum?id=Sklr9i09KQ"} - {name: "ArXiV", url: "https://arxiv.org/abs/1904.02818"} diff --git a/_publications/zhong2018generating.markdown index 80d09ed9..e4df8893 100644 --- a/_publications/zhong2018generating.markdown +++ b/_publications/zhong2018generating.markdown @@ -1,13 +1,12 @@ --- layout: publication title: "Generating Regular Expressions from Natural Language Specifications: Are We There Yet?" -authors: Z. Zhong, J. Guo, W. Yang, T. Xie, JG Lou, Y. Liu, D. Zhang +authors: Zexuan Zhong, Jiaqi Guo, Wei Yang, Tao Xie, Jian-Guang Lou, Ting Liu, Dongmei Zhang conference: NLSE year: 2018 -bibkey: zhong2018generating additional_links: - {name: "PDF", url: "http://taoxie.cs.illinois.edu/publications/nl4se18-regex.pdf"} -tags: ["bimodal", "generation"] +tags: ["bimodal", "code generation"] --- Recent state-of-the-art approaches automatically generate regular expressions from natural language specifications. diff --git a/_publications/zhong2020semantic.markdown index 49830699..4a260f1d 100644 --- a/_publications/zhong2020semantic.markdown +++ b/_publications/zhong2020semantic.markdown @@ -4,9 +4,8 @@ title: "Semantic Scaffolds for Pseudocode-to-Code Generation" authors: Ruiqi Zhong, Mitchell Stern, Dan Klein conference: year: 2020 -bibkey: zhong2020semantic additional_links: - {name: "ArXiV", url: "https://arxiv.org/abs/2005.05927"} -tags: ["generation", "synthesis"] +tags: ["code generation", "synthesis"] --- We propose a method for program generation based on semantic scaffolds, lightweight structures representing the high-level semantic and syntactic composition of a program. By first searching over plausible scaffolds then using these as constraints for a beam search over programs, we achieve better coverage of the search space when compared with existing techniques. We apply our hierarchical search method to the SPoC dataset for pseudocode-to-code generation, in which we are given line-level natural language pseudocode annotations and aim to produce a program satisfying execution-based test cases. By using semantic scaffolds during inference, we achieve a 10% absolute improvement in top-100 accuracy over the previous state-of-the-art. Additionally, we require only 11 candidates to reach the top-3000 performance of the previous best approach when tested against unseen problems, demonstrating a substantial improvement in efficiency. diff --git a/_publications/zhou2019devign.markdown b/_publications/zhou2019devign.markdown new file mode 100644 index 00000000..88c2af98 --- /dev/null +++ b/_publications/zhou2019devign.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Devign: Effective Vulnerability Identification by Learning Comprehensive Program Semantics via Graph Neural Networks" +authors: Yaqin Zhou, Shangqing Liu, Jingkai Siow, Xiaoning Du, Yang Liu +conference: NeurIPS +year: 2019 +additional_links: + - {name: "Paper", url: "http://papers.nips.cc/paper/9209-devign-effective-vulnerability-identification-by-learning-comprehensive-program-semantics-via-graph-neural-networks"} +tags: ["GNN", "static analysis"] +--- +Vulnerability identification is crucial to protect the software systems from attacks for cyber security.
It is especially important to localize the vulnerable functions among the source code to facilitate the fix. However, it is a challenging and tedious process, and also requires specialized security expertise. Inspired by the work on manually-defined patterns of vulnerabilities from various code representation graphs and the recent advance on graph neural networks, we propose Devign, a general graph neural network based model for graph-level classification through learning on a rich set of code semantic representations. It includes a novel Conv module to efficiently extract useful features in the learned rich node representations for graph-level classification. The model is trained over manually labeled datasets built on 4 diversified large-scale open-source C projects that incorporate high complexity and variety of real source code instead of synthesis code used in previous works. The results of the extensive evaluation on the datasets demonstrate that Devign outperforms the state of the arts significantly with an average of 10.51% higher accuracy and 8.68% F1 score, increases averagely 4.66% accuracy and 6.37% F1 by the Conv module. diff --git a/_publications/zhou2021improving.markdown b/_publications/zhou2021improving.markdown new file mode 100644 index 00000000..1930b938 --- /dev/null +++ b/_publications/zhou2021improving.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Improving Code Autocompletion with Transfer Learning" +authors: Wen Zhou, Seohyun Kim, Vijayaraghavan Murali, Gareth Ari Aye +conference: +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2105.05991"} +tags: ["autocomplete", "Transformer"] +--- +Software language models have achieved promising results predicting code completion usages, and several industry studies have described successful IDE integrations. Recently, accuracy in autocompletion prediction improved 12.8% from training on a real-world dataset collected from programmers' IDE activity. But what if limited examples of IDE autocompletion in the target programming language are available for model training? In this paper, we investigate the efficacy of pretraining autocompletion models on non-IDE, non-autocompletion, and different-language example code sequences. We find that these unsupervised pretrainings improve model accuracy by over 50% on very small fine-tuning datasets and over 10% on 50k labeled examples. We confirm the real-world impact of these pretrainings in an online setting through A/B testing on thousands of IDE autocompletion users, finding that pretraining is responsible for increases of up to 6.63% autocompletion usage. diff --git a/_publications/zhou2022codebertscore.markdown b/_publications/zhou2022codebertscore.markdown new file mode 100644 index 00000000..86ea2486 --- /dev/null +++ b/_publications/zhou2022codebertscore.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "CodeBERTScore: Evaluating Code Generation with Pretrained Models of Code" +authors: Shuyan Zhou, Uri Alon, Sumit Agarwal, Graham Neubig +conference: +year: 2023 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2302.05527"} + - {name: "Code", url: "https://github.com/neulab/code-bert-score"} +tags: ["evaluation", "Transformer"] +--- +Since the rise of neural models of code that can generate long expressions and statements rather than a single next-token, one of the major problems has been reliably evaluating their generated output. 
In this paper, we propose CodeBERTScore: an automatic evaluation metric for code generation, which builds on BERTScore (Zhang et al., 2020). Instead of measuring exact token matching as BLEU, CodeBERTScore computes a soft similarity score between each token in the generated code and in the reference code, using the contextual encodings of large pretrained models. Further, instead of encoding only the generated tokens as in BERTScore, CodeBERTScore also encodes the programmatic context surrounding the generated code. We perform an extensive evaluation of CodeBERTScore across four programming languages. We find that CodeBERTScore achieves a higher correlation with human preference and with functional correctness than all existing metrics. That is, generated code that receives a higher score by CodeBERTScore is more likely to be preferred by humans, as well as to function correctly when executed. Finally, while CodeBERTScore can be used with a multilingual CodeBERT as its base model, we release five language-specific pretrained models to use with our publicly available code at https://github.com/neulab/code-bert-score . Our language-specific models have been downloaded more than 25,000 times from the Huggingface Hub. diff --git a/_publications/zhou2022docoder.markdown b/_publications/zhou2022docoder.markdown new file mode 100644 index 00000000..8e23e65b --- /dev/null +++ b/_publications/zhou2022docoder.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "DocCoder: Generating Code by Retrieving and Reading Docs" +authors: Shuyan Zhou, Uri Alon, Frank F. Xu, Zhengbao Jiang, Graham Neubig +conference: +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2207.05987"} + - {name: "Code and Data", url: "https://github.com/shuyanzhou/doccoder"} +tags: ["Transformer", "search", "code generation"] +--- +Natural-language-to-code models learn to generate a code snippet given a natural language (NL) intent. However, the rapid growth of both publicly available and proprietary libraries and functions makes it impossible to cover all APIs using training examples, as new libraries and functions are introduced daily. Thus, existing models inherently cannot generalize to using unseen functions and libraries merely through incorporating them into the training data. In contrast, when human programmers write programs, they frequently refer to textual resources such as code manuals, documentation, and tutorials, to explore and understand available library functionality. Inspired by this observation, we introduce DocCoder: an approach that explicitly leverages code manuals and documentation by (1) retrieving the relevant documentation given the NL intent, and (2) generating the code based on the NL intent and the retrieved documentation. Our approach is general, can be applied to any programming language, and is agnostic to the underlying neural model. We demonstrate that DocCoder consistently improves NL-to-code models: DocCoder achieves 11x higher exact match accuracy than strong baselines on a new Bash dataset tldr; on the popular Python CoNaLa benchmark, DocCoder improves over strong baselines by 1.65 BLEU.
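The CodeBERTScore abstract above hinges on soft, embedding-based token matching rather than exact n-gram overlap. As a reading aid only — not the authors' implementation, and with `soft_match_f1` being a hypothetical name — a minimal BERTScore-style sketch, assuming `torch` is available:

```python
import torch
import torch.nn.functional as F

def soft_match_f1(cand_emb, ref_emb):
    # cand_emb: [m, d], ref_emb: [n, d] -- L2-normalized contextual token embeddings
    sim = cand_emb @ ref_emb.T                 # pairwise cosine similarities, shape [m, n]
    precision = sim.max(dim=1).values.mean()   # best reference match per generated token
    recall = sim.max(dim=0).values.mean()      # best generated match per reference token
    return 2 * precision * recall / (precision + recall)

cand = F.normalize(torch.randn(5, 8), dim=1)   # 5 generated tokens (toy embeddings)
ref = F.normalize(torch.randn(7, 8), dim=1)    # 7 reference tokens (toy embeddings)
print(float(soft_match_f1(cand, ref)))
```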
diff --git a/_publications/zhu2020ocor.markdown b/_publications/zhu2020ocor.markdown new file mode 100644 index 00000000..754c9ebd --- /dev/null +++ b/_publications/zhu2020ocor.markdown @@ -0,0 +1,13 @@ +--- +layout: publication +title: "OCoR: An Overlapping-Aware Code Retriever" +authors: Qihao Zhu, Zeyu Sun, Xiran Liang, Yingfei Xiong, Lu Zhang +conference: ASE +year: 2020 +additional_links: + - { name: "ArXiV", url: "https://arxiv.org/abs/2008.05201" } +tags: ["search"] +--- + +Code retrieval helps developers reuse the code snippet in the open-source projects. Given a natural language description, code retrieval aims to search for the most relevant code among a set of code. Existing state-of-the-art approaches apply neural networks to code retrieval. However, these approaches still fail to capture an important feature: overlaps. The overlaps between different names used by different people indicate that two different names may be potentially related (e.g., "message" and "msg"), and the overlaps between identifiers in code and words in natural language descriptions indicate that the code snippet and the description may potentially be related. To address these problems, we propose a novel neural architecture named OCoR, where we introduce two specifically-designed components to capture overlaps: the first embeds identifiers by character to capture the overlaps between identifiers, and the second introduces a novel overlap matrix to represent the degrees of overlaps between each natural language word and each identifier. +The evaluation was conducted on two established datasets. The experimental results show that OCoR significantly outperforms the existing state-of-the-art approaches and achieves 13.1% to 22.3% improvements. Moreover, we also conducted several in-depth experiments to help understand the performance of different components in OCoR. diff --git a/_publications/zhu2921syntax.markdown b/_publications/zhu2921syntax.markdown new file mode 100644 index 00000000..a2a8f1b9 --- /dev/null +++ b/_publications/zhu2921syntax.markdown @@ -0,0 +1,15 @@ +--- +layout: publication +title: "A Syntax-Guided Edit Decoder for Neural Program Repair" +authors: Qihao Zhu, Zeyu Sun, Yuan-an Xiao, Wenjie Zhang, Kang Yuan, Yingfei Xiong, Lu Zhang +conference: FSE +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2106.08253"} +tags: ["edit"] +--- +Automated Program Repair (APR) helps improve the efficiency of software development and maintenance. Recent APR techniques use deep learning, particularly the encoder-decoder architecture, to generate patches. +Though existing DL-based APR approaches have proposed different encoder architectures, the decoder remains to be the standard one, which generates a sequence of tokens one by one to replace the faulty statement. +This decoder has multiple limitations: 1) allowing to generate syntactically incorrect programs, 2) inefficiently representing small edits, and 3) not being able to generate project-specific identifiers. +In this paper, we propose Recoder, a syntax-guided edit decoder with placeholder generation. Recoder is novel in multiple aspects: 1) Recoder generates edits rather than modified code, allowing efficient representation of small edits; 2) Recoder is syntax-guided, with the novel provider/decider architecture to ensure the syntactic correctness of the patched program and accurate generation; 3) Recoder generates placeholders that could be instantiated as project-specific identifiers later. 
+We conduct experiments to evaluate Recoder on 395 bugs from Defects4J v1.2, 420 additional bugs from Defects4J v2.0, 297 bugs from IntroClassJava and 40 bugs from QuixBugs. Our results show that Recoder repairs 53 bugs on Defects4J v1.2, which achieves 26.2% (11 bugs) improvement over the previous state-of-the-art approach for single-hunk bugs (TBar). Importantly, to our knowledge, Recoder is the first DL-based APR approach that has outperformed the traditional APR approaches on this benchmark. diff --git a/_publications/ziegler2022productivity.markdown b/_publications/ziegler2022productivity.markdown new file mode 100644 index 00000000..5cb1d1bb --- /dev/null +++ b/_publications/ziegler2022productivity.markdown @@ -0,0 +1,12 @@ +--- +layout: publication +title: "Productivity Assessment of Neural Code Completion" +authors: Albert Ziegler, Eirini Kalliamvakou, Shawn Simister, Ganesh Sittampalam, Alice Li, Andrew Rice, Devon Rifkin, Edward Aftandilian +conference: MAPS +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2205.06537"} + - {name: "Data", url: "https://github.com/wunderalbert/prod-neural-materials"} +tags: ["evaluation", "human evaluation"] +--- +Neural code synthesis has reached a point where snippet generation is accurate enough to be considered for integration into human software development workflows. Commercial products aim to increase programmers' productivity, without being able to measure it directly. In this case study, we asked users of GitHub Copilot about its impact on their productivity, and sought to find a reflection of their perception in directly measurable user data. We find that the rate with which shown suggestions are accepted, rather than more specific metrics regarding the persistence of completions in the code over time, drives developers' perception of productivity. diff --git a/_publications/zlotchevski2022exploring.markdown b/_publications/zlotchevski2022exploring.markdown new file mode 100644 index 00000000..5bd5d5fc --- /dev/null +++ b/_publications/zlotchevski2022exploring.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Exploring and Evaluating Personalized Models for Code Generation" +authors: Andrei Zlotchevski, Dawn Drain, Alexey Svyatkovskiy, Colin Clement, Neel Sundaresan, Michele Tufano +conference: FSE +year: 2022 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2208.13928"} +tags: ["Transformer"] +--- +Large Transformer models achieved the state-of-the-art status for Natural Language Understanding tasks and are increasingly becoming the baseline model architecture for modeling source code. Transformers are usually pre-trained on large unsupervised corpora, learning token representations and transformations relevant to modeling generally available text, and are then fine-tuned on a particular downstream task of interest. While fine-tuning is a tried-and-true method for adapting a model to a new domain -- for example, question-answering on a given topic -- generalization remains an on-going challenge. In this paper, we explore and evaluate transformer model fine-tuning for personalization. In the context of generating unit tests for Java methods, we evaluate learning to personalize to a specific software project using several personalization techniques. 
We consider three key approaches: (i) custom fine-tuning, which allows all the model parameters to be tuned; (ii) lightweight fine-tuning, which freezes most of the model's parameters, allowing tuning of the token embeddings and softmax layer only or the final layer alone; (iii) prefix tuning, which keeps model parameters frozen, but optimizes a small project-specific prefix vector. Each of these techniques offers a trade-off in total compute cost and predictive performance, which we evaluate by code and task-specific metrics, training time, and total computational operations. We compare these fine-tuning strategies for code generation and discuss the potential generalization and cost benefits of each in various deployment scenarios. diff --git a/_publications/zugner2021language.markdown b/_publications/zugner2021language.markdown new file mode 100644 index 00000000..ec49df6c --- /dev/null +++ b/_publications/zugner2021language.markdown @@ -0,0 +1,11 @@ +--- +layout: publication +title: "Language-Agnostic Representation Learning of Source Code from Structure and Context" +authors: Daniel Zügner, Tobias Kirschstein, Michele Catasta, Jure Leskovec, Stephan Günnemann +conference: ICLR +year: 2021 +additional_links: + - {name: "ArXiV", url: "https://arxiv.org/abs/2103.11318"} +tags: ["Transformer", "representation"] +--- +Source code (Context) and its parsed abstract syntax tree (AST; Structure) are two complementary representations of the same computer program. Traditionally, designers of machine learning models have relied predominantly either on Structure or Context. We propose a new model, which jointly learns on Context and Structure of source code. In contrast to previous approaches, our model uses only language-agnostic features, i.e., source code and features that can be computed directly from the AST. Besides obtaining state-of-the-art on monolingual code summarization on all five programming languages considered in this work, we propose the first multilingual code summarization model. We show that jointly training on non-parallel data from multiple programming languages improves results on all individual languages, where the strongest gains are on low-resource languages. Remarkably, multilingual training only from Context does not lead to the same improvements, highlighting the benefits of combining Structure and Context for representation learning on code. diff --git a/base-taxonomy/generative.html b/base-taxonomy/generative.html deleted file mode 100644 index 69364ad9..00000000 --- a/base-taxonomy/generative.html +++ /dev/null @@ -1,40 +0,0 @@ ---- -layout: default -title: Code-Generating Models ---- -
| Name | Type | Representation | Model | Application | Abstract | -{% for publication in publicationsList %}{% if publication.categories contains "generative" %} - {% assign pubDetails = site.publications | where:"bibkey", publication.bibkey %} - -
|---|---|---|---|---|---|
| {{pubDetails[0].authors}}, {{pubDetails[0].year}}. {{pubDetails[0].title}} | -{{publication.type}} | -{{publication.representation}} | -{{publication.model}} | -{{publication.application}} | -{{pubDetails[0].content}} | -
| Name | Type | Representation | Application | Abstract | - -{% for publication in publicationsList %}{% if publication.categories contains "pattern" %} - {% assign pubDetails = site.publications | where:"bibkey", publication.bibkey %} - -
|---|---|---|---|---|
| {{pubDetails[0].authors}}, {{pubDetails[0].year}}. {{pubDetails[0].title}} | -{{publication.pattern_type}} | -{{publication.representation}} | -{{publication.application}} | -{{pubDetails[0].content}} | -
| Name | Input Code Representation | Target | Intermediate Representation | Application | Abstract | -{% for publication in publicationsList %}{% if publication.categories contains "representational" %} - {% assign pubDetails = site.publications | where:"bibkey", publication.bibkey %} - -
|---|---|---|---|---|---|
| {{pubDetails[0].authors}}, {{pubDetails[0].year}}. {{pubDetails[0].title}} | -{{publication.input_rep}} | -{{publication.modeled_target}} | -{{publication.intermediate_rep}} | -{{publication.application}} | -{{pubDetails[0].content}} | -
+
+```yaml
---
layout: publication
title: The title of the Publication
authors: F. M. LastName, F. M. LastName, ...
-conference: AbbreviatedNameOfConference
+conference: AbbreviatedNameOfConference # Or journal: AbbreviatedNameOfJournal
year: YEAR
-bibkey: lastnameYEARfirstword
additional_links:
- - {name: "ArXiV", url: "http://arxiv.org/abs/XXXX.YYYY"}
- - {name: "website", url: "http://paperwebsite.com/"}
- - {name: "code", url: "https://github.com/path-to/code"}
+ - {name: "ArXiV", url: "http://arxiv.org/abs/XXXX.YYYY"}
+ - {name: "website", url: "http://paperwebsite.com/"}
+ - {name: "code", url: "https://github.com/path-to/code"}
tags: ["tag1", "tag2"]
---
Text of abstract goes here.
-
+```
The `additional_links` are optional and arbitrary; they will appear on the page referring to this work. Feel free to add as many additional links as needed.
diff --git a/contributors.md b/contributors.md
deleted file mode 100644
index 65cefdff..00000000
--- a/contributors.md
+++ /dev/null
@@ -1,18 +0,0 @@
----
-layout: default
-title: Contributors
----
-The core survey and the original taxonomy was created by
-
-* [Miltos Allamanis](https://miltos.allamanis.com) Microsoft Research, Cambridge, UK
-* [Earl T. Barr](http://earlbarr.com) University College London, London, UK
-* [Prem Devanbu](http://web.cs.ucdavis.edu/~devanbu/) University of California, Davis, USA
-* [Charles Sutton](http://homepages.inf.ed.ac.uk/csutton/) University of Edinburgh and The Alan Turing Institute, UK
-
-#### Contributors to the website
-This website accepts external [contributions](/contributing.html).
-Please, feel free to add your name below, once you contribute to this
-website. A comprehensive list can be found [here](https://github.com/ml4code/ml4code.github.io/graphs/contributors).
-
-* [Uri Alon](http://www.cs.technion.ac.il/~urialon/) Technion, Israel
-* [Nghi D. Q. Bui](https://bdqnghi.github.io/) Singapore Management University, Singapore
\ No newline at end of file
diff --git a/etc/compute_embeddings.py b/etc/compute_embeddings.py
index 1e0c8da8..43f0ba7c 100644
--- a/etc/compute_embeddings.py
+++ b/etc/compute_embeddings.py
@@ -1,8 +1,11 @@
import argparse
import json
+from timeit import default_timer as timer
+from datetime import date
import numpy as np
import torch
+import torch.nn.functional as F
import sklearn.manifold
import transformers
@@ -13,13 +16,19 @@ def parse_arguments():
parser.add_argument("json", default=False, help="the path the json containing all papers.")
parser.add_argument("outpath", default=False, help="the target path of the visualizations papers.")
parser.add_argument("--seed", default=0, help="The seed for TSNE.", type=int)
+ parser.add_argument("--model", default='sentence-transformers/all-MiniLM-L6-v2', help="The name of the HF model")
+ parser.add_argument("--save_emb", action='https://github.com/store_true', help="Save embeddings in CSV for Tensorboard Projector")
+
return parser.parse_args()
+def mean_pooling(token_embeddings, attention_mask):
+ """ Mean Pooling, takes attention mask into account for correct averaging"""
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-if __name__ == "__main__":
- args = parse_arguments()
- tokenizer = transformers.AutoTokenizer.from_pretrained("deepset/sentence_bert")
- model = transformers.AutoModel.from_pretrained("deepset/sentence_bert")
+def main(args):
+ tokenizer = transformers.AutoTokenizer.from_pretrained(args.model)
+ model = transformers.AutoModel.from_pretrained(args.model)
model.eval()
with open(args.json) as f:
@@ -27,15 +36,34 @@ def parse_arguments():
print(f"Num papers: {len(data)}")
- all_embeddings = []
+ corpus = []
for paper_info in data:
+ corpus.append(tokenizer.sep_token.join([paper_info['title'], paper_info['abstract']]))
+
+ batch_size = 4
+    all_embeddings = []
+ start = timer()
+ for i in range(0, len(corpus), batch_size):
+ encoded_batch = tokenizer(corpus[i:min(i+batch_size, len(corpus))], padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
- token_ids = torch.tensor([tokenizer.encode(paper_info["abstract"])][:512])
- hidden_states, _ = model(token_ids)[-2:]
- all_embeddings.append(hidden_states.mean(0).mean(0).numpy())
+ hidden_state = model(**encoded_batch).last_hidden_state
+ all_embeddings.append(mean_pooling(hidden_state, encoded_batch['attention_mask']))
+
+ all_embeddings = torch.cat(all_embeddings, dim=0)
+ all_embeddings = F.normalize(all_embeddings, p=2, dim=1)
+ print(f"elapsed {timer()-start:.1f}s")
+
+ if args.save_emb:
+ filename = f"{args.model.replace('/', '_')}-{date.today().strftime('%d.%m.%y')}"
+ np.savetxt(f"{filename}-emb.tsv", all_embeddings, delimiter="\t")
+ import csv
+ with open(f"{filename}-meta.tsv", 'w', newline='') as csvfile:
+ w = csv.writer(csvfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
+ w.writerow(["year", "key", "title"])
+ for paper in data:
+ w.writerow([paper["year"], paper["key"], paper["title"]])
np.random.seed(args.seed)
- all_embeddings = np.array(all_embeddings)
out = sklearn.manifold.TSNE(n_components=2, metric="cosine").fit_transform(all_embeddings)
for i, paper_info in enumerate(data):
@@ -43,3 +71,7 @@ def parse_arguments():
with open(args.outpath, 'w') as f:
json.dump(data, f)
+
+if __name__ == "__main__":
+ args = parse_arguments()
+ main(args)
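For readers skimming the diff, the `mean_pooling` helper added above is the standard masked average of token embeddings. A self-contained sketch with toy tensors (the toy shapes are illustrative assumptions; only `torch` is required):

```python
import torch
import torch.nn.functional as F

def mean_pooling(token_embeddings, attention_mask):
    """Mean pooling that ignores padded positions via the attention mask."""
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

# Toy batch: two "sentences" of three tokens each; the second ends with a pad token.
token_embeddings = torch.randn(2, 3, 4)
attention_mask = torch.tensor([[1, 1, 1], [1, 1, 0]])
pooled = F.normalize(mean_pooling(token_embeddings, attention_mask), p=2, dim=1)
print(pooled.shape)  # torch.Size([2, 4]): one unit-length embedding per sentence
```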
diff --git a/etc/compute_related.py b/etc/compute_related.py
new file mode 100644
index 00000000..36f3bc2c
--- /dev/null
+++ b/etc/compute_related.py
@@ -0,0 +1,74 @@
+import argparse
+import json
+import os
+
+import nltk
+
+nltk.download('stopwords')
+nltk.download('wordnet')
+nltk.download('punkt_tab')
+
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+
+import numpy as np
+import scipy
+
+from gensim.models import TfidfModel
+from gensim.corpora import Dictionary
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(description="Compute related papers for ML4Code")
+
+    parser.add_argument("json", default=False, help="the path of the json containing all papers.")
+    parser.add_argument("outdir", default=False, help="the output directory for the per-paper related-work json files.")
+    parser.add_argument("--num-relwork", default=4, help="Number of related works per paper.", type=int)
+ return parser.parse_args()
+
+
+if __name__ == "__main__":
+ args = parse_arguments()
+ num_relworks = args.num_relwork
+
+ with open(args.json) as f:
+ data = json.load(f)
+
+ print(f"Num papers: {len(data)}")
+
+ lemmatizer = WordNetLemmatizer()
+ stopwords = set(stopwords.words('english'))
+ stopwords.update(["one", "two", "using"])
+
+ tokens_per_paper = []
+ keys = []
+
+ for paper_info in data:
+ keys.append((paper_info["key"], paper_info["title"]))
+ text = paper_info["title"] + " " + paper_info["abstract"].replace("", " ").replace("
", " ") + " ".join(paper_info["tags"]) + lemmatized_tokens = [lemmatizer.lemmatize(w).lower() for w in nltk.word_tokenize(text) if w.lower() not in stopwords and w.isalpha()] + tokens_per_paper.append(lemmatized_tokens) + + dictionary = Dictionary(tokens_per_paper) + dictionary.filter_extremes(no_below=2, no_above=0.5) + + corpus = [dictionary.doc2bow(line) for line in tokens_per_paper] + model = TfidfModel(corpus) + + tf_idf_vectors = [] + for bow in corpus: + vec = np.zeros(len(dictionary), dtype=np.float64) + for i, v in model[bow]: + vec[i] = v + tf_idf_vectors.append(vec) + tf_idf_vectors = np.array(tf_idf_vectors) + + distances = scipy.spatial.distance.cdist(tf_idf_vectors, tf_idf_vectors, metric='cosine') + sorted_idxs = np.argsort(distances, axis=-1)[:, 1:num_relworks+1] + + os.makedirs(args.outdir, exist_ok=True) + for i, (bibkey, title) in enumerate(keys): + with open(os.path.join(args.outdir, bibkey + ".json"), "w") as f: + json.dump([keys[j] for j in sorted_idxs[i]], f) + + \ No newline at end of file diff --git a/etc/compute_topics.py b/etc/compute_topics.py new file mode 100644 index 00000000..0bba7ade --- /dev/null +++ b/etc/compute_topics.py @@ -0,0 +1,83 @@ +import argparse +import json +import nltk + +nltk.download('omw-1.4') +nltk.download('stopwords') +nltk.download('wordnet') +nltk.download('punkt_tab') + +from nltk.corpus import stopwords +from nltk.stem import WordNetLemmatizer +from gensim.corpora import Dictionary +from gensim.models import LdaModel + + + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Topic Model of Papers in ML4Code") + + parser.add_argument("json", default=False, help="the path the json containing all papers.") + parser.add_argument("outpath", default=False, help="the target path of the visualizations papers.") + parser.add_argument("--num-topics", default=20, help="The number of topics.", type=int) + return parser.parse_args() + +if __name__ == "__main__": + args = parse_arguments() + with open(args.json) as f: + data = json.load(f) + + print(f"Num papers: {len(data)}") + + + lemmatizer = WordNetLemmatizer() + stopwords = set(stopwords.words('english')) + stopwords.update(["one", "two", "using"]) + + tokens_per_paper = [] + for paper_info in data: + text = paper_info["title"] + " " + paper_info["abstract"].replace("", " ").replace("
", " ") + " ".join(paper_info["tags"]) + lemmatized_tokens = [lemmatizer.lemmatize(w).lower() for w in nltk.word_tokenize(text) if w.lower() not in stopwords and w.isalpha()] + tokens_per_paper.append(lemmatized_tokens) + + dictionary = Dictionary(tokens_per_paper) + dictionary.filter_extremes(no_below=20, no_above=0.5) + + corpus = [dictionary.doc2bow(doc) for doc in tokens_per_paper] + + passes = 100 + iterations = 1000 + + temp = dictionary[0] # This is needed to "load" the dictionary. + + model = LdaModel( + corpus=corpus, + id2word=dictionary.id2token, + chunksize=1000, + alpha='asymmetric', + eta='auto', + iterations=iterations, + num_topics=args.num_topics, + passes=passes, + eval_every=None + ) + + topic_tokens = [] + for topicid in range(args.num_topics): + topic_tokens.append([dictionary.id2token[k[0]] for i, k in enumerate(model.get_topic_terms(topicid, topn=4)) if i < 2 or k[1] > 0.025]) + + paper_topic_data = [] + for paper, paper_bow in zip(data, corpus): + topic_distr = model.get_document_topics(paper_bow, minimum_probability=0) + paper_topic_data.append({ + "key": paper["key"], + "year": paper["year"], + "title": paper["title"], + "topic_distr": {t: float(p) for t, p in topic_distr} + }) + + with open(args.outpath, 'w') as f: + json.dump({ + "topics": topic_tokens, + "paper_data": paper_topic_data + }, f) diff --git a/index.md b/index.md index 0370f24b..44467cff 100644 --- a/index.md +++ b/index.md @@ -21,21 +21,19 @@ research is inherently interdisciplinary, uniting the machine learning and natural language processing communities with software engineering and programming language communities. -#### Browse Papers by Tag +#### 🏷 Browse Papers by Tag {% assign rawtags = Array.new %} {% for publication in site.publications %} {% assign ttags = publication.tags %} {% assign rawtags = rawtags | concat: ttags %} {% endfor %} -{% assign rawtags = rawtags | uniq | sort %} +{% assign rawtags = rawtags | uniq | sort_natural %} {% for tag in rawtags %}
@@ -58,3 +56,22 @@ But a website can! We hope to make this site a living document.
Anyone can add a paper to this web site, essentially by creating one Markdown file.
To contribute, open a pull request in GitHub, by following [these instructions
for contributing](contributing.html).
+
+### Contributors
+
+The core survey and the original taxonomy were created by
+
+* [Miltos Allamanis](https://miltos.allamanis.com) Microsoft Research, Cambridge, UK
+* [Earl T. Barr](http://earlbarr.com) University College London, London, UK
+* [Prem Devanbu](http://web.cs.ucdavis.edu/~devanbu/) University of California, Davis, USA
+* [Charles Sutton](http://homepages.inf.ed.ac.uk/csutton/) University of Edinburgh and The Alan Turing Institute, UK
+
+#### Contributors to the website
+This website accepts external [contributions](/contributing.html).
+Please feel free to add your name below once you contribute to this
+website. A comprehensive list can be found [here](https://github.com/ml4code/ml4code.github.io/graphs/contributors).
+
+* [Uri Alon](http://www.cs.technion.ac.il/~urialon/) Technion, Israel
+* [Shaked Brody](https://shakedbr.cswp.cs.technion.ac.il/) Technion, Israel
+* [Nghi D. Q. Bui](https://bdqnghi.github.io/) Singapore Management University, Singapore
+* [Rajaswa Patil](https://rajaswa.github.io/) Microsoft PROSE
diff --git a/paper-abstracts.json b/paper-abstracts.json
index 368fb1c1..4321f4ff 100644
--- a/paper-abstracts.json
+++ b/paper-abstracts.json
@@ -3,7 +3,7 @@ layout:
title:
---
[
-{% for publication in site.publications %}{"key": "{{ publication.bibkey }}", "year": "{{ publication.year }}", "title":{{ publication.title | jsonify }}, "abstract": {{ publication.content | jsonify }}, "tags": {{ publication.tags | jsonify }} }{% if forloop.rindex0 > 0 %},{% endif %}
+{% for publication in site.publications %}{"key": "{{ publication.path | replace_first: '_publications/', '' | replace: '.markdown', '' | replace: '.md', '' }}", "year": "{{ publication.year }}", "title":{{ publication.title | jsonify }}, "abstract": {{ publication.content | jsonify }}, "tags": {{ publication.tags | jsonify }} }{% if forloop.rindex0 > 0 %},{% endif %}
{% endfor %}
]
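In Python terms, the Liquid filter chain above derives each paper's key from its file path by stripping the collection folder and either markdown extension. A sketch with a hypothetical `paper_key` helper:

```python
def paper_key(path):
    # Strip the collection folder, then either extension, mirroring the Liquid filters.
    return path.replace("_publications/", "", 1).replace(".markdown", "").replace(".md", "")

assert paper_key("_publications/zhou2019devign.markdown") == "zhou2019devign"
assert paper_key("_publications/zhang2021disentangled.md") == "zhang2021disentangled"
```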
diff --git a/papers.html b/papers.html
index ccba7e4a..3988890c 100644
--- a/papers.html
+++ b/papers.html
@@ -18,7 +18,6 @@
-
{{ publication.authors }}
diff --git a/public/css/hyde.css b/public/css/hyde.css
index a45f1a10..002c3f4c 100644
--- a/public/css/hyde.css
+++ b/public/css/hyde.css
@@ -287,3 +287,36 @@ tag {
tag > a {
color: #fff;
}
+
+.ribbon {
+ /* positioning */
+ position: fixed;
+ padding: 2px 45px;
+ width: 128px;
+ /* bottom left of the page */
+ bottom: 50px;
+ left: -50px;
+ -webkit-transform: rotate(45deg);
+ -moz-transform: rotate(45deg);
+ -ms-transform: rotate(45deg);
+ transform: rotate(45deg);
+ /* effects with some shadow */
+ box-shadow: 0 0 0 3px #020202, 0 0 20px -3px rgba(0, 0, 0, 0.5);
+ text-shadow: 0 0 0 #e5e5e5, 0 0 5px rgba(0, 0, 0, 0.3);
+ /* looks */
+ background-color: #020202;
+ color: #e5e5e5;
+ font-size: 10px;
+ font-family: sans-serif;
+ text-decoration: none;
+ font-weight: bold;
+ /* ribbon effects */
+ /*border: 2px dotted #e5e5e5; */
+ /* webkit antialias fix */
+ -webkit-backface-visibility: hidden;
+ letter-spacing: .5px;
+
+ z-index:100;
+ box-sizing:unset;
+
+}
diff --git a/public/opensearchdescription.xml b/public/opensearchdescription.xml
index 33232678..4d94e44e 100644
--- a/public/opensearchdescription.xml
+++ b/public/opensearchdescription.xml
@@ -1,10 +1,13 @@
-
-
-
- ML4Code
- Search Papers in Machine Learning for Source Code
- ml4code
-
- https://ml4code.github.io/public/favicon.svg
+
+ML4Code
+ML4Code Survey
+
+
+
+https://ml4code.github.io/public/favicon.svg
+open
+false
+en-us
+UTF-8
+UTF-8
diff --git a/resources.md b/resources.md
index 0a3a7f6a..567c5608 100644
--- a/resources.md
+++ b/resources.md
@@ -14,8 +14,8 @@ A list of datasets used in this area can be found at the appendix of the
[survey](https://arxiv.org/abs/1709.06182) and at [learnbigcode.github.io](http://learnbigcode.github.io/datasets/).
### Courses
-A few university courses are been taught covering aspects of machine learning for code, big code or naturalnness of code. Below there are a few that have publicly available material.
-* [Analyzing Software using Deep Learning](http://software-lab.org/teaching/summer2020/asdl/) in T.U. Darmstadt [[videos](https://www.youtube.com/playlist?list=PLBmY8PAxzwIHIKq4tYLws25KqGvUM4iFD)]
+A few university courses have been taught covering aspects of machine learning for code, big code or naturalness of code. Below are a few that have publicly available material.
+* [Analyzing Software using Deep Learning](http://software-lab.org/teaching/summer2020/asdl/) at the University of Stuttgart [[videos](https://www.youtube.com/playlist?list=PLBmY8PAxzwIHIKq4tYLws25KqGvUM4iFD)]
* [Seminars on Applications of Deep Learning in Software Engineering and Programming Languages](https://sites.google.com/view/mlplse-sp18/) in U.C. Berkeley
* [Machine learning for programming](https://www.cl.cam.ac.uk/teaching/1920/P252/) in the University of Cambridge, UK
* [Deep Learning for Symbolic Reasoning](http://tiarkrompf.github.io/cs590/2018/) in Purdue University
@@ -26,6 +26,9 @@ Please, feel free to submit a pull request to adding more links in this page.
### Workshops and Other Academic Events
Over the last few years, a few workshops have been organized in this area. Please feel free to add any missing or future workshops here.
+* [Deep Learning for Code](https://dl4c.github.io) April 29 2022, ICLR 2022, virtual
+* [NLP4Prog Workshop](https://nlp4prog.github.io/2021/) 6 August 2021, ACL 2021, virtual
+* [Workshop on Computer-Assisted Programming](https://capworkshop.github.io/) 12 December 2020, NeurIPS 2020, virtual
* [ML on Code devroom at FOSDEM19](https://fosdem.org/2019/schedule/track/ml_on_code/) 2-3 February 2019, Brussels, EU [[videos](https://video.fosdem.org/2019/H.2213/)]
* [Machine Learning for Programming](http://ml4p.org/) 18–19 July 2018, Oxford, UK [[videos](https://www.youtube.com/watch?v=dQaAp9wdFtQ&list=PLMPy362FkW9pd96bwh0BuCGMo6fdMQ2aw)]
* [International Workshop on Machine Learning techniques for Programming Languages](https://conf.researchr.org/track/ecoop-issta-2018/ML4PL-2018-papers) 16 - 21 July 2018 Amsterdam, Netherlands
@@ -37,9 +40,12 @@ The last few years a few workshops have been organized in this area. Please, fee
### Courses on Important Relevant Background
-* [Sofware Analysis](http://rightingcode.org/) in Univ. of Pennsylvania. It is a great introduction to Program Analysis [[videos](https://www.youtube.com/playlist?list=PLF3-CvSRq2SaApl3Lnu6Tu_ecsBr94543)]
+* [Software Analysis](http://rightingcode.org/) at Univ. of Pennsylvania. It is a great introduction to Program Analysis [[videos](https://www.youtube.com/playlist?list=PLF3-CvSRq2SaApl3Lnu6Tu_ecsBr94543)]
+* [Program Analysis](https://software-lab.org/teaching/winter2020/pa/) at University of Stuttgart [[videos](https://www.youtube.com/playlist?list=PLBmY8PAxzwIEGtnJiucyGAnwWpxACE633)]
+* [Applications of Data Science for Software Engineering 2020](https://www.youtube.com/watch?v=34hcH7Js41I&list=PLmAXH4O57P5_0IflYjLIg8l0IupZPbdlY) at Eindhoven University of Technology.
### Competitions
+* [nlc2cmd](http://nlc2cmd.us-east.mybluemix.net/#/) at NeurIPS 2020 by Project CLAI. Starts July 2020.
* [CodeSearchNet Challenge: Evaluating the State of Semantic Code Search](https://github.com/github/CodeSearchNet) by Github. Starts Sep 2019.
* [CodRep 2019: Machine Learning on Source Code Competition](https://github.com/KTH/codrep-2019) by KTH. Starts on April 25th 2019.
* [CodRep 2018: Machine Learning on Source Code Competition](https://github.com/KTH/CodRep-competition) by KTH. Starts on April 14th 2018.
@@ -49,4 +55,4 @@ The last few years a few workshops have been organized in this area. Please, fee
papers in the area. You can access the list [here](https://github.com/src-d/awesome-machine-learning-on-source-code).
* [Automated Program Repair](https://www.monperrus.net/martin/automatic-software-repair)
has a curated list of pointers for helping newcomers to understand the field,
-maintained by [Martin Monperrus](www.monperrus.net).
\ No newline at end of file
+maintained by [Martin Monperrus](https://www.monperrus.net/martin/).
diff --git a/tags.html b/tags.html
index 991dca18..bf9476cf 100644
--- a/tags.html
+++ b/tags.html
@@ -8,7 +8,7 @@
{% assign ttags = publication.tags %}
{% assign rawtags = rawtags | concat: ttags %}
{% endfor %}
-{% assign rawtags = rawtags | uniq | sort %}
+{% assign rawtags = rawtags | uniq | sort_natural %}
Publications by Tag
@@ -22,7 +22,7 @@
Tags
{% assign sortedPublications = site.publications | sort: "authors" | sort: "year"%}
{% for tag in rawtags %}
- {{ tag }}
+ 🏷 {{ tag }}
{% for publication in sortedPublications %}
{% if publication.tags contains tag %}
diff --git a/topic-viz.html b/topic-viz.html
new file mode 100644
index 00000000..d2ed5880
--- /dev/null
+++ b/topic-viz.html
@@ -0,0 +1,61 @@
+---
+layout: default
+title: Explore ML4Code papers with Topics
+description: A topic model for the papers in the ML4Code survey
+---
+Topic-based Explorer
+Using topic modelling, the following topics have been extracted. The top stemmed words appear below.
+ Please move the sliders to show the papers most related to the selected topics.
+
+
+
+
+ - Please move the sliders to look at the papers.
+
+
+
+
+
+
\ No newline at end of file
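The topics behind this page are produced by `etc/compute_topics.py` (added earlier in this diff). A condensed sketch of its core gensim pipeline on toy token lists (the toy documents and topic count are illustrative assumptions; requires `gensim`):

```python
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Toy stand-ins for the lemmatized title+abstract tokens of each paper.
docs = [["code", "completion", "transformer"],
        ["vulnerability", "graph", "neural"],
        ["code", "search", "retrieval"]]

dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
_ = dictionary[0]  # force construction of dictionary.id2token

model = LdaModel(corpus=corpus, id2word=dictionary.id2token,
                 num_topics=2, passes=10, alpha="asymmetric", eta="auto")
for topic_id in range(2):
    print(model.get_topic_terms(topic_id, topn=3))   # (token_id, weight) pairs per topic
print(model.get_document_topics(corpus[0], minimum_probability=0))
```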
diff --git a/tsne-viz.html b/tsne-viz.html
index 9274f963..a8d66a05 100644
--- a/tsne-viz.html
+++ b/tsne-viz.html
@@ -1,11 +1,12 @@
---
layout: default
-title: Visualization of Publications on Machine Learning for Source Code
-description: A tSNE visualization of all the ML4Code papers
+title: A Map of Publications on Machine Learning for Source Code
+description: A map/visualization of the ML4Code papers.
---
2D Map of Papers
-Each dot represents one paper in this survey. Hover your mouse over each point to look
-at the details. Click on a point to go to the paper information page.
+Each dot represents one paper in this survey. Hover your mouse over each point to look
+at the details. Click on a point to go to the paper information page.
+
Please consider contributing by updating
@@ -65,15 +66,16 @@ 2D Map of Papers
for (i=0; i" + d.tags[i] + " "
}
+
+ var boundingRect = document.getElementById("paperviz").getBoundingClientRect();
+ var mousePos = d3.mouse(this);
+ var x = mousePos[0] + boundingRect.x + 20;
+ var y = mousePos[1] + boundingRect.y + 30;
+
tooltip
.html("" + d.title + " " + tags + "
")
- .style("left", (d3.mouse(this)[0]+30) + "px")
- .style("top", (d3.mouse(this)[1]) + "px");
- d3.selectAll("circle").filter(dd => dd.key == d.key).style("fill", "#ff0000");
- }
-
- var mouseleave = function(d) {
- d3.selectAll("circle").filter(dd => dd.key == d.key).style("fill", "#69b3a2");
+ .style("left", x + "px")
+ .style("top", y + "px");
}
var click_link = function(d) {
@@ -92,10 +94,23 @@ 2D Map of Papers
.style("fill", "#69b3a2")
.style("opacity", 0.4)
.style("stroke", "white")
- .on("mouseover", mouseover )
- .on("mousemove", mousemove )
- .on("mouseleave", mouseleave )
- .on("click", click_link)
+ .on("mouseover", mouseover)
+ .on("mousemove", mousemove)
+ .on("click", click_link);
+
+ var isMatch = function(d, searchTerm) {
+ if (searchTerm.length < 3) return false;
+ var allText = (d.title + " " + d.abstract + " " + d.tags.join(" ")).toLocaleLowerCase();
+ return allText.indexOf(searchTerm) != -1;
+ }
+
+ $("#filtermap").keyup(function (e) {
+ var searchTerm = $("#filtermap").val().toLocaleLowerCase();
+ var allPoints = d3.selectAll("circle");
+ // TODO: This seems quite inefficient...
+ allPoints.filter(dd => !isMatch(dd, searchTerm)).style("fill", "#69b3a2");
+ allPoints.filter(dd => isMatch(dd, searchTerm)).style("fill", "#aa0000");
+ });
});