From 695a9fc42b85451e3527aa2bac545a2dc7ceb9aa Mon Sep 17 00:00:00 2001
From: priya-mane <priya.hm@somaiya.edu>
Date: Sun, 25 Oct 2020 21:38:50 +0530
Subject: [PATCH 1/3] added python script for a research paper latex parser

---
 .../Research_paper_latex_parser/README.md     |  76 +++
 .../get_details.py                            | 141 +++++
 .../Research_paper_latex_parser/op_json.json  |  47 ++
 .../Research_paper_latex_parser/papers/p1.tex |  91 ++++
 .../Research_paper_latex_parser/papers/p2.tex |  50 ++
 .../Research_paper_latex_parser/papers/p3.tex |  63 +++
 .../Research_paper_latex_parser/papers/p4.tex |  87 +++
 .../Research_paper_latex_parser/papers/p5.tex | 146 ++++++
 .../Research_paper_latex_parser/parser.ipynb  | 494 ++++++++++++++++++
 .../results/Capture.JPG                       | Bin 0 -> 24998 bytes
 10 files changed, 1195 insertions(+)
 create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/README.md
 create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py
 create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/op_json.json
 create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/papers/p1.tex
 create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/papers/p2.tex
 create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/papers/p3.tex
 create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/papers/p4.tex
 create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/papers/p5.tex
 create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/parser.ipynb
 create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/results/Capture.JPG

diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/README.md b/Scripts/Miscellaneous/Research_paper_latex_parser/README.md
new file mode 100644
index 000000000..136d5a395
--- /dev/null
+++ b/Scripts/Miscellaneous/Research_paper_latex_parser/README.md
@@ -0,0 +1,76 @@
+# Research paper parser
+
+The script reads the LaTeX files of research papers from a given directory and extracts essential information from each of them.
+
+The script purges unwanted items like -
+* Images
+* Tables
+* Equations
+
+The script writes a JSON file containing one object per research paper, with the following items:
+* title
+* author
+* abstract
+* introduction
+* conclusions
+* results
+* acknowledgments
+* any other section or environment title defined by the user
+
+***
+### Prerequisites
+
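+The only third-party dependency is `tqdm`. The other modules the script
+imports (`os`, `json`, `re`, `argparse`) are part of the Python standard
+library and do not need to be installed.
+
+```
+pip install tqdm
+```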
+
+***
+
+### How to run the script
+
+```
+python get_details.py -p <directory_containing_papers> -o <output_file_path>
+```
+
+Example :
+
+```
+python get_details.py -p ./papers -o op_json.json
+```
+
+![Output](results/Capture.JPG)
+
+***
+
+
+### Results
+
+A sample JSON object created for one research paper.
+
+```
+ {
+        "author": "I.M. Great and So.R. Yu",
+        "title": "A Sample Research Paper",
+        "Introduction": " \n\nUsing latex is pretty easy if you have a sample document you can follow.Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed volutpat ornare odio et faucibus. Donec fringilla massa eget auctor viverra. Mauris a imperdiet est. Cras tincidunt nulla ut elit tristique ultricies. Phasellus nec orci vel mi suscipit maximus at vitae tortor. Vivamus sed libero vel lacus aliquam rhoncus. Ut in lacinia nunc. Nullam quis mauris leo. Phasellus vitae nisl condimentum quam congue volutpat. Quisque et dapibus ipsum. Curabitur fringilla pellentesque elit, non posuere purus malesuada id. Pellentesque rutrum vitae urna eu mattis.\n\nMaecenas ac congue massa. Quisque a sem turpis. Duis et diam ex. Suspendisse et enim interdum, sodales risus eu, ultrices est. Suspendisse eu odio enim. In vulputate odio porttitor tincidunt vestibulum. Praesent tincidunt ullamcorper purus, quis semper felis volutpat quis.\n\n",
+        "Results": " \nIncluding figures, tables, and equations is easy. Latex also permits easy reference to document elements (figures, tables, sections). Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n\n \n\n \n\n \n\n",
+        "Conclusions": " \n\nMan, latex is great! Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n\n",
+        "Some_title": " \n\nTest title for user defined  section.\n\n",
+        "user_defined_title_for_begin": " \n\nwjlrhfwer ljqr flwuer j rlferfurl u airlf  aiurf uoiruf iuoqir oiuqr iuq woe\n",
+        "acknowledgement": "\nThe author is grateful to Donald Knuth for inventing tex, and making publication quality typesetting a reality for scientists around the world.\n\n"
+    }
+```
+
+
+***
+
+## *Author Name*
+
+Priya Mane
+
+***
+
+
diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py b/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py
new file mode 100644
index 000000000..6ad39f48f
--- /dev/null
+++ b/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py
@@ -0,0 +1,141 @@
+from os.path import isfile, join
+import re
+import json
+from os import listdir
+import argparse
+import os
+from tqdm import tqdm
+
+
+class essential_data:
+    """
+    Extract essential data from the tex document: title, author, section bodies and acknowledgments.
+    Getters return '' (and get_elements skips the entry) when the expected markup is missing.
+    """
+
+    def __init__(self, tex_data):
+        self.tex_data = tex_data
+
+    def get_elements(self):
+        """Map each 'section{...}' heading and each one-brace 'begin{...}' name to its body text."""
+        data = self.tex_data
+        data_dict = dict()
+        # A section body runs from the brace closing its heading up to the next backslash.
+        for obj in re.findall(r'section{(.*?)\\', data, re.S):
+            heading, brace, body = obj.partition('}')
+            if not brace:
+                continue  # heading brace never closed before the next command: nothing to record
+            data_dict[heading] = ' ' + body
+            data = data.replace("section{" + obj, " ")
+
+        for obj in re.findall(r'\\begin{(.*?)\\end', data, re.S):
+            if obj.count('}') != 1:
+                continue  # keep only blocks whose sole '}' is the one closing the name
+            heading, _, body = obj.partition('}')
+            data_dict[heading] = ' ' + body
+
+        return data_dict
+
+    def get_author(self):
+        """Return the text of the first 'author{...}' / 'authors{...}', or '' if absent."""
+        author = re.search(r'[Aa]uthor(s?){(.*?)}', self.tex_data, re.S)
+        return author.group(2) if author else ''
+
+    def get_title(self):
+        """Return the text of the first 'title{...}' / 'Title{...}', or '' if absent."""
+        title = re.search(r'[Tt]itle{(.*?)}', self.tex_data, re.S)
+        return title.group(1) if title else ''
+
+    def get_ack(self):
+        """Return the text between the first acknowledgment(s) command and the next backslash, or ''."""
+        acknowledgments = re.search(r'\\[Aa]cknowledgment(s?)(.*?)\\', self.tex_data, re.S)
+        return acknowledgments.group(2) if acknowledgments else ''
+
+
+class clean_data:
+    """
+    Contains functions to purge all unwanted elements from the tex file.
+    Each purge_* method rewrites self.tex_data in place and returns None.
+    """
+
+    def __init__(self, tex_data):
+        self.tex_data = tex_data
+
+    def _purge_environment(self, name):
+        """
+        Replace every begin{name} ... end{name} block with a single space.
+
+        The starred variant of the environment (for example 'figure*') is
+        purged as well. Matching is non-greedy, so each block stops at the
+        first closing tag that follows its opening tag.
+        """
+        # Escape the name so it is always matched literally in the pattern.
+        tag = re.escape(name) + r'\*?'
+        # re.S lets '.' cross newlines: an environment usually spans many lines.
+        block = r'\\begin\{' + tag + r'\}.*?\\end\{' + tag + r'\}'
+        self.tex_data = re.sub(block, ' ', self.tex_data, flags=re.S)
+
+    def purge_images(self):
+        """
+        Purges images from the tex data by removing every 'figure' environment.
+        Starred environments ('figure*') are removed too.
+        """
+        self._purge_environment('figure')
+
+    def purge_tables(self):
+        """
+        Purges tables from the tex data by removing every 'table' environment.
+        Starred environments ('table*') are removed too.
+        """
+        self._purge_environment('table')
+
+    def purge_equations(self):
+        """
+        Purges equations from the tex data by removing every 'equation' environment.
+        Starred environments ('equation*') are removed too.
+        """
+        self._purge_environment('equation')
+
+
+# python get_details.py -p papers -o op_json.json
+
+if __name__ == '__main__':
+    # Parse every file directly inside -parent and dump the extracted fields to -output as JSON.
+    parser = argparse.ArgumentParser(
+        description="extract title,author,abstract,introduction,results,conclusions and acknowledgments from given set of research papers.")
+    parser.add_argument("-parent", help="enter path to parent directory containing all research papers",
+                        dest="parent_path", type=str, required=True)
+    parser.add_argument("-output", help="enter path of output file",
+                        dest="op", type=str, required=True)
+
+    args = parser.parse_args()
+    directory_path = args.parent_path
+    op_file = args.op
+
+    all_data = []
+    all_files = [f for f in listdir(
+        directory_path) if isfile(join(directory_path, f))]
+
+    for tex_file in tqdm(all_files):
+        p = os.path.join(directory_path, tex_file)
+        # Prefer UTF-8 so non-ASCII punctuation is not turned into mojibake;
+        # latin-1 accepts any byte sequence, so it is the safe fallback.
+        try:
+            with open(p, encoding='utf-8') as tex_handle:
+                data = tex_handle.read()
+        except UnicodeDecodeError:
+            with open(p, encoding='latin-1') as tex_handle:
+                data = tex_handle.read()
+
+        cd = clean_data(data)
+        cd.purge_images()
+        cd.purge_tables()
+        cd.purge_equations()
+        ed = essential_data(cd.tex_data)
+        d = {"author": ed.get_author(), "title": ed.get_title()}
+        d.update(ed.get_elements())
+        d.update({"acknowledgement": ed.get_ack()})
+        all_data.append(d)
+
+    with open(op_file, "w") as outfile:
+        json.dump(all_data, outfile, indent=4)
diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/op_json.json b/Scripts/Miscellaneous/Research_paper_latex_parser/op_json.json
new file mode 100644
index 000000000..73d6b1145
--- /dev/null
+++ b/Scripts/Miscellaneous/Research_paper_latex_parser/op_json.json
@@ -0,0 +1,47 @@
+[
+    {
+        "author": "I.M. Great and So.R. Yu",
+        "title": "A Sample Research Paper",
+        "Introduction": " \n\nUsing latex is pretty easy if you have a sample document you can follow.Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed volutpat ornare odio et faucibus. Donec fringilla massa eget auctor viverra. Mauris a imperdiet est. Cras tincidunt nulla ut elit tristique ultricies. Phasellus nec orci vel mi suscipit maximus at vitae tortor. Vivamus sed libero vel lacus aliquam rhoncus. Ut in lacinia nunc. Nullam quis mauris leo. Phasellus vitae nisl condimentum quam congue volutpat. Quisque et dapibus ipsum. Curabitur fringilla pellentesque elit, non posuere purus malesuada id. Pellentesque rutrum vitae urna eu mattis.\n\nMaecenas ac congue massa. Quisque a sem turpis. Duis et diam ex. Suspendisse et enim interdum, sodales risus eu, ultrices est. Suspendisse eu odio enim. In vulputate odio porttitor tincidunt vestibulum. Praesent tincidunt ullamcorper purus, quis semper felis volutpat quis.\n\n",
+        "Results": " \nIncluding figures, tables, and equations is easy. Latex also permits easy reference to document elements (figures, tables, sections). Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n\n \n\n \n\n \n\n",
+        "Conclusions": " \n\nMan, latex is great! Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n\n",
+        "Some_title": " \n\nTest title for user defined  section.\n\n",
+        "user_defined_title_for_begin": " \n\nwjlrhfwer ljqr flwuer j rlferfurl u airlf  aiurf uoiruf iuoqir oiuqr iuq woe\n",
+        "acknowledgement": "\nThe author is grateful to Donald Knuth for inventing tex, and making publication quality typesetting a reality for scientists around the world.\n\n"
+    },
+    {
+        "author": "Pratik Merchant, Smit Moradiya, Jignesh Nagda, Niket Mehta",
+        "title": "Parallel Implementation of Support Vector Machine",
+        "Introduction": " \n\nThe name \u00e2\u0080\u0098support vectors\u00e2\u0080\u0099 (data points) used to define this dividing plane. Since we only require the SVs to create a classifier, the non SVs can be discarded. However, it becomes a problem when the points are not separable by a simple linear plane. Hence, to handle this problem, SVM uses what is known as the \u00e2\u0080\u009ckernel trick\u00e2\u0080\u009d on the training data and the mapping to a higher dimensional space is done by it, where such a dividing plane can be found more easily. Every improve accuracy. The role of Kernel is to transform the problem using some linear algebra for linear SVM and this is how the learning of the hyperplane happens. To avoid misclassifying each training example is given by the regularisation parameter. Lower is the regularisation value, more is the misclassification. To decide how far the influence of each training parameter reaches, the gamma parameter is used. Low gamma value means points which are at a far distance from the separation line are considered for calculation whereas a high gamma value implies that only the points nearby to the separation line are considered for calculation. Lastly, the separation of the line/hyperplane to the point which closest to it is called as margin. A larger separation for both the classes means a good margin and also no crossing into other class.\nThe steps to implement SVM are as follows: Step 1: Import all the necessary libraries such as numpy, pandas, matplotlib. Step 2: Importing the dataset Step 3: Performing exploratory data analysis Step 4: Performing data pre-processing Step 5: Splitting the data into train and test. Step 6: Import SVM, create a classifier and train the model. Step 6: Making predictions Step 7: Evaluating the algorithm  Step 8: Results \n\n",
+        "my_section": " \n\nnlwekndw lweuidwe nulei nameiude includewe oiu wede oiuwe dn eiuqwend\n\n",
+        "Results": " \nPSVM first loads the training data onto \u00e2\u0080\u0098m\u00e2\u0080\u0099 number of machines. This done in round robin fashion. The memory required by each memory is Big-Oh of nd/m. In the next step, PSVM performs a row-based ICF parallely on the data which has been loaded. At the end of this step, only a small portion of the factorized matrix is stored on each machine, which has a space complexity of Big-Oh of np/m. For the quadratic optimization problem, IPM is performed hereafter. Let, n- no. of training instances d-initial no. of dimensions p- After factorization, reduced matrix dimension (p<<<n) m- number of machines With the help of PSVM, the memory requirements reduce from a complexity of Big-Oh of n^2 to Big-Oh of np/m.  \n \nHDF5 which is a file format, data model, and a set of software allows users to store data and associated metadata. \n\nSeveral of its features make it useful for high performance computing and big data applications, such as data compression, which along with binary value storage can greatly reduce file sizes. It also allows parallel file access, which enables one or more processors to read/write data from a single file, and features customizable data \u00e2\u0080\u009cchunking\u00e2\u0080\u009d, which allows users to define how the data is internally arranged into subsets. This can be used to tune parallel performance. \n\nAn open source cluster computing system intended for the analysis and processing of big data is Apache Spark. Spark clusters have the ability to scale up to thousands of distributed nodes that can work cooperatively to process very large datasets in parallel, this makes them an example of an HTC system. It also supports real-time streaming processing of data as well as real-time streaming processing of data. 
Some of its main components are Spark streaming, Spark SQL, Spark core which are used for stream processing, to enable structured data processing and handle task distribution and scheduling respectively. (mllib) Machine Learning Library supports many tasks such as linear SVM training,linear regression and clustering. However some extra code needs to be written to handle multi-class problems since it only supports binary classification. \n \nThere maybe a period of CPU idle time if one segment finishes training before it concatenation counterpart. The issue can be cumulative in severely distributed databases. The benefits of attempting to balance the segments after the processing has started are likely to be outweighed by the overhead in MPI communication. This difference should be mitigated by proper randomization of the input vector order. \n \nApplications SVMs have been found particularly useful in earth observation and satellite imagery data analysis. Some of its other applications include: 1. Detetction of faces 2. Categorization of text and as well as hypertext 3. Image classifier 4. Fields related to biology 5. Remote homo-logy detection 6. Recognition of hand-written characters 7. GPC 8. Geo and Environmental Sciences \n\n",
+        "Conclusions": " \n\nMany algorithmic approaches are found to be more effective when kernels are not used or memory is not a constraint. Also, other approaches can be used to achieve good speedup by dividing a serial algorithm into subtasks. These subtasks are basically subsets of the training data. If no. of machines continue to expand and cross the data-size independent threshold, PSVM cannot achieve linear speedup in such cases. There are 2 types of overheads encountered while implementing parallel SVM- communication and synchronization overheads. During message passing, communication time is accounted for.  Computation, communication and synchronization together form the running time. To increase the accuracy, PSVM must select the correct no. of machines.  Hence, we can conclude by saying that when the parallelism of modern hardware is leveraged, massive speedups are possible a satisfactory performance is achieved\n\n",
+        "acknowledgement": "\nI am grateful to my college professors for providing me this wonderful oppurtunity to present this research paper.\n\n"
+    },
+    {
+        "author": "Pratik Merchant",
+        "title": "Prediction of human behaviour with the aid of sentiment analysis using social media datasets.",
+        "Introduction": " \n\nSocial media data like Facebook, Twitter, Instagram blogs, etc. is currently growing in an exploding speed. Sentiment analysis\u00e2\u0080\u0093also called opinion mining\u00e2\u0080\u0093is the process of defining and categorizing opinions in a given piece of text as positive, negative, or neutral. The main purpose of conducting this research is to understand the sentiments which in turn can help us mine knowledge and capture the ideas without necessarily going through all data, which will save us a huge amount of time. Also, this analysis can further be used for a variety of purposes such as identifying influencers, competitive benchmarking, consumer opinion and brand sentiment, etc.\nThe already existing models lack accuracy. Also, they predict on the basis of one or 2 factors which is too less a number considering the amount of thought processes a human brain goes through before coming on to a decision. Also, the inaccuracy occurring due to the automated bots need to be taken into consideration. Since, they can largely tilt the dataset to a particular side (positive or negative). \nThe main goal is to improve accuracy and also to remove the input of the bots from the datasets using appropriate filtering techniques. And also, to merge the prediction of all the various datasets together to obtain a cumulative prediction of all the social media accounts a person uses.\nThe Government or the common public can largely benefit from this since any negative event(protests) if predicted by the model may help in taking adequate protective measures and hence in turn maybe avoid or reduce the magnitude of the same. This issue if addressed before could have prevented the negative impacts of a lot of events such as the Muzaffarnagar riots, FTII Agitation, Pro-Jallikattu protests which took place in Tamil Nadu, etc. Hence, any such events if again predicted in the future, can very well be avoided by taking appropriate advance action.   \n\n",
+        "Results": " \nThere are three machine learning classification algorithms that are predominantly used for sentiment analysis in social media and they are as follows:\na.\tSupport Vector Machines (SVMs)\nb.\tNaive-bayes\nc.\tDecision Trees\nEach has it\u00e2\u0080\u0099s own advantages and drawbacks; however, a few different studies have concluded that the Naive-Bayes classifier is the more accurate of the three.[1]\n      \nNaive-Bayes classifier is a machine learning classification algorithm that asserts an independent value for each feature within a dataset. In other words, each element is valued individually to determine a probability that the sum of these values will constitute a pre-defined label or outcome. Effective sentiment analysis of social media datasets using Naive Bayesian Classification involves extraction of subjective information from textual data. A normal human can easily understand the sentiment of a document written in natural language based on its knowledge of understanding the polarity of words and in some cases the general semantics used to describe the subject. The project aims to make the machine extract the polarity (positive, negative or neutral) of social media dataset with respect to the queried keyword.\nThis project introduces an approach for automatically classifying the sentiment of social media data by using the following procedure: First the training data is fed to the sentiment analysis engine for learning by using machine learning algorithm. The next step is to filter misleading data(mostly encountered because of bots).The next step involved is the training of the dataset by mathematical formulations. 
After the learning is complete with qualified accuracy, the machine starts accepting individual social data with respect to keyword that it analyses and interprets, and then classifies it as positive, negative or neutral with respect to the query term.[2] The prediction of an individual once obtained from the different social media datasets may then be cumulated and then compared with the prediction of other individuals to see if there is anything in common. Common predictions if found any may indicate the mass sentiment of the people and will also hint about their future course of actions if any.\nWhen talking about textual sentiment analysis, this usually comes in the form of a training set bag-of-words already sorted into positive or negative categories. A positive word may have a +1 scoring while a negative word will have a -1 scoring. You can also assign higher values to certain words that may be more negative in degree. Regardless, if the final score of a mention is positive, then the mention is positive and vice versa for negative final score.\nIf word only appears once, we don\u00e2\u0080\u0099t need a frequency table. If we assign each positive and negative value a \u00e2\u0080\u009c1\u00e2\u0080\u009d, then we can simply divide the positive and negative words by the amount of words in the entire mention and then the subtract the negative words score from the positive one and  if the total of our mention comes out as positive, we can say the sentiment of the mention above is positive and vice versa for a negative result.\nSince the total of our mention comes out as positive, we can say the sentiment of the mention above is positive. 
This is a pretty clear-cut case as we didn\u00e2\u0080\u0099t encounter polarizing words that might skew the result if a computer can\u00e2\u0080\u0099t understand which category the word belongs to.[1] \n\nNow, the maxim that more data will lead to better predictive models is not always true, because noise in the data can overwhelm predictive models. The ability to deal with noisy, incomplete, and inconsistent data will be at the heart of next-generation predictive models. For instance, when identifying \u00e2\u0080\u009cbots\u00e2\u0080\u009d on Twitter that are seeking to sway opinion to be positive about a political candidate, we needed to ignore the huge numbers of bots that were seeking to achieve other ends- such as spreading spam or seeking to influence opinions about other topics or to deceive users into clicking on links that generate revenue for the person who included that link in their tweet. Moreover, data about many Twitter handles are limited and, in some cases, intentionally misleading. Bot developers go to considerable effort to ensure that their bots elude detection.\n\nThe generation and reduction to practice of robust multistage predictive modeling for emergent phenomena is an important step. For instance, social movements have been classified into five stages: genesis of the movement, increase in social unrest, enthusiastic mobilization to develop an organization, maintenance of the organization, and termination (when the movement starts to die down). When the protest is in an early stage (for example, of people expressing grievances on Twitter), some stakeholders would benefit from a prediction of the likelihood of violence occurring in any of the future stages. In such extreme cases, identifying bots is a very important part.\nIn this way, the above proposed methodology if implemented, can be of great help in a variety of applications as seen above.\n\n\n",
+        "Conclusions": " \n\nUltimately, once can say that sentiment analysis isn\u00e2\u0080\u0099t perfect, but neither are we when trying to decipher what someone means. Within social media monitoring, we need sentiment analysis as a starting point to understand general public sentiment in aggregate. \nHence, we can say that social media is perhaps the largest pool from which we can mine for public opinion and begin to gather informative data for prediction purposes. \nIn this way, I plan to complete the above mentioned process as soon as possible once begun. If done correctly, the process would be completed within a stipulated period of time. If I am successful in meeting my objectives then, this shall be largely benefical to the Government authorities, the Police authorities as well as the common people at large. \n\n",
+        "acknowledgement": "\nI would like to thank my college professors for supporting me immensely in this endeavor.\n\n"
+    },
+    {
+        "author": "\nAmeya Keskar                                                         Priya Mane\nChinmay Lotankar                                                     Jeet Mehta\n",
+        "title": "C4.5 CLASSIFICATION ALGORITHM",
+        "Introduction": " \n\nData mining is the process of analyzing large data and getting valuable information from it. There are various algorithms and techniques done to do so. One such data structure is Decision tree; it is a flowchart-like structure with nodes and arrows directing from one node to another. At each node, one attribute is considered and further split branches equal to the number of unique values the attribute can take. Each of this branch is connected to other node where the value of next attribute is defined. Hence in going from one node to another, we fix or determine the value of each attribute. Each leaf node consists of one of the values of classification variable.\nNow the question is which attribute must be placed at what level in a tree.C4.5 Algorithm is used to choose the attributes to be placed at each level of the tree. The main advantage of C4.5 algorithm can deal with attributes having numeric data/non-categorical data which is difficult to classify per say and also deal with missing value data.C4.5 algorithm makes a decision tree by using the concept of information entropy. The parameter used here is normalized information gain. Normalized information gain is calculated for each attribute and the one with maximum value is chosen as the first attribute/root node of the decision tree.\n\n\nliterature review\n\nFor the construction of a decision tree, we can use the C4.5 algorithm. The algorithm is based on Information gain entropy. We can say that, if an event is highly probable, there is no surprise if it occurs. This means that it gives very little information. This means amount of information gained is inversely proportional to the probability of the event. Entropy is proportional to the probability of an event; hence we can also say that information gain and entropy are inversely proportional.\nIn decision trees, it is necessary that with each split the entropy decreases. 
Hence, if the splitting is done accurately, we may arrive to a very definite decision. So, we check each node for all possible splitting. cases for First, we calculate the entropy difference and the case for which difference is least is considered..\n\nALGORITHM:\nCalculate Information gain for each parameter.\nDetermine the attribute with maximum Information gain entropy.\nChoose this attribute as next splitting node.\nContinue in similar manner for all attributes.\n \nLet us consider an example \n\n \n\n",
+        "Results": " \nC4.5 Vs C5.0 C4.5 was superseded in 1997 by a commercial system See5/C5.0 (C5.0 for Unix / Linux, See5 pour Windows).  \nThe changes hold within new capabilities as well as much improved efficiency, and include: \n  A variant of boosting that constructs an ensemble of classifiers which are then voted to give a final classification. This often leads to a dramatic improvement in predictive accuracy.   New data types (e.g., dates), \u00e2\u0080\u009cnot applicable\u00e2\u0080\u009d values, variable misclassification costs, and mechanisms to pre-filter attributes. \n Unordered rule set when a case is classified, all applicable rules are found and voted. \n This improves both their predictive accuracy and the interpretability of rule sets.  \nMulti-threading enhances scalability. C5.0 have the ability to take advantage of computers with multiple CPUs and/or cores\n\n\n",
+        "Conclusions": " \n\nThe decision tree is a usual algorithm in data mining.C4.5 algorithm is a wide application scope, high frequency decision tree algorithm. It constructs and prunes the decision tree analysis and estimates, completes the classified data mining by data preprocessing and choosing parameters or catalog.The article analyzes the C4.5 and improved methods for the calculation speed of C4.5 algorithm in detail. At least, it is proved by experiment data set that the improved C4.5 algorithm is well-performed on the training speed classify and accuracy. In this Paper C4.5 algorithm was improved the experiment proved that it has minimal impact on the classification accuracy, but the efficiency increased a lot. We can not only speed up the growing of the decision tree, so that better information of rules can be generated. In this paper the algorithm was verified by different large datasets which are publicly available on UCI\nmachine learning repository. With the improved algorithm ,we can get faster and more effective results without the change of the final decision and the presented algorithm constructs the decision tree more clear and understandable .Efficiency and classification is greatly improved and the disadvantages of low efficiency and memory consumption while dealing with large amount of data were overcome as it was in C4.5.If the amount of data is small original C4.5 is\nused because of its higher accuracy.\n\n\n",
+        "thebibliography": " \n\nhttps://towardsdatascience.com/what-is-the-c4-5-algorithm-and-how-does-it-work-2b971a9e7db0\nhttps://medium.com/greyatom/decision-trees-a-simple-way-to-visualize-a-decision-dc506a403aeb\nhttps://sefiks.com/2018/05/13/a-step-by-step-c4-5-decision-tree-example/\nhttps://arxiv.org/abs/1310.2071\nhttps://www.sciencedirect.com/science/article/pii/S0925231298000903\nhttps://www.ncbi.nlm.nih.gov/pmc/articles/PMC4466856/\n\n\n\n",
+        "acknowledgement": "\nThe authors are grateful to K.J Somaiya college of Engineering faculty.\n\n"
+    },
+    {
+        "author": "\nArman Cohan\nFranck Dernoncourt\nDoo Soon Kim\nTrung Bui\nSeokhwan Kim \nWalter Chang\nNazli Goharian\u2020\n",
+        "title": "A Discourse-Aware Attention Model for\nAbstractive Summarization of Long Documents",
+        "Introduction": " \nExisting large-scale summarization datasets\nconsist of relatively short documents. For exam\u0002ple, articles in the CNN/Daily Mail dataset (Her\u0002mann et al., 2015) are on average about 600 words\nlong. Similarly, existing neural summarization\nmodels have focused on summarizing sentences\nand short documents. In this work, we propose a\nmodel for effective abstractive summarization of\nlonger documents. Scientific papers are an ex\u0002ample of documents that are significantly longer\nthan news articles (see Table 1). They also fol\u0002low a standard discourse structure describing the\nproblem, methodology, experiments/results, and\nfinally conclusions (Suppe, 1998).\nMost summarization works in the literature\nfocus on extractive summarization. Examples\nof prominent approaches include frequency-based\nmethods (Vanderwende et al., 2007), graph-based\nmethods (Erkan and Radev, 2004), topic mod\u0002eling (Steinberger and Jezek, 2004), and neural\nmodels (Nallapati et al., 2017). Abstractive sum\u0002marization is an alternative approach where the\ngenerated summary may contain novel words and\nphrases and is more similar to how humans sum\u0002marize documents (Jing, 2002). Recently, neu\u0002ral methods have led to encouraging results in\nabstractive summarization (Nallapati et al., 2016;\nSee et al., 2017; Paulus et al., 2017; Li et al.,\n2017). These approaches employ a general frame\u0002work of sequence-to-sequence (seq2seq) models\n(Sutskever et al., 2014) where the document is\nfed to an encoder network and another (recurrent)\nnetwork learns to decode the summary. While\npromising, these methods focus on summarizing\nnews articles which are relatively short. Many\nother document types, however, are longer and\nstructured. 
Seq2seq models tend to struggle with\nlonger sequences because at each decoding step,\nthe decoder needs to learn to construct a context\nvector capturing relevant information from all the\ntokens in the source sequence (Shao et al., 2017).\nOur main contribution is an abstractive model\nfor summarizing scientific papers which are an\nexample of long-form structured document types.\nOur model includes a hierarchical encoder, captur\u0002ing the discourse structure of the document and a\ndiscourse-aware decoder that generates the sum\u0002mary. Our decoder attends to different discourse\nsections and allows the model to more accurately\nrepresent important information from the source\nresulting in a better context vector. We also in\u0002troduce two large-scale datasets of long and struc\u0002tured scientific papers obtained from arXiv and\nPubMed to support both training and evaluating\nmodels on the task of long document summariza\u0002tion. Evaluation results show that our method out\u0002performs state-of-the-art summarization models1.\n\n\n",
+        "Results": " \nOur main results are shown in Tables 2\nand 3. Our model significantly outperforms the\nstate-of-the-art abstractive methods, showing its\neffectiveness on both datasets. We observe that\nin our ROUGE-1 score is respectively about 4 and\n3 points higher than the abstractive model PntrGen-Seq2Seq for the arXiv and PubMed datasets,\nproviding a significant improvement. Our method\nalso outperforms most of the extractive methods\nexcept for LexRank in one of the ROUGE scores.\nWe note that since extractive methods copy salient\nsentences from the document, it is usually easier for them to achieve higher ROUGE scores.\nFigure 2 illustrates the effectiveness of our\nmodel extensions in capturing various discourse\ninformation from the papers. It can be observed\nthat the state-of-the-art Pntr-Gen-Seq2Seq model\ngenerates a summary that mostly focuses on introducing the problem, whereas our model generates\na summary that includes more information about\nthe methodology and impacts of the target paper.\nThis indicates that the context vector in our model\ncompared with Pntr-Gen-Seq2Seq is better able to\ncapture important information from the source by\nattending to various discourse sections.\n\n\n",
+        "Conclusions": " \nThis work was the first attempt at addressing\nneural abstractive summarization of single, long\ndocuments. We presented a neural sequence-to-sequence model that is able to effectively summarize long and structured documents such as scientific papers. While our results are encouraging,\nthere is still much room for improvement for this\nchallenging task; our new datasets can help the\ncommunity to further explore this problem.\nWe note that following the convention in the\nsummarization research, our quantitative evaluation is performed by ROUGE automatic metric.\nWhile ROUGE is an effective evaluation framework, nuances in the coherence or coverage of the\nsummaries are not captured with it. It is non-trivial\nto evaluate such qualities especially for long document summarization; future work can design expert human evaluations to explore these nuances.\n\n",
+        "thebibliography": " \nDzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. 2014. Neural machine translation by jointly\nlearning to align and translate. arXiv preprint\narXiv:1409.0473 .\nSumit Chopra, Michael Auli, Alexander M Rush, and\nSEAS Harvard. 2016. Abstractive sentence summarization with attentive recurrent neural networks. In\nHLT-NAACL. pages 93\u201398.\nArman Cohan and Nazli Goharian. 2015. Scientific article summarization using citation-context\nand article\u2019s discourse structure. In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing. Association for Computational Linguistics, Lisbon, Portugal, pages 390\u2013400. http://aclweb.org/\nanthology/D15-1045.\nArman Cohan and Nazli Goharian. 2017a. Contextualizing citations for scientific summarization using\nword embeddings and domain knowledge. arXiv\npreprint arXiv:1705.08063 .\n\n",
+        "acknowledgement": "\nWe thank the three anonymous reviewers for\ntheir comments and suggestions.\n\n"
+    }
+]
\ No newline at end of file
diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/papers/p1.tex b/Scripts/Miscellaneous/Research_paper_latex_parser/papers/p1.tex
new file mode 100644
index 000000000..84f86930e
--- /dev/null
+++ b/Scripts/Miscellaneous/Research_paper_latex_parser/papers/p1.tex
@@ -0,0 +1,91 @@
+\documentclass[aps,floatfix,prd,showpacs]{revtex4}
+%\documentclass[aps,floatfix,prd,showpacs,twocolumn]{revtex4}
+\usepackage{graphicx}% Include figure files
+\usepackage{dcolumn}% Align table columns on decimal point
+\usepackage{bm}% bold math
+
+\voffset 1.0cm
+
+\begin{document}
+
+\title{A Sample Research Paper}
+\author{I.M. Great and So.R. Yu}
+\affiliation{
+Department of Physics and Astronomy,
+University of Pittsburgh,
+Pittsburgh, PA 15260,
+USA.}
+
+\date{\today}
+
+\begin{abstract}
+An abstract is a great convenience for the reader and is required by all journals. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam convallis diam at lobortis dapibus. In id efficitur libero. Vestibulum vel ullamcorper neque. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Quisque et felis commodo, rutrum erat at, sodales odio. Nam mi ipsum, imperdiet vitae augue non, convallis accumsan ante. Vivamus in lacus id nisi gravida condimentum vitae convallis tortor. In viverra congue sollicitudin. Quisque eget leo feugiat, tincidunt enim et, pretium orci. Duis condimentum maximus turpis at malesuada. Morbi laoreet metus felis, et varius odio consequat ac. Curabitur rutrum ac sapien ut ultrices. Donec et pretium elit. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas.
+\end{abstract}
+\pacs{PACS numbers go here. These are classification codes for your  research.}
+\maketitle
+
+\section{Introduction}
+
+Using latex is pretty easy if you have a sample document you can follow. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed volutpat ornare odio et faucibus. Donec fringilla massa eget auctor viverra. Mauris a imperdiet est. Cras tincidunt nulla ut elit tristique ultricies. Phasellus nec orci vel mi suscipit maximus at vitae tortor. Vivamus sed libero vel lacus aliquam rhoncus. Ut in lacinia nunc. Nullam quis mauris leo. Phasellus vitae nisl condimentum quam congue volutpat. Quisque et dapibus ipsum. Curabitur fringilla pellentesque elit, non posuere purus malesuada id. Pellentesque rutrum vitae urna eu mattis.
+
+Maecenas ac congue massa. Quisque a sem turpis. Duis et diam ex. Suspendisse et enim interdum, sodales risus eu, ultrices est. Suspendisse eu odio enim. In vulputate odio porttitor tincidunt vestibulum. Praesent tincidunt ullamcorper purus, quis semper felis volutpat quis.
+
+\section{Results}
+Including figures, tables, and equations is easy. Latex also permits easy reference to document elements (figures, tables, sections). Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.
+
+\begin{figure}[ht]
+\includegraphics[width=7cm,angle=-90]{linear_q_eq_0.ps}
+\caption{You will need to include the package graphicx to be able to make figures like this.}
+\label{fig1}
+\end{figure}
+
+\begin{table}[ht]
+\caption{$X(3872)$ Discovery Modes.}
+\label{XmodesTab}
+\begin{tabular}{cclccl}
+\hline
+mass & width & production/decay mode & events & significance & experiment\\
+\hline
+\hline
+$3872.0 \pm 0.6 \pm 0.5$  & $< 2.3$ 90\% C.L.  & $B^\pm \to K^\pm X \to K^\pm \pi^+ \pi^- J/\psi$   &  $25.6 \pm 6.8$ & $10 \sigma$     & Belle\\
+$3871.3 \pm 0.7 \pm 0.4$  & resolution & $p\bar p \to  X \to \pi^+ \pi^- J/\psi$   &  $730 \pm 90$ & $11.6 \sigma$  & CDFII\\
+$M(J/\psi) + 774.9 \pm 3.1 \pm 3.0$ & resolution & $p\bar p \to X \to \pi^+\pi^-J/\psi$ & $522 \pm 100$ & $5.2 \sigma$  & D{\O} \\
+$3873.4 \pm 1.4$  &  --  & $B^- \to K^- X \to K^- \pi^+ \pi^- J/\psi$   &  $25.4 \pm 8.7$ &$3.5 \sigma$ & BaBar\\
+\hline
+\hline
+\end{tabular}
+\end{table}
+
+\begin{equation}
+\Gamma(X \to \alpha\beta D) = \int {d^3Q\over (2\pi)^3}  \Gamma(C\to \alpha\beta) {|\tilde T(Q)|^2 \over
+(M(X) - E_{CD}(Q))^2 + \Gamma_C^2/4}
+\label{XCD}
+\end{equation}
+
+\section{Conclusions}
+
+Man, latex is great! Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.
+
+\section{Some_title}
+
+Test title for user defined  section.
+
+\begin{user_defined_title_for_begin}
+
+wjlrhfwer ljqr flwuer j rlferfurl u airlf  aiurf uoiruf iuoqir oiuqr iuq woe
+\end{user_defined_title_for_begin}
+
+\acknowledgments
+The author is grateful to Donald Knuth for inventing tex, and making publication quality typesetting a reality for scientists around the world.
+
+\begin{thebibliography}{99}
+
+\bibitem{lamport}
+ {\sl LaTeX : A Documentation Preparation System User's Guide and Reference Manual}, Leslie Lamport [1994] (ISBN: 0-201-52983-1) pages: xvi+272.
+
+\bibitem{latt}
+I.M. Smart {\it et al.}, J. Plumb Phys. {\bf 50}, 393 (1983).
+
+\end{thebibliography}
+
+\end{document}
\ No newline at end of file
diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/papers/p2.tex b/Scripts/Miscellaneous/Research_paper_latex_parser/papers/p2.tex
new file mode 100644
index 000000000..e46bbd94f
--- /dev/null
+++ b/Scripts/Miscellaneous/Research_paper_latex_parser/papers/p2.tex
@@ -0,0 +1,50 @@
+\documentclass[aps,floatfix,prd,showpacs]{revtex4}
+%\documentclass[aps,floatfix,prd,showpacs,twocolumn]{revtex4}
+\usepackage{graphicx}% Include figure files
+\usepackage{dcolumn}% Align table columns on decimal point
+\usepackage{bm}% bold math
+
+\voffset 1.0cm
+
+\begin{document}
+
+\title{Parallel Implementation of Support Vector Machine}
+\author{Pratik Merchant, Smit Moradiya, Jignesh Nagda, Niket Mehta}
+
+\date{\today}
+
+\begin{abstract}
+There is a growing influx of large datasets that are analysed using machine learning methods in natural sciences and engineering. The Support Vector Machine (SVM) is a supervised learning algorithm that solves regression and classification problems. Even though SVMs are tremendously popular nowadays, their high computational demands are still one of the remaining drawbacks. SVM is mostly used with a nonlinear kernel, so the decision boundary need not be a straight line. This is more computationally expensive, so there is a trade-off between accuracy and training time. Hence, in this research, we propose a parallel SVM (PSVM) to reduce the memory used by the algorithm and to parallelize both computation and data loading.
+\end{abstract}
+\maketitle
+
+\section{Introduction}
+
+The name ‘support vectors’ (data points) used to define this dividing plane. Since we only require the SVs to create a classifier, the non SVs can be discarded. However, it becomes a problem when the points are not separable by a simple linear plane. Hence, to handle this problem, SVM uses what is known as the “kernel trick” on the training data and the mapping to a higher dimensional space is done by it, where such a dividing plane can be found more easily. Every improve accuracy. The role of Kernel is to transform the problem using some linear algebra for linear SVM and this is how the learning of the hyperplane happens. To avoid misclassifying each training example is given by the regularisation parameter. Lower is the regularisation value, more is the misclassification. To decide how far the influence of each training parameter reaches, the gamma parameter is used. Low gamma value means points which are at a far distance from the separation line are considered for calculation whereas a high gamma value implies that only the points nearby to the separation line are considered for calculation. Lastly, the separation of the line/hyperplane to the point which closest to it is called as margin. A larger separation for both the classes means a good margin and also no crossing into other class.
+The steps to implement SVM are as follows: Step 1: Import all the necessary libraries such as numpy, pandas, matplotlib. Step 2: Importing the dataset. Step 3: Performing exploratory data analysis. Step 4: Performing data pre-processing. Step 5: Splitting the data into train and test. Step 6: Import SVM, create a classifier and train the model. Step 7: Making predictions. Step 8: Evaluating the algorithm. Step 9: Results. 
+
+\section{my_section}
+
+nlwekndw lweuidwe nulei nameiude includewe oiu wede oiuwe dn eiuqwend
+
+\section{Results}
+PSVM first loads the training data onto ‘m’ machines. This is done in a round-robin fashion. The memory required by each machine is Big-Oh of nd/m. In the next step, PSVM performs a row-based ICF in parallel on the data which has been loaded. At the end of this step, only a small portion of the factorized matrix is stored on each machine, which has a space complexity of Big-Oh of np/m. For the quadratic optimization problem, IPM is performed thereafter. Here, n is the number of training instances, d is the initial number of dimensions, p is the reduced matrix dimension after factorization ($p \ll n$), and m is the number of machines. With the help of PSVM, the memory requirements reduce from a complexity of Big-Oh of $n^2$ to Big-Oh of np/m.  
+ 
+HDF5 which is a file format, data model, and a set of software allows users to store data and associated metadata. 
+
+Several of its features make it useful for high performance computing and big data applications, such as data compression, which along with binary value storage can greatly reduce file sizes. It also allows parallel file access, which enables one or more processors to read/write data from a single file, and features customizable data “chunking”, which allows users to define how the data is internally arranged into subsets. This can be used to tune parallel performance. 
+
+Apache Spark is an open source cluster computing system intended for the analysis and processing of big data. Spark clusters have the ability to scale up to thousands of distributed nodes that can work cooperatively to process very large datasets in parallel; this makes them an example of an HTC system. Spark also supports real-time streaming processing of data. Some of its main components are Spark Streaming, Spark SQL and Spark Core, which are used for stream processing, structured data processing, and task distribution and scheduling, respectively. Its Machine Learning Library (MLlib) supports many tasks such as linear SVM training, linear regression and clustering. However, some extra code needs to be written to handle multi-class problems since it only supports binary classification. 
+ 
+There may be a period of CPU idle time if one segment finishes training before its concatenation counterpart. The issue can be cumulative in severely distributed databases. The benefits of attempting to balance the segments after the processing has started are likely to be outweighed by the overhead in MPI communication. This difference should be mitigated by proper randomization of the input vector order. 
+ 
+Applications: SVMs have been found particularly useful in earth observation and satellite imagery data analysis. Some of their other applications include: 1. Detection of faces 2. Categorization of text as well as hypertext 3. Image classification 4. Fields related to biology 5. Remote homology detection 6. Recognition of hand-written characters 7. GPC 8. Geo and Environmental Sciences 
+
+\section{Conclusions}
+
+Many algorithmic approaches are found to be more effective when kernels are not used or memory is not a constraint. Also, other approaches can be used to achieve good speedup by dividing a serial algorithm into subtasks. These subtasks are basically subsets of the training data. If the number of machines continues to expand and crosses the data-size independent threshold, PSVM cannot achieve linear speedup in such cases. There are 2 types of overheads encountered while implementing parallel SVM: communication and synchronization overheads. During message passing, communication time is accounted for.  Computation, communication and synchronization together form the running time. To increase the accuracy, PSVM must select the correct number of machines.  Hence, we can conclude by saying that when the parallelism of modern hardware is leveraged, massive speedups are possible and a satisfactory performance is achieved.
+
+\acknowledgments
+I am grateful to my college professors for providing me this wonderful opportunity to present this research paper.
+
+\end{document}
\ No newline at end of file
diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/papers/p3.tex b/Scripts/Miscellaneous/Research_paper_latex_parser/papers/p3.tex
new file mode 100644
index 000000000..064034622
--- /dev/null
+++ b/Scripts/Miscellaneous/Research_paper_latex_parser/papers/p3.tex
@@ -0,0 +1,63 @@
+\documentclass[aps,floatfix,prd,showpacs]{revtex4}
+%\documentclass[aps,floatfix,prd,showpacs,twocolumn]{revtex4}
+\usepackage{graphicx}% Include figure files
+\usepackage{dcolumn}% Align table columns on decimal point
+\usepackage{bm}% bold math
+
+\voffset 1.0cm
+
+\begin{document}
+
+\title{Prediction of human behaviour with the aid of sentiment analysis using social media datasets.}
+\author{Pratik Merchant}
+\affiliation{
+Department of Information Technology,
+K J Somaiya College of Engineering,
+Mumbai, Maharashtra,
+India.}
+
+\date{\today}
+
+\begin{abstract}
+The objective of this proposal is to better the prediction of human behaviour with the aid of sentiment analysis using social media datasets. What is the next step/action that a person or a group of people might take can be predicted by observing recurring patterns in their social media datasets. 
+The main issue addressed here is the accuracy in predicting the next course of action that an individual or a group of people might take. Negative actions, if any are predicted, can then be stopped beforehand. This research matters because we humans generally like to pride ourselves on our ability to be unpredictable, to do things for no other reason than because we want to. The truth is, we are complicated, but with the emergence of deep learning, we may become more predictable. Now, the advent of sophisticated computer systems and the cloud, which helps to store huge repositories of data, complements the development of deep learning. We can begin to parse that information to find patterns in the ways people operate. Deep learning is particularly tailored for the purpose of prediction since it has the intuitive ability similar to biological brains. 
+
+\end{abstract}
+\maketitle
+
+\section{Introduction}
+
+Social media data like Facebook, Twitter, Instagram blogs, etc. is currently growing in an exploding speed. Sentiment analysis–also called opinion mining–is the process of defining and categorizing opinions in a given piece of text as positive, negative, or neutral. The main purpose of conducting this research is to understand the sentiments which in turn can help us mine knowledge and capture the ideas without necessarily going through all data, which will save us a huge amount of time. Also, this analysis can further be used for a variety of purposes such as identifying influencers, competitive benchmarking, consumer opinion and brand sentiment, etc.
+The already existing models lack accuracy. Also, they predict on the basis of one or 2 factors which is too less a number considering the amount of thought processes a human brain goes through before coming on to a decision. Also, the inaccuracy occurring due to the automated bots need to be taken into consideration. Since, they can largely tilt the dataset to a particular side (positive or negative). 
+The main goal is to improve accuracy and also to remove the input of the bots from the datasets using appropriate filtering techniques. And also, to merge the prediction of all the various datasets together to obtain a cumulative prediction of all the social media accounts a person uses.
+The Government or the common public can largely benefit from this since any negative event(protests) if predicted by the model may help in taking adequate protective measures and hence in turn maybe avoid or reduce the magnitude of the same. This issue if addressed before could have prevented the negative impacts of a lot of events such as the Muzaffarnagar riots, FTII Agitation, Pro-Jallikattu protests which took place in Tamil Nadu, etc. Hence, any such events if again predicted in the future, can very well be avoided by taking appropriate advance action.   
+
+\section{Results}
+There are three machine learning classification algorithms that are predominantly used for sentiment analysis in social media and they are as follows:
+a.	Support Vector Machines (SVMs)
+b.	Naive-bayes
+c.	Decision Trees
+Each has its own advantages and drawbacks; however, a few different studies have concluded that the Naive-Bayes classifier is the most accurate of the three.[1]
+      
+Naive-Bayes classifier is a machine learning classification algorithm that asserts an independent value for each feature within a dataset. In other words, each element is valued individually to determine a probability that the sum of these values will constitute a pre-defined label or outcome. Effective sentiment analysis of social media datasets using Naive Bayesian Classification involves extraction of subjective information from textual data. A normal human can easily understand the sentiment of a document written in natural language based on its knowledge of understanding the polarity of words and in some cases the general semantics used to describe the subject. The project aims to make the machine extract the polarity (positive, negative or neutral) of social media dataset with respect to the queried keyword.
+This project introduces an approach for automatically classifying the sentiment of social media data by using the following procedure: First the training data is fed to the sentiment analysis engine for learning by using machine learning algorithm. The next step is to filter misleading data(mostly encountered because of bots).The next step involved is the training of the dataset by mathematical formulations. After the learning is complete with qualified accuracy, the machine starts accepting individual social data with respect to keyword that it analyses and interprets, and then classifies it as positive, negative or neutral with respect to the query term.[2] The prediction of an individual once obtained from the different social media datasets may then be cumulated and then compared with the prediction of other individuals to see if there is anything in common. Common predictions if found any may indicate the mass sentiment of the people and will also hint about their future course of actions if any.
+When talking about textual sentiment analysis, this usually comes in the form of a training set bag-of-words already sorted into positive or negative categories. A positive word may have a +1 scoring while a negative word will have a -1 scoring. You can also assign higher values to certain words that may be more negative in degree. Regardless, if the final score of a mention is positive, then the mention is positive and vice versa for negative final score.
+If word only appears once, we don’t need a frequency table. If we assign each positive and negative value a “1”, then we can simply divide the positive and negative words by the amount of words in the entire mention and then the subtract the negative words score from the positive one and  if the total of our mention comes out as positive, we can say the sentiment of the mention above is positive and vice versa for a negative result.
+Since the total of our mention comes out as positive, we can say the sentiment of the mention above is positive. This is a pretty clear-cut case as we didn’t encounter polarizing words that might skew the result if a computer can’t understand which category the word belongs to.[1] 
+
+Now, the maxim that more data will lead to better predictive models is not always true, because noise in the data can overwhelm predictive models. The ability to deal with noisy, incomplete, and inconsistent data will be at the heart of next-generation predictive models. For instance, when identifying “bots” on Twitter that are seeking to sway opinion to be positive about a political candidate, we needed to ignore the huge numbers of bots that were seeking to achieve other ends- such as spreading spam or seeking to influence opinions about other topics or to deceive users into clicking on links that generate revenue for the person who included that link in their tweet. Moreover, data about many Twitter handles are limited and, in some cases, intentionally misleading. Bot developers go to considerable effort to ensure that their bots elude detection.
+
+The generation and reduction to practice of robust multistage predictive modeling for emergent phenomena is an important step. For instance, social movements have been classified into five stages: genesis of the movement, increase in social unrest, enthusiastic mobilization to develop an organization, maintenance of the organization, and termination (when the movement starts to die down). When the protest is in an early stage (for example, of people expressing grievances on Twitter), some stakeholders would benefit from a prediction of the likelihood of violence occurring in any of the future stages. In such extreme cases, identifying bots is a very important part.
+In this way, the above proposed methodology if implemented, can be of great help in a variety of applications as seen above.
+
+
+\section{Conclusions}
+
+Ultimately, one can say that sentiment analysis isn’t perfect, but neither are we when trying to decipher what someone means. Within social media monitoring, we need sentiment analysis as a starting point to understand general public sentiment in aggregate. 
+Hence, we can say that social media is perhaps the largest pool from which we can mine for public opinion and begin to gather informative data for prediction purposes. 
+In this way, I plan to complete the above-mentioned process as soon as possible once begun. If done correctly, the process would be completed within a stipulated period of time. If I am successful in meeting my objectives, then this shall be largely beneficial to the Government authorities, the Police authorities as well as the common people at large. 
+
+\acknowledgments
+I would like to thank my college professors for supporting me immensely in this endeavor.
+
+\end{document}
\ No newline at end of file
diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/papers/p4.tex b/Scripts/Miscellaneous/Research_paper_latex_parser/papers/p4.tex
new file mode 100644
index 000000000..04754c36f
--- /dev/null
+++ b/Scripts/Miscellaneous/Research_paper_latex_parser/papers/p4.tex
@@ -0,0 +1,87 @@
+\documentclass[aps,floatfix,prd,showpacs]{revtex4}
+%\documentclass[aps,floatfix,prd,showpacs,twocolumn]{revtex4}
+\usepackage{graphicx}% Include figure files
+\usepackage{dcolumn}% Align table columns on decimal point
+\usepackage{bm}% bold math
+
+\voffset 1.0cm
+
+\begin{document}
+
+\title{C4.5 CLASSIFICATION ALGORITHM}
+\author{
+Ameya Keskar                                                         Priya Mane
+Chinmay Lotankar                                                     Jeet Mehta
+}
+\affiliation{
+Department of Information Technology
+K.J Somaiya College of Engineering,
+Mumbai,Maharashtra,
+India.}
+
+\date{September 2019}
+
+\begin{abstract}
+This article explains the C4.5 algorithm for decision tree construction. C4.5 algorithm works on the principles of Information gain ratio. The comparison between C4.5 and ID3 is also discussed here. Also, the applications of C4.5 algorithm in forming decision trees are explained.  
+\end{abstract}
+
+
+\section{Introduction}
+
+Data mining is the process of analyzing large data sets and extracting valuable information from them. There are various algorithms and techniques used to do so. One such data structure is the decision tree; it is a flowchart-like structure with nodes and arrows directed from one node to another. At each node, one attribute is considered and the node is split into as many branches as the number of unique values the attribute can take. Each of these branches is connected to another node where the value of the next attribute is defined. Hence, in going from one node to another, we fix or determine the value of each attribute. Each leaf node consists of one of the values of the classification variable.
+Now the question is which attribute must be placed at what level in the tree. The C4.5 algorithm is used to choose the attributes to be placed at each level of the tree. The main advantage of the C4.5 algorithm is that it can deal with attributes having numeric/non-categorical data, which are difficult to classify per se, and can also deal with missing values. The C4.5 algorithm makes a decision tree by using the concept of information entropy. The parameter used here is the normalized information gain. The normalized information gain is calculated for each attribute, and the one with the maximum value is chosen as the first attribute/root node of the decision tree.
+
+
+literature review
+
+For the construction of a decision tree, we can use the C4.5 algorithm. The algorithm is based on Information gain entropy. We can say that, if an event is highly probable, there is no surprise if it occurs. This means that it gives very little information. This means amount of information gained is inversely proportional to the probability of the event. Entropy is proportional to the probability of an event; hence we can also say that information gain and entropy are inversely proportional.
+In decision trees, it is necessary that with each split the entropy decreases. Hence, if the splitting is done accurately, we may arrive at a very definite decision. So, we check each node for all possible splitting cases. First, we calculate the entropy difference for each case, and the case for which the difference is least is considered.
+
+ALGORITHM:
+Calculate Information gain for each parameter.
+Determine the attribute with maximum Information gain entropy.
+Choose this attribute as next splitting node.
+Continue in similar manner for all attributes.
+ 
+Let us consider an example 
+
+\begin{figure}
+    \centering
+    \includegraphics{image1.png}
+    \caption{Caption}
+    \label{fig:my_label}
+\end{figure}
+
+\section{Results}
+C4.5 vs. C5.0: C4.5 was superseded in 1997 by the commercial system See5/C5.0 (C5.0 for Unix/Linux, See5 for Windows).
+The changes encompass new capabilities as well as much improved efficiency, and include:
+  A variant of boosting that constructs an ensemble of classifiers which are then voted to give a final classification. This often leads to a dramatic improvement in predictive accuracy.   New data types (e.g., dates), “not applicable” values, variable misclassification costs, and mechanisms to pre-filter attributes. 
+ Unordered rule set when a case is classified, all applicable rules are found and voted. 
+ This improves both their predictive accuracy and the interpretability of rule sets.  
+Multi-threading enhances scalability. C5.0 has the ability to take advantage of computers with multiple CPUs and/or cores.
+
+
+\section{Conclusions}
+
+The decision tree is a commonly used algorithm in data mining. The C4.5 algorithm is a decision tree algorithm with a wide application scope and a high frequency of use. It constructs and prunes the decision tree, analyzes and estimates, and completes the classification data mining by data preprocessing and choosing parameters or catalog. The article analyzes C4.5 and improved methods for the calculation speed of the C4.5 algorithm in detail. At last, it is proved on experimental data sets that the improved C4.5 algorithm performs well in training speed and classification accuracy. In this paper the C4.5 algorithm was improved; the experiments proved that this has minimal impact on the classification accuracy, but the efficiency increased a lot. We can not only speed up the growing of the decision tree, but also generate better rule information. In this paper the algorithm was verified on different large datasets which are publicly available on the UCI
+machine learning repository. With the improved algorithm, we can get faster and more effective results without changing the final decision, and the presented algorithm constructs a decision tree that is clearer and more understandable. Efficiency and classification are greatly improved, and the disadvantages of low efficiency and memory consumption when dealing with large amounts of data, as present in C4.5, were overcome. If the amount of data is small, the original C4.5 is
+used because of its higher accuracy.
+
+
+\acknowledgments
+The authors are grateful to the faculty of K.J Somaiya College of Engineering.
+
+\begin{thebibliography}
+
+https://towardsdatascience.com/what-is-the-c4-5-algorithm-and-how-does-it-work-2b971a9e7db0
+https://medium.com/greyatom/decision-trees-a-simple-way-to-visualize-a-decision-dc506a403aeb
+https://sefiks.com/2018/05/13/a-step-by-step-c4-5-decision-tree-example/
+https://arxiv.org/abs/1310.2071
+https://www.sciencedirect.com/science/article/pii/S0925231298000903
+https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4466856/
+
+
+
+\end{thebibliography}
+
+\end{document}
\ No newline at end of file
diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/papers/p5.tex b/Scripts/Miscellaneous/Research_paper_latex_parser/papers/p5.tex
new file mode 100644
index 000000000..49f1105ee
--- /dev/null
+++ b/Scripts/Miscellaneous/Research_paper_latex_parser/papers/p5.tex
@@ -0,0 +1,146 @@
+%\documentclass[aps,floatfix,prd,showpacs]{revtex4}
+\documentclass[aps,floatfix,prd,showpacs,twocolumn]{revtex4}
+\usepackage{graphicx}% Include figure files
+\usepackage{dcolumn}% Align table columns on decimal point
+\usepackage{bm}% bold math
+
+\voffset 1.0cm
+
+\begin{document}
+
+\title{A Discourse-Aware Attention Model for
+Abstractive Summarization of Long Documents}
+\author{
+Arman Cohan
+Franck Dernoncourt
+Doo Soon Kim
+Trung Bui
+Seokhwan Kim 
+Walter Chang
+Nazli Goharian†
+}
+\affiliation{
+Department of Computer Science, 
+Georgetown University, 
+Washington, DC}
+
+\date{}
+
+\begin{abstract}
+Neural abstractive summarization models have
+led to promising results in summarizing relatively short documents. We propose the first
+model for abstractive summarization of single,
+longer-form documents (e.g., research papers).
+Our approach consists of a new hierarchical
+encoder that models the discourse structure of
+a document, and an attentive discourse-aware
+decoder to generate the summary. Empirical
+results on two large-scale datasets of scientific
+papers show that our model significantly outperforms state-of-the-art models. 
+\end{abstract}
+
+
+\section{Introduction}
+Existing large-scale summarization datasets
+consist of relatively short documents. For example, articles in the CNN/Daily Mail dataset (Hermann et al., 2015) are on average about 600 words
+long. Similarly, existing neural summarization
+models have focused on summarizing sentences
+and short documents. In this work, we propose a
+model for effective abstractive summarization of
+longer documents. Scientific papers are an example of documents that are significantly longer
+than news articles (see Table 1). They also follow a standard discourse structure describing the
+problem, methodology, experiments/results, and
+finally conclusions (Suppe, 1998).
+Most summarization works in the literature
+focus on extractive summarization. Examples
+of prominent approaches include frequency-based
+methods (Vanderwende et al., 2007), graph-based
+methods (Erkan and Radev, 2004), topic modeling (Steinberger and Jezek, 2004), and neural
+models (Nallapati et al., 2017). Abstractive summarization is an alternative approach where the
+generated summary may contain novel words and
+phrases and is more similar to how humans summarize documents (Jing, 2002). Recently, neural methods have led to encouraging results in
+abstractive summarization (Nallapati et al., 2016;
+See et al., 2017; Paulus et al., 2017; Li et al.,
+2017). These approaches employ a general framework of sequence-to-sequence (seq2seq) models
+(Sutskever et al., 2014) where the document is
+fed to an encoder network and another (recurrent)
+network learns to decode the summary. While
+promising, these methods focus on summarizing
+news articles which are relatively short. Many
+other document types, however, are longer and
+structured. Seq2seq models tend to struggle with
+longer sequences because at each decoding step,
+the decoder needs to learn to construct a context
+vector capturing relevant information from all the
+tokens in the source sequence (Shao et al., 2017).
+Our main contribution is an abstractive model
+for summarizing scientific papers which are an
+example of long-form structured document types.
+Our model includes a hierarchical encoder, capturing the discourse structure of the document and a
+discourse-aware decoder that generates the summary. Our decoder attends to different discourse
+sections and allows the model to more accurately
+represent important information from the source
+resulting in a better context vector. We also introduce two large-scale datasets of long and structured scientific papers obtained from arXiv and
+PubMed to support both training and evaluating
+models on the task of long document summarization. Evaluation results show that our method outperforms state-of-the-art summarization models.
+
+
+\section{Results}
+Our main results are shown in Tables 2
+and 3. Our model significantly outperforms the
+state-of-the-art abstractive methods, showing its
+effectiveness on both datasets. We observe that
+our ROUGE-1 score is respectively about 4 and
+3 points higher than the abstractive model Pntr-Gen-Seq2Seq for the arXiv and PubMed datasets,
+providing a significant improvement. Our method
+also outperforms most of the extractive methods
+except for LexRank in one of the ROUGE scores.
+We note that since extractive methods copy salient
+sentences from the document, it is usually easier for them to achieve higher ROUGE scores.
+Figure 2 illustrates the effectiveness of our
+model extensions in capturing various discourse
+information from the papers. It can be observed
+that the state-of-the-art Pntr-Gen-Seq2Seq model
+generates a summary that mostly focuses on introducing the problem, whereas our model generates
+a summary that includes more information about
+the methodology and impacts of the target paper.
+This indicates that the context vector in our model
+compared with Pntr-Gen-Seq2Seq is better able to
+capture important information from the source by
+attending to various discourse sections.
+
+
+\section{Conclusions}
+This work was the first attempt at addressing
+neural abstractive summarization of single, long
+documents. We presented a neural sequence-to-sequence model that is able to effectively summarize long and structured documents such as scientific papers. While our results are encouraging,
+there is still much room for improvement for this
+challenging task; our new datasets can help the
+community to further explore this problem.
+We note that following the convention in the
+summarization research, our quantitative evaluation is performed by ROUGE automatic metric.
+While ROUGE is an effective evaluation framework, nuances in the coherence or coverage of the
+summaries are not captured with it. It is non-trivial
+to evaluate such qualities especially for long document summarization; future work can design expert human evaluations to explore these nuances.
+
+\acknowledgments
+We thank the three anonymous reviewers for
+their comments and suggestions.
+
+\begin{thebibliography}
+Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. 2014. Neural machine translation by jointly
+learning to align and translate. arXiv preprint
+arXiv:1409.0473 .
+Sumit Chopra, Michael Auli, Alexander M Rush, and
+SEAS Harvard. 2016. Abstractive sentence summarization with attentive recurrent neural networks. In
+HLT-NAACL. pages 93–98.
+Arman Cohan and Nazli Goharian. 2015. Scientific article summarization using citation-context
+and article’s discourse structure. In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing. Association for Computational Linguistics, Lisbon, Portugal, pages 390–400. http://aclweb.org/
+anthology/D15-1045.
+Arman Cohan and Nazli Goharian. 2017a. Contextualizing citations for scientific summarization using
+word embeddings and domain knowledge. arXiv
+preprint arXiv:1705.08063 .
+
+\end{thebibliography}
+
+\end{document}
\ No newline at end of file
diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/parser.ipynb b/Scripts/Miscellaneous/Research_paper_latex_parser/parser.ipynb
new file mode 100644
index 000000000..21d510103
--- /dev/null
+++ b/Scripts/Miscellaneous/Research_paper_latex_parser/parser.ipynb
@@ -0,0 +1,494 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "Copy of NLP Phase 2.ipynb",
+      "provenance": [],
+      "collapsed_sections": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "accelerator": "TPU"
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "69Of-VcYM5Em",
+        "colab_type": "text"
+      },
+      "source": [
+        "# Extracting text from the LaTeX file of any research paper"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "zOiPoXMvcB_E",
+        "colab_type": "text"
+      },
+      "source": [
+        "Importing the necessary libraries:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "9nxilX8BcKKG",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "import re\n",
+        "import json  "
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "-3E-_8Z3Jq2U",
+        "colab_type": "text"
+      },
+      "source": [
+        "Reading the latex file:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "sLQzklQTJu5n",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "data = open('p1.tex').read()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "K-zLqWwXky2N",
+        "colab_type": "text"
+      },
+      "source": [
+        "Getting rid of all the unwanted tags before extraction of text:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "PI591JMak3y9",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "def purge_images(data):\n",
+        "    imgs = re.findall(r'begin{figure}(.*?)end{figure}', data,re.S)\n",
+        "    start = \"\\\\begin{figure}\"\n",
+        "    end = \"end{figure}\"\n",
+        "    imgs = [start + img + end for img in imgs]\n",
+        "    for img in imgs:\n",
+        "        data = data.replace(img,\" \")\n",
+        "    return data\n",
+        "\n",
+        "def purge_table(data):\n",
+        "    tables = re.findall(r'begin{table}(.*?)end{table}', data,re.S)\n",
+        "    start = \"\\\\begin{table}\"\n",
+        "    end = \"end{table}\"\n",
+        "    tables = [start + table + end for table in tables]\n",
+        "    for table in tables:\n",
+        "        data = data.replace(table,\" \")\n",
+        "    return data\n",
+        "\n",
+        "def purge_equation(data):\n",
+        "    equations = re.findall(r'begin{equation}(.*?)end{equation}', data,re.S)\n",
+        "    start = \"\\\\begin{equation}\"\n",
+        "    end = \"end{equation}\"\n",
+        "    equations = [start + equation + end for equation in equations]\n",
+        "    for equation in equations:\n",
+        "        data = data.replace(equation,\" \")\n",
+        "    return data\n",
+        "\n",
+        "data = purge_images(data)\n",
+        "data = purge_table(data)\n",
+        "data = purge_equation(data)   "
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "kFjDJ6JRWDc9",
+        "colab_type": "text"
+      },
+      "source": [
+        "Function to convert list to string since the findall function returns a list"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "kWqaRV4FVsRA",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "def listToString(s):  \n",
+        "    # initialize an empty string \n",
+        "    str1 = \"\"  \n",
+        "    # traverse in the string   \n",
+        "    for ele in s:  \n",
+        "        str1 += ele   \n",
+        "    # return string   \n",
+        "    return str1  "
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "hBi8McmScrUl",
+        "colab_type": "text"
+      },
+      "source": [
+        "Extracting the title:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "fnHec6ObcwQT",
+        "colab_type": "code",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        },
+        "outputId": "68eb56a5-3fac-4999-919f-ee83dea5b5d0"
+      },
+      "source": [
+        "title = re.findall(r'title{(.*?)}',data,re.S)\n",
+        "title = listToString(title)\n",
+        "print(title)"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "A Sample Research Paper\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "lcYbME2oomQg",
+        "colab_type": "text"
+      },
+      "source": [
+        "Extracting the authors:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "CI4olibco6O0",
+        "colab_type": "code",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        },
+        "outputId": "4730937b-4c98-4bc0-cef3-d2be97f06fe0"
+      },
+      "source": [
+        "author = re.findall(r'author{(.*?)}',data,re.S)\n",
+        "author = listToString(author)\n",
+        "print(author)"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "I.M. Great and So.R. Yu\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "msjnC9RKcOJC",
+        "colab_type": "text"
+      },
+      "source": [
+        "Extracting the abstract:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ewj1hcl-bCGe",
+        "colab_type": "code",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 88
+        },
+        "outputId": "b6f5bce6-769b-43cc-eb9d-ffac7e3784a5"
+      },
+      "source": [
+        "abstract = re.findall(r'\\\\begin{abstract}(.*?)\\\\end{abstract}', data, re.S)\n",
+        "abstract = listToString(abstract)\n",
+        "print(abstract)"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "\n",
+            "An abstract is a great convenience for the reader and is required by all journals. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam convallis diam at lobortis dapibus. In id efficitur libero. Vestibulum vel ullamcorper neque. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Quisque et felis commodo, rutrum erat at, sodales odio. Nam mi ipsum, imperdiet vitae augue non, convallis accumsan ante. Vivamus in lacus id nisi gravida condimentum vitae convallis tortor. In viverra congue sollicitudin. Quisque eget leo feugiat, tincidunt enim et, pretium orci. Duis condimentum maximus turpis at malesuada. Morbi laoreet metus felis, et varius odio consequat ac. Curabitur rutrum ac sapien ut ultrices. Donec et pretium elit. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas.\n",
+            "\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "iVVbaGWEpl0C",
+        "colab_type": "text"
+      },
+      "source": [
+        "Extracting the introduction:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "grtQQhm2po9b",
+        "colab_type": "code",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 156
+        },
+        "outputId": "c2e83fc6-f342-4f72-8379-5b1011909daa"
+      },
+      "source": [
+        "introduction = re.findall(r'\\\\section{Introduction}(.*?)\\\\',data,re.S)\n",
+        "introduction = listToString(introduction)\n",
+        "print(introduction)"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "\n",
+            "\n",
+            "Using latex is pretty easy if you have a sample document you can follow.Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed volutpat ornare odio et faucibus. Donec fringilla massa eget auctor viverra. Mauris a imperdiet est. Cras tincidunt nulla ut elit tristique ultricies. Phasellus nec orci vel mi suscipit maximus at vitae tortor. Vivamus sed libero vel lacus aliquam rhoncus. Ut in lacinia nunc. Nullam quis mauris leo. Phasellus vitae nisl condimentum quam congue volutpat. Quisque et dapibus ipsum. Curabitur fringilla pellentesque elit, non posuere purus malesuada id. Pellentesque rutrum vitae urna eu mattis.\n",
+            "\n",
+            "Maecenas ac congue massa. Quisque a sem turpis. Duis et diam ex. Suspendisse et enim interdum, sodales risus eu, ultrices est. Suspendisse eu odio enim. In vulputate odio porttitor tincidunt vestibulum. Praesent tincidunt ullamcorper purus, quis semper felis volutpat quis.\n",
+            "\n",
+            "\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "FYxnWEPUJSwi",
+        "colab_type": "text"
+      },
+      "source": [
+        "Extracting the results:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "YG_GrwJhJXZ-",
+        "colab_type": "code",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 207
+        },
+        "outputId": "04faf45b-b02a-4d1f-e03d-cec3dc55425e"
+      },
+      "source": [
+        "results = re.findall(r'\\\\section{Results}(.*?)\\\\',data,re.S)\n",
+        "results = listToString(results)\n",
+        "print(results)"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "\n",
+            "Including figures, tables, and equations is easy. Latex also permits easy reference to document elements (figures, tables, sections). Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n",
+            "\n",
+            " \n",
+            "\n",
+            " \n",
+            "\n",
+            " \n",
+            "\n",
+            "\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "OCLM7tNVrOGP",
+        "colab_type": "text"
+      },
+      "source": [
+        "Extracting the conclusions:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Xc1qbvqgrLkB",
+        "colab_type": "code",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 122
+        },
+        "outputId": "956acbdb-228b-4ebc-c0ae-3821f5adc794"
+      },
+      "source": [
+        "conclusions = re.findall(r'\\\\section{Conclusions}(.*?)\\\\',data,re.S)\n",
+        "conclusions = listToString(conclusions)\n",
+        "print(conclusions)"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "\n",
+            "\n",
+            "Man, latex is great! Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n",
+            "\n",
+            "\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "J_Lvs9FkMGEn",
+        "colab_type": "text"
+      },
+      "source": [
+        "Extracting the acknowledgments:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "3hL78TPFffGH",
+        "colab_type": "code",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 105
+        },
+        "outputId": "95857884-8fc1-474a-ca0e-871b46175a3c"
+      },
+      "source": [
+        "acknowledgments = re.findall(r'\\\\acknowledgments(.*?)\\\\',data,re.S)\n",
+        "acknowledgments = listToString(acknowledgments)\n",
+        "print(acknowledgments)"
+      ],
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "\n",
+            "The author is grateful to Donald Knuth for inventing tex, and making publication quality typesetting a reality for scientists around the world.\n",
+            "\n",
+            "\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "7yu6bbjIbhkr",
+        "colab_type": "text"
+      },
+      "source": [
+        "Creating a dictionary of all the extracted text:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ka6qb6bRM0Hx",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "a_dict = {}\n",
+        "\n",
+        "for variable in [\"title\", \"author\", \"abstract\",\"introduction\",\"results\",\"conclusions\",\"acknowledgments\"]:\n",
+        "    a_dict[variable] = eval(variable)\n",
+        "    # note: dict_1 is not defined anywhere in this notebook, so a_dict is not appended to it here\n",
+        "\n",
+        "print(a_dict)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "uE4P_xlSbmlj",
+        "colab_type": "text"
+      },
+      "source": [
+        "Converting the dictionary into a JSON file:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "nErVmQx_Z515",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "with open(\"extracted_data.json\", \"w\") as outfile:  \n",
+        "    json.dump(a_dict, outfile) "
+      ],
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/results/Capture.JPG b/Scripts/Miscellaneous/Research_paper_latex_parser/results/Capture.JPG
new file mode 100644
index 0000000000000000000000000000000000000000..f8f2353d59b670226bcc3339aee51f9747deab90
GIT binary patch
literal 24998
zcmeHu2UJu|w(cQEQ4o-vlah0eN@x%p5y_b*G|*%uNmi7oG?J6#oEw@PBnkqOb54?z
zAVJ}^|37!;UgxfvJMZ1~)_v=FSk<Tcbe&VRzuL9;cXn;AC$ATQTdGPRB>)W#4KPLh
z0oO~nW)(f5Rsf)`4%`C(02jbSlLXLFHEh%u)mKme6IDk;)iqL5f1UwC0Jzn&MFZfV
zuA%CH45|P$Tc{7*pS9n|Zv=iL@Ed{O2>eFiHv<1ZBA}rO(vt@OthX3{@CZ^AUr2k4
z`PX_SJo=A1hPW62z~O(Z|BX}N!hhoSn^XM%s6D@B{*AzI1b!p%8-d>l{0#vCJ^?-n
zK2ZriF$O+<2|*DFVL{+Oy9NLnfF=L}^Z*nCK}9zRC~Vx^oFsU89b9?LAdcpiJQj{H
zUQaV8UVa`vUO-CL)5*-j&eDy++|nBAAkDtt(8A6Dg-EmO2&?m{JKeXmfvR}BSU&N7
zq-Ei4XCV$@mzBAJFXbuW33Gy3x|uO}!t5PfB|N2>{&>0ss{ZpbFO!Uv3&cu7^MTS|
zTA;p3GyP>O9v&V%9)diMF4nyK;^N}Gd;+`z0^Fz;+^${@Zf2g`4zA39dB6ipR|^-Y
zlN;30f#K%^&CDI$-K3e^-JuW(D>Ex$a|lG3+uTeL!p+YQG2=Fa2=a471O*|Y!u(<a
zR{Z8nf9W1#@z>s+++FPd=o@0eYiVx@vvhECMWMmZ%f$QVp#Qu2L_ze|PXAN*Q8J?1
zNZfa^G;_0jfGRRSr<9vdl$%dX>))oCM_g1)iuadxQoKLG_;&;S?;ZJn2`w2Zh=qj3
z&#^nY{Mr2zOXvTy*|vwu{6U`o6c6?7&kE|fD5b;0!;|9u_g(%y27g>Z=^X05ziKGz
zPKN&r$L~)5C6Ir^^&75#34wp9@o(+=4cEVfz`xY^w|4zM2G`$MuB8Kt+IpZk?DY(A
zAHc!F!p6eH!N$VI#lgYFCni8~41BViw+M+T$fzhO$SCem)3GyB)3Vaup<v=+V&%BY
z$;C;<z$?gmPk{X%=e?f~LBqkt#mB=ZB_JTZM?*n#?_WNyTLEHRzy;G11C0qlCq}~{
zM!W6+=uvx)Sg2jfpBt5bIndBCFtM<4aPja7Pz|bY0qAHL80eT7SXh{-ElM<B)b9W$
zF%}6UzdZKs$7VQ8&ZGi?(HXeR3KeZ+nnMRHg61wkc=+UZC@85|+1NSm-V+iQ5fu}c
zxc@*=Nm&J?`b6ugwvMizz6He6%Gw5M>+0tI%)`^mJNU)RkXNrm!{9ORV&mcy5|c8s
zvU76t@(T(ps}R*S$lAL45A7YDUEMt&dxu9x$Hpfnr>2*dS60{7H@<9c9UdKjJvse$
zc7E|wE;In+PqF?g*)MVtqvS%z#KgeF`6(9~x(BLY5MyF7@?(?8KgKb0zRe^Mh)b#v
zol((-$1JFMKxXbTgip>Qw8VP&Q?x%M`>zQW^uHz9Uj_S{T+;v{1{!MeFo*$J;NXIz
zh+O-QcVrK8WY@YOCFp^MlF@70M>s-g#$oW~c^|b`$c7LVwuoKNHO=C@&%r$fmgRN9
zU}O_q%3!vt_z?w7Y}cZkI3~osTOTM<)PsqfTYKvFE~-X*^;3o-xqLhvnQ3_DT}Nof
zYypnUFo2_oPuFlxZ_Y@b?hHnzZ&8b$k!&M2pz;wTDv`3F7{P7w1PT2*ckgtM%)Eq!
ze4i;eV(~413b7n1N73OIeZ;i$DC6)xr1B}3gZA7fQ=+_Q{DHxm-Aj>vc97av_eZmX
z6ba!a*)MhJUeK5Z%Tc;%^+R3CB&r)TN?Im@UhzL~^K%3z1P;1j>4z(fu8(pT`*m`8
zh}u%&uA6Z7KlvOqiAm$UJslwUrRbT2c2=S4;bz<&9k`OfrXzuhmiJq<&in3m&K54G
z6Xj}-Qr0&~^B>2zFdT89-{UCKp@PljrbB(4$@_dhuj9-T{m8R(B2v0p0zA#k@qsQQ
z!@SI*8IxtSB3^JMh>1xbcrRzZM-QtoGjlCi=(Mz;fG=Cr2=Ux;{`!$YQ{K|ut%}+J
zntO{8Zb4(&SWT-}@!2N{2nwro+a3$}WWR^0z<>8`$BY8`G_}|lFy&_2Jv(j!ozdNC
zUs2VjhL^fUAgqR-_bXy7gw~iESoj>70e>q+qEzl<C9x&5hp?bSz4yMY`}i|pVAVzP
zGnVaN8x}ErSVPo|VtQ5Uz2I7UMI_HaoN|P=L9zIkM)&ke@@kH*h6tm*@>$+|QfT=6
z)hzA(#i$nWSdsKEMICA<e6k}FSFjG23a5d-73CMhV2{d;Yz+y6dqQez;dt@((c;R6
z8<;f2k3Xl5ea0evEca)5n!dALwo|st%lYH>;H7M0t$*=M1I2*1M;8ZrH5qpk;#)ic
zR<;0G(AwUa(MpU`3}aFY)wPb>F4Lv<CHV9?oyU}^U1GDlQAMf1Y;;;yV84Vk$pwrY
zrp*$%%fO(jy)3X9H!VoF+j@I}6ip$F12q}<41A#bJmdr_kTk1-$z_F4P6AFi<?nIx
zi}*Gtbf|p$zdm-TsfOsKJ5j5_??kL_;N)ndZu@!b-62Jm5f+Us5OcZK14=Hi9FD8p
zjm%UQN5V4Fo}r$Ir$Zt5(WAtZBQ`*0@s@=9c&tBb?04g*9lG<$!4?d}JuGfaS(ifB
z0OV39hJEVWx!<$)DnCJ_v|yC89PwjClX%6JU~qX;5?R!YhLr(S|0!B;&G?`!&rR1b
zAVbk;Oenmc$}@A6L>QWRTgf7kRYyUEkP;6kV_z#D$PDB7ccscVZw^|iOuOUvq>toN
zQSI&O5Usa2DQOmu`vL9$*!16St8>ntpN2m3j_i=+c2o=Z5-nILU#i7n#+e{Rd-NxC
z>#5oETV1QrgY(>R?Aqjwr!!PP)>k7=zKFlacYCY|kp4FovI$QYg7AF|yEq9xFX6jv
zfvZ6@iB?GJK>VyYv@kf%ieV(FTe<tgFI@K3uN|swSVkBdYJ(p+B1*mxj#prrq`I`*
z<xc~vQ3=c02iL%Nb;OM)^0GoReshqU-@isY-;$n|y1K`iT)@4@hNuPYTEX$M#6isF
z#d)K|J-H{^7{d;`>o60IiRbZDTJtn_X_qKMQ%hc$VYtn;tj$bF7a%3LOIDxV-Xgb_
z8(yhHg!vf!5RtTUQW`(n6Z2BWOr2~M-!;UXmdUecLaoJF1)FKz#vV$K%*f6VWd<S;
zjd|IZwA-WAbN3%1khea#GcqFMtUn;*Q>SL&62!xW-~1<@eN^pRyI{Sb3nWT~JO|~?
zZ28e8$j;=n_h)unI}CX}dMKAw9uuos?Bay?vD89oY;oLO&E1iJ<yfAq>rtX9JeCNE
zjF5cR66bmTabPJXdZ6VyN7-%v!PA&)z(4yMSaI|J5Oyf~{ThJEe!Bo)iE1G3Zdrnm
z?ak6rw3?o2BN{cjrNfe1H8#t!eA+7UYs$tHQC`5eq9OemPoEDOouz{wm`NV5A<R3@
zoI(*58H-gspXGWSyuTJ)151jY$yfe}^vfUY*8p_^`x*OkUqJ*?KQ~$E1z@Zes{0lj
z`1qzA@Wr|_0^ZiT2Ff%R{I}KojbCnaKcao`W#V84q2IYG@ow5XQT~O>QQgs$6JC!w
z+N^jtc3n*NR_H;<U8Qu-4EHCljSB6=Ud$xc^d3J-V^Lk>b9O8o|3C@2uqk^sM}Lsn
zsD2H|PJ@43p#+Z{8k)HnUz)hsH3y}-Z{N5<f_7st%vyVlO?xSBt|o<Os?Bxp-kb1h
zj_Motq;0#kYHI7t#}Z9NikK2I-`<I!{nA(vS`Ts6ZuyTAW?hl$gtly4g06v<VgJK=
zU&kMA*8rog$0;P~WqK&Dn;nSljs8eReGs)9&;7)SemNX>r&*%OB2Zp&90BdZg+r4?
zo`F0Izizj`5W#w{uGgbha-bvoNQ*95oNQq+J)8v@g)=5J9>UZm*58q%Q6nICbRZa*
z{yzU#urnk%dAI$K%k@sK5Ncm*x2bV?t9?~x9v-Znrdc<><{rIZGGuM(5zXpmWFniL
zIl^Of?gEPVp$Eu7M1DzD^{9IfQzJQFWvJOZyq@SGm5zuzv6}o5DI`E)`mbRAiZ)N&
zRSnhWB3IXZ+cfmg^VDMORS~m_`D%{29~P{P7B?7gzI&FkYMR!VG-1TXyBnV1_!TBY
z%gVbm0!lDo`jPRlm#-2nF<7xKnYGEY(Bzly#v&ybyTcH3kH#+u`e}1&60PKpdbMI1
zB9<hapOVMuT(y+GK<o^oPPsav%gGOi)n_j(o8S047F#?zDtXx$iC(~3(Jn^Fe4Z>7
z(+|~~Rn9fqHfBP+)bG|DS#CHA=zTkqIY_)a;n|%*YV`c}q59nFgn;oT+In=hX=Psk
z=wtWZCjCE}i<iSXSBNU!U6a9+Iy>#5H7fPJ<m_&4lm$}Zj;swNKvX1mSu&udHw5dB
z^WV=rqw6^fuoWSh+AiUZA*($G5i7OQergcl4-THlF}Ma^k}W%)Uk&^3e@*}H@aF8X
zG21nezRP}f4WRU_YpblOrNrTZg{9-R9i4ySg1@buC~}>jxV{$d$VKyFwzDGLe!`^K
zR)oqZF2>XiCe`Vc*=3d9V<Ot`Bp2_8hov%rE$W&!xtVdMSgeTY+#I_EP1M3*iPvdM
zxkxR>2A5+@SN@{s*@#L`U*)ryN98rhx{!tTZys^;?k%U!Yo;|7Z}Y5C+)>#XK7Z}p
z!|qIj(WyF3TQcz)vE6X#R)I)bcCQ)UcT0e=@-WX)EvcS=Y_MHRfE<2*YVyn4k^PTL
zE0<Q8Cjwa?TbUc{YUc~Hbml5^U;6rbHbBQZ)|MU;B}yC<7;zhpr8mA)?4~ZM)nvrh
zzkf%)@~v-Xm(bA~l$P>)#yjl~5&0|3Z@r&_e?{@jFEGbUT%24r%e9oXW;t8~)6=wj
zMN8>7r#}{|c}Dl!(PkR{NMpQp4rj^+1}HWhX$7~7a>UYZ;%!RAr9zpE2|DWZO73|3
z9f4zAWIcih`-0y}^K1NKXqA68#Az)Pu`P>1Lv*$+p1yWtCv!I-&11!RLx>+DBfHua
zP*>^v01$IeM)?ZimR21D34lSdT-<l?Mi-9rV7QT+pz<xNTG4Wk^Qh(Pp6wpy=^{H>
zTq%Q6yE{~|bMs*_h9VofIqx{cx+xWWyvL0D{f%Di4SCbwm_8pjaU9M{b06hlcp9$~
zHP{cX>X9SPBH6A((-w17{Zu);yAd*OWbqJ*n7Hb8JDlvy?Ox|23Z&lY$ZWt8SQKU9
zahfFKd2<llS<jXqKOXPR*&DWn3GdY*%mF{3O&H_-741##tNR2k<&&F9Ye#3#6}=p-
zZTJhIq1dcY+J{9%dRk;Nt>+*d=4x{&RcdoX*%p;e^(!!Pd{I}E;($47O^ad{z(#ZO
z#0Xf`UicNof&<Y`;`(}9D$UB2klU)bGA&zQjLpAtdAh2Ew6{yZR+Ap+m@2w`D;I%J
zxb!h_RRprf^(NdC$cd+c6WLbx$E8~=)vnbH=QUSJXnv&aY<@#iQPL$!Nw?u;7K%pr
z!W1pCX%K`H@Z9>gK3LH6+1AxZU4w;LHJ9NGuKwBxOe?~UwRgu8!u{94e{`(s4GS0h
zSN@ztT^<p8!jii<poXw6N}-7n29mk<ya(js7`LAEYxSB&z%Yp>*DJnbIXb$pM9NUl
zc<z~6FnU^Q^qxgV;mWFR@5<nwg-vg=CaII`-VqOhZs5;)Ocu_6Cmt?d_)asyv#7hf
zD<y8;hNR@)bLb-u?%o$A^rKuU0XuD4+-#2@r(>>-wHmfxYC%+|?#7LGi4(3R@?l~Y
z_Vgql*22Aybdv30Xyc`t3m>L4XN6y=9{ZbSQRK|KvQOzIe5fxKj1Hm*{FwJowAo!}
zQk$*f^X}KxCIUk5{vC-T*2R&po@KJnNXfulmFKD;CsiWcE~R;UPrG&z<W0P*!GS~{
z=}ErkIJgi8WvsT{E7*`&>xVq&*`%yr9n7!peE5r?puuRdTdJg5rO|D$Tf-1>PJGYf
zyRoA$V}4pV91O}0>54q**tzsbp?pLW6U)JovA%EGg(Rt?5V-=yEQ|}p9^EiQi*lp?
z*xGL9LD|<ip^nVFoxDUDOMNpFm589_1w$W*k0-f+1;-<bD3jO$1NN9fVYSw0Ew=Sz
zz0e_Bom-n_+>Ipi8&-0gL3ip-_eXN=3M<YymkB_h;xXi#HEW4y!A_*~CU5T!u0(lj
zhkfTc<Zkr1n>XWW+hJrNjhNT^2CLI87#wWCgssr!iLVfo^^rHWsFC@!q;I?46(Ubt
zDmAY}hHJcfRC5U{sfnQ-EfQ=X;{HgSF`2LX7I#oSMqeT-gqtUi3g=bvUcxp9yh=B_
zk5rxTQF}@u8)zdl$Vn0MtH$aWEa8M3$f+~TeQB{)i*cdukM~>7a86C=!pSl4Z7F%e
zHfVPZXqSiP+BnI<p-GVGnz-Y+@IbR1)*0e;fx^Uyz~vA2oB@KVQ{pQahNH!ZR8h4`
z_1oZRp07es%Og@%OLs=dLXf;B+R7gj^y@s!&<D=5M7ep?4Xr;leds9EW{B~539apv
z;d0pf8Zge<b`3sQt4`erMa;m|Yo^s*LixFoYg}j>$nhqTL2wdGVtk>K%Dp6H*IWl#
zMIPNz`rzhTft-#L51(jyGGL7(G}97%G1gzyNtC{ntZZ*?<KU>qE571odAdqho&0yh
ziMy)DK{;e2H3_3$=sf>tCJ9K5=#%pyu=Hv8jkoCx=VrIa(E`o0ONU+FuU8_B?&vkt
zjXDTR3&ToxGeZ^YZHPgfoONh%=*0YOebLbYXhr{&OR+F@%+tL)%j?wE_j?foUG7zW
z-JMtTqLwcUOBsk2w|}YKEu~Eq_}=cuVme9Trf6r5E(srZI?qSZ0UVmxw!vKS631W7
z@2}1-pd#(U!r1c`h$!tsRev(nGVWOIO<#lLalK@w=$>gizSs(|oQn+WT6ZMsTz(Lx
z{!FdNU3Xa#cG~%lqmEV-wmDX_;$d6;D_*)I&4ivs>Z7}k?;D4v4WMz5n_M{{P27Ro
zd1RncyYlgN9;T|1Ri#6bSC(eCxB8*z)WR<#vy^%VBOFZg`b;<2#oKbfG@FD-IcIu$
zDw^|T=wa>l$x{pVF;zSuYK;yoC!}9yZZbli$uH|xCkj(XFI`T+@3Y4^b6x40PYsaC
z%POdxU>6yGzjOF3%U61BFDFM$Y7Y|qY1jq)qpEmpr|kHu`BZeYzq)?Rt%cw$3}&E;
zL^eFJA)$^G5POS049sh{7JDDb|G*Y;=ceR7VY~)#YhGD+3DrmVJBIF(xFeOqM4nE!
zo;$;bY1?V}qa&Wu|1-2pBt;fZkmq<?h9MKmMkWtN>=APkJ~x!?R=tw2qE|yL-znGJ
zhx75>C#1uX(s`FZyXc+b;(RfY^NYoO^k&f`j475S3dI8nBPhYijH~`L5|4-GA{R0W
zioRSt4Og@D&6B4~6h6u3H{^LY<ePL$Ik9;y3k`V-dvP~qLu|RqaV=bNOlZ?VD3ohM
zT~Dr|_H#xxQWx+1?X7`QB{ZGOR!GQt%?a--I_^=^QBwAhUCYrF;nf3>i?O}P_vcGN
zk*)Oah=P%1fxk{YD#B6L@V$jtsOGAr><mYErS?SYkaqAiCjb#K`w2Fyn~iR{@C|wI
zjbMZeXnX&Or{)K{RGcU2D&?;<#^5#<Dr-p4_uAxV8@P*Q8t*G`fYSlHqLC{?yKz73
zX|sc?j24PZ_0G>z5nqo^PrePRT3!QfCbx_RDh(pBI~8{MXJ!p!+1b<QvBkeTcdI58
zao%WXNRr5jE!w7fxhkTbKX(n-DDh<03}?1D-+GB(I__LyuyH=>8UZo2#&U2hZKU+K
zTk5~dddXASW<T>V44M&;fe+D)@E|m;``FWE`%ekvt3u?-$oD{-J&^Sh0*qrw-Di{g
z`b(q-O~vNx{7MV&<uAyDVdvh!2#6b5<gzr*)0hwXH$Ej1mK#;|)2!2qDLgVFXJ{jJ
zaScp+WT*3wxpAR)LwE*uUXJm!T^dR9<AalvlSc37#A^233Gq%4^91HNjP+^nz(KN@
zWs{pGYDJqGa)lETVPqjgJ51$i0yOy~4_Y_|t=OYC!%(>V69?#i9yFv!=|?ln3q$ym
zD3XVhU6ol%5wkejBGvI`w^n&@rKP)pQ?e?N*i2jf&XaOvC6Y>d)MLU}HU9?AyWC~|
zs0;`NM$IyoZVhVP&KS{Msz>cB^35Y)%tDoWSa{6zY_hL6f;HO?;ybnLxh{N*lL*;X
zvC=jzQ~K{=G?=lf5Tsb!P__=(G@xGKFZM)A2k$o38uTkHuQ5i=S;c1G>foygazzle
zTB8@^BFc6auL1i;(S_soo$xhl+2fH<gWyxyu7<(CTd4Ht-yahB&HWKH@IRXqn-W4l
z8&p>_026a}g+uf7S`TgrXd!g<^#~#_O@P{ahS6|r9E`AIEhV_(14=6fv@N@_>hu?M
z+!Mm#k*a(9il9<z*J90HQW}eh4CyA@_+PlHJO7j+k;$WjlFpV0rPyU$zfW<&yLZ|>
zVQ&lf9kkMlx^W7~klA35vMJ9jo$8_GQFpd9Y-D6{XWL8N+7AVorW_b=m4TH&eSXwi
zsy)HIwnAtHUQFLx){ZRiRoXn_BLTGTYQ2AmGC1#K!K4+BTT0qt`o~yjY)*Duw?+uw
zB64`8kr??dlEZw5oT8na(%3a!?(UNZ$eP*_({Kd@jA7O|H_MCry(14xk|K4kW39qh
zh7d8DG%TyXBN2`=w>)dUsg3MdHVi$`#@{B^vz%?4tQh>b<zpUd_ruFSA$GXB_fC>u
zDs0Krbhy;f-ULJ#FH)~6!FA~Zv3q|xS`<X70-!~B2tU`!*E2Xv`lf(9n2V?&N9vMK
z--7R-Q<nEOL>;K^(hvq=yC(lrmnwNVK>)R2?v`+6&6Wsjbvt9NgNq?9Q+YV=u29^F
zF1OO6#Bko9Nm?x)9@=~Aj+YxmNIQthq)nH8*OI=(oBfwr{560EvJowa%$RJQ7AfKl
zfwfehaIC$wP%9#ZD+X-%6l2!Pij1+RV9W@|tcY=!G#dGkrD!OGukDA;UhRoLw1qtV
ziuR50Sr=#7KV6CCluJKE?o@sD8_b!Svl;2+w7U~yP#i+u=Xm2T<ttk(&Y-#huYNW6
z+-z?SL7SSlQ04X_NfT;C%?Ty`s@ue=ok5X>ULSkJ+5V2`*qA(LJ~Ym8w-jEzpbl4I
zLh5!@y^Ln2Kr63Xb#J4a%zU){=o&!L@<20giEBXL&lk)dq8$Q>MUPV=#CY${;$47A
zyyIDVDtKc)d5KnG<^C&;B!i-KnfVXBV$(|+rPw{A*sUMiD5N%m>@s4O=*qXyH)Y5o
z_Ha1eebb1X`{-_hyCdjqr45!u!%hr*B+b}@t)W)Mj7VghZ=qi0C$%DemX(4XemTni
z3?ZG{hF;{~^ox^RKjv*tP-$sbapH+o*jZv3)i_uVxCS{N4dp9iCoWxXp9@6_3w!Qz
zMI1II5jzSZ-d}x*EbK%lsqK2?ob*pkddzg+#putF9$%ySQr+Hnv*O&B7F2Ce^tpZ-
z`}_*VNWIsXQ3Bn8XZTW-Vok5K${VUHl5pLM60>+MiK7j9?t3MBtlsVxZur$%+Ub42
zcc_$}3$}wa(}vh3P|X{(ardbXRrC?^aR^`wkkULJdaqOOSU1p_!~HEd!>S@i30Z^C
z<FRfSUOC!p@szRId%e&UOvpijB5E}{pzurcb?P6~;8!5EOQB&a@}3r(Cjt>%0am6%
z4ptu>jqm+xiGI~#tsHA-;hdJT?Yih?d6@f>Pf=`nRW*D>-%cg6q`v!gW92irxPul=
z4VXP?d3*TR;N!1x(APyw$#v`RbaH+*4u2N|>F~-S+w}h>;y3gwV8DOYYFy{_6`5Tc
z#pm2H$34V3!;5^&5M?e8zQ+L?M0w)@VoCQpf4m$>^yN<d>Zf%`BjIo(N^sP9zUVWW
zq69~F5+sZi%JK~Vbueq_7hul@E*hc}2C-RHy7#3WOJxgp#|91YSciRBn9~5atkmr?
zAI8=l@61A2(X6CI5kV6M<SCMBp#8(54z+gmcd0oQIhq|fpCRQiO2%LZTt92Em@Xcm
zwu^F*HEBmnric*Gy8Uhx3s*PlS0DVa0Q`G%12Oqi`uE+v%-kIv#jx+@yo_bRguqx#
z4+qIK|J?A~k)c?SMn-WlL~(#><8N@X8EJ7s?#g;dL83gnoK&QI4M$EQ!JuQ=$`k4a
zUP0=seo5s9T0um_iO_hE?U()U{P#A~NJsWdcpNg|Fte;QP8K$`I>6)}yF)k~0fctW
z^T~h&n5|p=5Ifew>Zml?$Jv8&WY?nOcDneuQsSH5ry7Jjc<25vOefyvy{^vRW3PRq
z+X5<;yWN9)L}f^Lqg&jbNIc(kteV<@(!Qi@F9VS*V^934k0y~qT*JxpB%_J@2h0w8
z@2Vwr5Y$vRb|U-g*w9cU-P%h)2!4y@n2J2ynZEN?wCRA%)9I|-EiiBm^f<P*?3wtB
zmWfl1=TohP<ako;iWm}0`kgyR$jb6{QwR^8>2zsNi<B*$mz7xYK{=f$KHL{dv)VDG
zK?|BktT^~wT&ZK6+57S;dgXoRH6gqP2BS)@0Y3}W#>hMW4<`qM-_N!dzOBn%iE9cD
ze<?%J+Lzl)>F0-IE&HGRzq`CSGj66tMPq3VuD*9)1Kk`LzZ?CA;I|h1UI@Q!!EbZ;
z+avr-PH{W*tmztH8GMX#!1O<;s$;JKf{Aw7<CP-+g@kM1Y|V9E;;bm+EWPcU_tTf9
z88pX2mijiIIi{DQR9w7oswUbCIW<dN1JNs**MP78SFECIfY32*rknhhGAYZfT1adR
zPa#<)X?jDNfU=Z@7z@n}pQl$d6Flzp>^;Ym^t{h@YXSGxLDg{ZxUe+)r>nkEhn>bx
zj+a8abUSVX!-AtDA)zE=L)A+!seHDK8RG+UGsvf|IOBV}Pr5%{0|!(M877U_08yj5
z-P(sxQa*mjL#blxfenZ3z}6icc{pf&@1!5BQElh5J>~E?_jtB<CUL<=qw~!trz}*$
z)aWIccsq`~2xd{dMLxcrT5;d)OBOpjK;xb_yfATc4FE;k{*u>#$%Ly&!#8A2!(m=f
z$^IDKyt+)k5&a&LJHR)zwG_Rky2N2br($GOPyNPRZ1J;P6GTYl5ZZ=St31m!;Q8b3
z&dBjKAfVqg5xpg$Ve~lF(ZQ;D)V})e%ci-n?DnqbBo$FCYc-4yOvS{zduun|9+Dm`
zY+jMo+3;4!LpNTg?Q=T&g&n?a;aSIUl_=v<{vuAD>iD!N4R2pqvM&@iU<NmMW%YVP
zToX)-16GA~l@%B*bo;vJ8>xGcOErYDSw-`?@O7-3j)0&3c#et_c5t_x99^|wAP&BS
z47Oh}k23m{kf`vQgZw@fe-k+%cbvP+s&XC_r2D2}0E|;S#`5N261PIf?4+7|q4H^w
z0qLjs+X+mTAxcXMYw6GAKbk9~Ij@{fH|JRTu&$+LxI4P@JpZ_y7`wpE&PDVHEVOOp
z__Px4#dRXt=TRq~nptZpkmh%Yla^OjkfFLTk+)%HaPq^SxqQ?uRian+mdTgX&$c>m
z=M6i?J7v@*Xm>Kfyk}=$7Tm=K#k!R%2e_vnRt@qb^8~P|39HwX=f$@SSjl?d=hy-+
z7kp&7y~vtAomP3(RDlQ+IGaW8IVs`xoM67`%9E(cbZg6_zRWTwQY=fE*LF>>OL5=G
z;b`zTem^$8k#p7-?KViZIo#*$Qlsv0pft3Dte`V5hT@8g^H)IvvVy<WkJG;>VMy=t
zcFBdi)x|%AnqDfMV;;YrEM8Lih#3*^{!M(?DvvNs>1cpjt>Lm@_kDLPW|bg|Ck3@u
z3X#}y(naJ>Oq<}mp-@e45oU-s_brop(v3BY8VQZG^a-;juzr;%Z=*m`lJPwCr5$xn
z_|y7u_mKycicMqkEX2)<hIQStd>b?#5xcqL?vad$KDEB0<PsinYB!%f_I8O-8jiKF
zdodGH8zruerMAPipnOoFanp{4tzmV0BB^Vu{i?fcs{ZhIa6UAtZ)-8UMDg8=`g#wK
z$gtg68T;G8Z<3u(Gf;2K#x3jQ$dfM?(ZRuOWvZVukyP}r+>-BBOM=|16Z*Z(?sbTU
zSfa}#5Mf!AM!f*bahlXrLKV3M{1K?=*+p{>)Pg&<G{gNy@zk+tl$GvkPg`$raCEm&
zy<X&g#%T3y{562r7#l*-q2I`_Ort`|e`LS~e#Q<5oh|~{-@>!4KR;KtsrtyQ*Ho5j
z!OR~Fro_jpn+b+L`}&Y@#c93q@`^;-bw}3xYHC#G<J!52_2rXuAO6S&*9eOUEf4Sc
zB7d7W<aFv<e<Or6#4g0mkdW}M7zNE^Zj+O>E4fm*(n2r-*34HKZcv{1_&t1`gEctu
zkXityy%fkCao`-*RA?I`ut{~XoyJgQuz*n5`9}A=0`nd?PCkOvycM>Ho>$5VkAKpV
zw2ZgjTnD$ad>W*j82;c+EWYbo9DBDS`h)0gf5%IY)2@uu7T(@V<>~ZKQz^T1DJxRT
zX+e@b`r^$TP#BA`?UpuQqlb)r`E20YEu#qg+URoEZy`6Ij8{kr`(hen98$!nR@d|U
zCZ_a`SZP?w`q)LKR=jyWA&K+ay~r+c4_57SW?jqB&*i~Wa+g;0mWw}?z4u-Tc^2z;
z{hTf1-UdjW!7CE6m7uAQG&g&+A|^&C-3D*i)-S}B_j`AJ=oMzirLfWz0JT2k5SR5&
zN_5*5e0)$KK}zmwJUx}T-~4%`q?gP;F(w+%kgWA`a*(pr@Azr+C_@nSOh_UF&(L0%
z%Whq$G>E-GT<F|=PTyBd7hGuk5mHEMAl;tON{c=092gj8iV#Lcc;R|DKSEGDo<nyH
z$L!~}wJSZfXOs|?NxVF(vwcEHboD)ryB}T{WkoE;oD|!dKwH2c8NRfXW+;+A`FA;%
z|CTuUf9sqyJg|PX$z9C7URV2kCybP*KQJahTngO=or+W88jv3sswIbkRhz3qlTG2v
zQ!AR2?34_y0iLJD7k_4E|IE(*A6PguhV1Q44+b(yZAF64+J&BtpAIrJ-Er!u+GTu=
z<_ScpyN(rPZ;xzhBcEB#&mZ+3@TS6Cs4d9EwXyVci3b^?8izhhWaKAVWw`oq!AoPP
zW)c|!AFq?B0zW3jdxKo$&7D8TjF+~<uyYo@R>L@kzkw2@BJwfRFkcYHjK(@ioch~l
zC-p?FpFSNC`hnD5N4u?{XG#GGd5IJ8e+y+tnUXApv4O~Zq5hlG3`uu_mTz1>pPV&~
zSX`;V8O5h-8{Kqhl+Nj1E>Y;`cE*&THp=vh?8qFI+1ZfwNpU;veA068AZ`RYUNbyr
z2=-^;@06W9ofeOlKW8KIu#lajy_uuse`l^Kxyyd1b7U%<y&3n)^?pEu-|44aY7f|e
z<j~FVy<%?t8{}PsstZC-R8HsZst}NOg9&?lc!Pj-3(Zrg|8~ncNsQXs1@Gt#J>e?+
zbXcgd0U6?htcE&%T}Lv!o*EldqwP@;702`-ONTDeN@_OMyi%KJ9%WGraWgPw`y_X(
zfYLwR&(AQn2PZ)3UWzg2B~Jx;o>}H*P5k4AI-3_WCN6K~pD+T?r#=KmNjmF^h-yqf
zg4C8HQ7H^ZdsK4oFTO|Q$wrA67+cCRXAZj+Sxq>87v~bhb7Y)?v7ThM2|dZ54|dSa
zH^)C&<rDUAr=t(~_ThBGqP=B$<YQs7T?n_pIp5Cxlq@Vm5<e?UqLWWs#te<4)oOOj
z%t#-xm9QyB9D7dsISgl(l~NE}@pc+}jBb#f&CYUlP26Ew*j=s<+Hz%s>e)eZ-IQJG
zGU*fU7bZtHw+|#a8t*QB9a(;>KQjx>l-#Ht^&4GZWO0#DWMXQj@coqf(rbfku%rH=
z$<6m;*`K-A>Zx8$(I(jiHes3}O+f)27lc0Amhs`dJ4y*mVRJE$qI6?v@$}#dYQ@1)
z!L?ER_dSz6LX=+ph&JVJKmVX%bu2a4@O<zjVL;Ntv6|=2#rBo_#Kn$-tsmL&QWg9d
zEaPJLM3L?l?z=ekPlU;@eABni2ic{)^6Z}JvxVkEFG^z|F_ZenAI*G8>!PhZ598V)
z&wC&7w{|IUVvlsj*Z5D)8;bP*h=AC^wr1mDQ;x|gbP3(fw$uxxc2sHaru_>1rFYaN
z9+kLqs`T_0*J^@+SK@RZ(v4)xwoMxKf6~Z|mOHO@gp%Lf{!u#2ttj+`Icadfi_C~l
zak|^$uDCMo#c0W5AIyrP+DKsfVthc2BiF}(Q>XE*>afbo#~>W*y>QFqh`<3w%k~uz
z(=c;oym~dtTPt4D^gA7y>&D3)Mlj#SdZgsq)1AlPx1u(aF2Om&Nu5l|93@=gIc5Gr
z38bE^D-m5+H#e}7GL)%(RUcAjvTb=WJLLFk5UVSXwzGeB{e#WQ5#o|w#=JDSBa+gc
zqqbGr$wkttSbeF;pzPHJTZy2unKhvYy={XEh8kz0N+kNKqebv&03h{sP}*#n!focz
z3zh+$f9}flxk|e2NTmz?jc9aqda|^ySdraIlh%wP%$2sJzg92K#!_j<h1GY*5MBI6
zZ|t-pPx^%8_7(0#I*JZzb8Shm1*DM;IdJXq`ptN=;j0%ed+6X}SlKgRt}xI<sqBKm
z82bJB;Jg_-u5UFux8yx@M{pZfbtk)KjxLP?R(ZU`I^T2%M3T^o_(&SOthTn(Tll00
z>sw6CF~sQUnY}4oVYU(q8Q)!Rtf@p8g!g$>Q`a*23pi7U^xfznQd%ZfQYj46`I+CE
z-7qzCaaDf<M%Ugx+*zdPR^L<|m&vscTSwp)KZ8G8tyxs4{fI9QESGH>lMZK1oLt_}
z5in>&r<P1zwI`|Pa!I*`i!P@k<z7sG%QO5^ZI@hIyQZdw+w+@PpoyS(4^ijT8wF1a
z4E8X+=Xz2}B+zTX>dmK1_Q{-DE0#tDc_u)tO%r!e9Mg&aF^*gE72+C@4!bmwo%oIv
zslG|5JNe}fSU4}<$1%HI@HX@FD(c+GD8V=N{88_k?|f!xKBz8P_Vl?l_Hus$lZ;RB
z()eS=8-vahh`047=wwED^#-ymEIccxl&$aeve21l#S^O5N`|8{^rVY;z8S;!Rj44^
z&uZer-5lb?p0`(rVKVc4az5HfTRmMaLA^wlwyZvUQk{O1cOL7m%U$B0JmZ7cc&S^;
zqGH)2V~)*ez#JSMY=n75JhF>2sxU9j)BQ)6ZX#U^_?|rxb^eW$j7~%<yU{86Za$Qz
zQObvOi39>c7H~;;k1x@rO3x{a&nQ(?g>yl5%tI29%9^NYW(wxDhl?qwv@gy~3kjnq
zOJiNh7S5crhDxV!g?JL9DIxL=A1pR;kmQF70DdP*$yV|Wno1fwa_D`G#sV{_-X~Hl
zQ$R_;n7g#=ae#A#HRpEMql{(YF)PtK(;}&?GnI`KoEyv!l?3#(<$|Aye~8LP?T>wu
z5pJvso4z5?)zvjlnPycJ35jOoN~Vxa*iqq(4p25RywyEg=eh{uF8zAYU+}ujA~ueF
zc-AQ74dr88b0UwsI8GFU4^8`B53Z;&o6;YTSt4qx5kX(PY92iNpxIa6WwPeft0efw
zS&ux8gZ$2I>8S8LdD_mZ8AG`ealOZ{*;#sZ>Eq8-^0tfxnnsJQtL)S`J=u^U3lQ9P
z^9MV95fytKB(CXYLh11*E#CoS|M7(b(4G`$p5Bb%!tAz{IzO8V_eOOMN#ov7DDI>o
z)-*K^ciVEc#X(nm@fTxSB(m|6dw1Fh7d+V}iUo1W)xL3b0Ic{4e&LKCIoYThULA4Z
zf(a9dzI()2M<fD@aL`uLrlcWGaS|&MndfOj)D7n%YpUgQDE$WMCOAVW;6w)O)U_Mi
zN#M$#I?Jcej7#{Ivt+FCww<Xv2h2Uj)iqbj1gedI$W@AIJPqb&AvcU{8o=wc_?WHR
zNzjwVJi-dm{J;{m;Tk<@bxXk8MebYas}4GGf@OogH$-hV(Q2-QyKc-ezTI*6IZSt3
zXVE^q9Kh0!*SV^>-GrcPxsj$_&9rwqe0KwmEfJM8zLR4P#kjX8`R*mN6!90^>byYa
z(X5bt)t35(a!>2%DK5MeibSo#4vIaNdW&@$l1Kd7ZnAXUKg#g8M6}68jGs@q3gHP-
zvf4nv3Gr5KyipQ9uDaVRlC{H<HuhZ1BXN|BFe&DQ6mi1NP?yWfLixF9gav8b#p!n?
zsG%o1C&j*lFt?_iq=A7hPketRla(H2yjeu~^wPTsNbm%M7GCxw$gad@+~bys8jQr2
z+@qqaaJcZjfRY#cUd^-Br7U6i#U%m3C8xB^z`*>uIY~B~c3WgPIza<X%JGkTrEdz=
zGq*+RBg?VXyfAj9DLkI+C@HTfOek?{=P71*GA(E!Msk|RSP?Og)j}C`cQFe|=o^i~
zONX7D_<SG{qu~cf&RIW7y&NsvEA9-_@0^M(OF3|`aiPUni?vdodstQPG=eikA}uYs
zQ3+<@G@1Kgs3Yd2fTTT`qZr(#L3u9ygEO+>5O0(#D_moQ%YnxtKrX&yy*NY5%(eOh
z2{yKB8ZZEDZkg~u!$74fbqpO^#k}9FnRbQ7%;Rm0ETu5d0F`kAM#=rGAdI~LWm-GV
zc5l(|EHaGM!&jfVn?LJ%eQF3=QgiG1T6ClMhSl&Ocr<Hd(a!S$@zxVOWH%MOO+`U|
z8!3smahRvyn?^CpeWEU?J>pLNIwAihbM2x)Im><|w)l=KTx|QvtOOOgA3@a^t%35`
zTFXnOB(;}89f-~H!pI72*5G(*5NE5obMO=TS8%p0AreEQST&A)<{(KDM;0C5m(PP%
zuvEm`>G-_*UA-Swh(vR(i)GY?tTPjye#2K)eU(CUOCA#)mmx@Ew4i8;yRQ$G%yyd8
z+l$@-shfUb9eug8x>3g*7~z^UB5w<`xx)0-rex<WV4;VJM4w9-s4+!<OM80bb!*Qc
zmum_}*t6J!i<QT<X+t>;2Km-W7%U<;Sqr7SmezY_;F6pc%rdb`+g<TVF|*eIOm}h%
zB-~c3yP-#_(5a9bBC6enri3T!M%Or6?;lnAb#f<;miJLK;+S`e?rCfUCo@SePGax`
zB}2ORPrl+DReOL+tj{w5IhSR7#OpiGEF<ln^Uk4cwI9Xe%U1Dj`-pzn(wwBqVMt=G
zi807MpP<3LwwgT5lseIa(dFD8WAJ0d;lANsUJ<zP$D3UOsvl8f+!jY|lv)FLAEl1<
zHP7myCj`E#FoCuc4oKJ&pV0d~JxT-|PAR*<l9FHJNH`+q_y?#8*^}NePhW{Z){TB`
z*heVThfk{!*BrfY3~IM#cR8VC1k;Ka($Nkvj;}Xes_#!`R8`v*upC9aD69+uFkO$G
zzwVBL$P3s?tX*^GjV34MrKeh;xghX9pN`FBv2zG}^h{Z4)GOy4Pq!_YOpaM0L5Tz^
z>wTpTf3OZ$oAp*ifENyu#W^9&R+hJGVco#PALVmH%7QK6r>v+k@p6}P!H?)GHcNzd
ziaS+o+IJ1fOdlQ3P!5upHe%T2s@>a~9%(J|<H|ZJ4zG%P;BHP3nRT;gKuK&)Z>8%S
zjP&$)bNczXKqsM^RUbo?@N0Mg%&9xh+=Kt4)Ob#J)N=ihV*gxJx@#dVc$Wz$>M@OK
zc_wE-G1{8UCX0`G$&CLtif<pY8?}_Qd@^u+`E59wuyT$dku^;$iTVzF?xB4TA$-+-
zw%57c)HKDaYVSFFo<6kZXm)#(c5OWQ^sHohf#93#T0Cz%&8=D-vEAq-SI<(POLx?N
zP3=^Jh_fMSiz=0Dr?c(+^+BQ}attf%G|pOO)#C02J0!dBkp_02yX5L6)sMXeEm%%A
zFuvTC-9Eeq0%WqOtkPvl3TL3F$<MbFwOMAJ(Aa4%bzsmXX-Vya9c_K?_Ss%{k0^sF
zE|J<uMy|G}dWj*}a*J@e)>2yz^Ur%sqhI%LK_^7DE4eBZNj<0&Lg*Jlb!Jzi1a*|(
z@Mc7D6ua$;4B+ljJqw}GEzWsYRJ3QVT3JD{!ylos>$u%Reho~df5)&^Jz#x_d@b~D
zJXu4eKB#|fW_B{!%R?#8$$8yU3BUZFfiw|k>Ij0G*7(8(F%zl3MUhGPfJ!G3mG830
z^#Y52o=0MZ43mygM4~e{CkGm*eOtf6w1UzIGKk3+Jre<Oj;c1F>GX$$9L>|-RwwsJ
z(@Ppn>?MzYm!yk6^K(e-#t6IPY-_LY`Hc3C_90uMRQc@OH1t@4Ap9mjzP%0+o|8ok
z<}-WO@hUfXJRQa%y1Ju%zfH|9VT;w9euWWWp|NXEnSw196*B8@xAnMqaiNqb-1%Jz
zFY!e$rX?9ZPzhk)+5Wg*IOUnjh3M;}_$n>D)W|$AKS60q?-IpIf$!~sBKueJWy>J1
zCHKmMrkc_bWAN$g7l9=^?K4Vhv6IJ+=Y|&1$O4S{`uz|;I*^oE)^<H}pL7r<Qv*nj
zM8DV6506Fw?n2?c7~p+KKF@4NE3!NR9<eWh)FUs2zNuJ3Srp{WLR^ByHt?k@jFj7b
zt+t<>#V%cXJyW!U(tN~1Fxxv{Egennee&XR0e#21*>{1m{UVJ6`zvMNUyu8r5`%x-
wVfCc-+2}SoCoaAQc9_yHcvKH0{znE+{wD@>evj#YW}xl&y!>nPay|9G0P%4$5&!@I

literal 0
HcmV?d00001


From b3388d2d1e285f0b7dcb46b1c8283114eef53980 Mon Sep 17 00:00:00 2001
From: priya-mane <priya.hm@somaiya.edu>
Date: Sun, 25 Oct 2020 22:22:23 +0530
Subject: [PATCH 2/3] replace open for file access with open with, replace dict
 literal

---
 .../get_details.py                            | 37 ++++++------
 .../Research_paper_latex_parser/op_json.json  | 56 +++++++++----------
 2 files changed, 47 insertions(+), 46 deletions(-)

diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py b/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py
index 6ad39f48f..46673f5a2 100644
--- a/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py
+++ b/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py
@@ -18,7 +18,7 @@ def __init__(self, tex_data):
 
     def get_elements(self):
         data = self.tex_data
-        data_dict = dict()
+        data_dict = {}
 
         sections = re.findall(r'section{(.*?)\\', data, re.S)
         for obj in sections:
@@ -122,20 +122,21 @@ def purge_equations(self):
 
         p = os.path.join(directory_path, tex_file)
 
-        data = open(p, encoding='latin-1').read()
-
-        cd = clean_data(data)
-        cd.purge_images()
-        cd.purge_tables()
-        cd.purge_equations()
-
-        ed = essential_data(cd.tex_data)
-        d = {}
-        d.update({"author": ed.get_author()})
-        d.update({"title": ed.get_title()})
-        d.update(ed.get_elements())
-        d.update({"acknowledgement": ed.get_ack()})
-        all_data.append(d)
-
-    with open(op_file, "w") as outfile:
-        json.dump(all_data, outfile, indent=4)
+        with open(p, 'r', encoding='latin-1') as f:
+            data_lst = f.readlines()
+            data = ' '.join([str(elem) for elem in data_lst])
+            cd = clean_data(data)
+            cd.purge_images()
+            cd.purge_tables()
+            cd.purge_equations()
+
+            ed = essential_data(cd.tex_data)
+            d = {}
+            d.update({"author": ed.get_author()})
+            d.update({"title": ed.get_title()})
+            d.update(ed.get_elements())
+            d.update({"acknowledgement": ed.get_ack()})
+            all_data.append(d)
+
+            with open(op_file, "w") as outfile:
+                json.dump(all_data, outfile, indent=4)
diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/op_json.json b/Scripts/Miscellaneous/Research_paper_latex_parser/op_json.json
index 73d6b1145..43b88c6f9 100644
--- a/Scripts/Miscellaneous/Research_paper_latex_parser/op_json.json
+++ b/Scripts/Miscellaneous/Research_paper_latex_parser/op_json.json
@@ -2,46 +2,46 @@
     {
         "author": "I.M. Great and So.R. Yu",
         "title": "A Sample Research Paper",
-        "Introduction": " \n\nUsing latex is pretty easy if you have a sample document you can follow.Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed volutpat ornare odio et faucibus. Donec fringilla massa eget auctor viverra. Mauris a imperdiet est. Cras tincidunt nulla ut elit tristique ultricies. Phasellus nec orci vel mi suscipit maximus at vitae tortor. Vivamus sed libero vel lacus aliquam rhoncus. Ut in lacinia nunc. Nullam quis mauris leo. Phasellus vitae nisl condimentum quam congue volutpat. Quisque et dapibus ipsum. Curabitur fringilla pellentesque elit, non posuere purus malesuada id. Pellentesque rutrum vitae urna eu mattis.\n\nMaecenas ac congue massa. Quisque a sem turpis. Duis et diam ex. Suspendisse et enim interdum, sodales risus eu, ultrices est. Suspendisse eu odio enim. In vulputate odio porttitor tincidunt vestibulum. Praesent tincidunt ullamcorper purus, quis semper felis volutpat quis.\n\n",
-        "Results": " \nIncluding figures, tables, and equations is easy. Latex also permits easy reference to document elements (figures, tables, sections). Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n\n \n\n \n\n \n\n",
-        "Conclusions": " \n\nMan, latex is great! Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n\n",
-        "Some_title": " \n\nTest title for user defined  section.\n\n",
-        "user_defined_title_for_begin": " \n\nwjlrhfwer ljqr flwuer j rlferfurl u airlf  aiurf uoiruf iuoqir oiuqr iuq woe\n",
-        "acknowledgement": "\nThe author is grateful to Donald Knuth for inventing tex, and making publication quality typesetting a reality for scientists around the world.\n\n"
+        "Introduction": " \n \n Using latex is pretty easy if you have a sample document you can follow.Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed volutpat ornare odio et faucibus. Donec fringilla massa eget auctor viverra. Mauris a imperdiet est. Cras tincidunt nulla ut elit tristique ultricies. Phasellus nec orci vel mi suscipit maximus at vitae tortor. Vivamus sed libero vel lacus aliquam rhoncus. Ut in lacinia nunc. Nullam quis mauris leo. Phasellus vitae nisl condimentum quam congue volutpat. Quisque et dapibus ipsum. Curabitur fringilla pellentesque elit, non posuere purus malesuada id. Pellentesque rutrum vitae urna eu mattis.\n \n Maecenas ac congue massa. Quisque a sem turpis. Duis et diam ex. Suspendisse et enim interdum, sodales risus eu, ultrices est. Suspendisse eu odio enim. In vulputate odio porttitor tincidunt vestibulum. Praesent tincidunt ullamcorper purus, quis semper felis volutpat quis.\n \n ",
+        "Results": " \n Including figures, tables, and equations is easy. Latex also permits easy reference to document elements (figures, tables, sections). Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n \n  \n \n  \n \n  \n \n ",
+        "Conclusions": " \n \n Man, latex is great! Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n \n ",
+        "Some_title": " \n \n Test title for user defined  section.\n \n ",
+        "user_defined_title_for_begin": " \n \n wjlrhfwer ljqr flwuer j rlferfurl u airlf  aiurf uoiruf iuoqir oiuqr iuq woe\n ",
+        "acknowledgement": "\n The author is grateful to Donald Knuth for inventing tex, and making publication quality typesetting a reality for scientists around the world.\n \n "
     },
     {
         "author": "Pratik Merchant, Smit Moradiya, Jignesh Nagda, Niket Mehta",
         "title": "Parallel Implementation of Support Vector Machine",
-        "Introduction": " \n\nThe name \u00e2\u0080\u0098support vectors\u00e2\u0080\u0099 (data points) used to define this dividing plane. Since we only require the SVs to create a classifier, the non SVs can be discarded. However, it becomes a problem when the points are not separable by a simple linear plane. Hence, to handle this problem, SVM uses what is known as the \u00e2\u0080\u009ckernel trick\u00e2\u0080\u009d on the training data and the mapping to a higher dimensional space is done by it, where such a dividing plane can be found more easily. Every improve accuracy. The role of Kernel is to transform the problem using some linear algebra for linear SVM and this is how the learning of the hyperplane happens. To avoid misclassifying each training example is given by the regularisation parameter. Lower is the regularisation value, more is the misclassification. To decide how far the influence of each training parameter reaches, the gamma parameter is used. Low gamma value means points which are at a far distance from the separation line are considered for calculation whereas a high gamma value implies that only the points nearby to the separation line are considered for calculation. Lastly, the separation of the line/hyperplane to the point which closest to it is called as margin. A larger separation for both the classes means a good margin and also no crossing into other class.\nThe steps to implement SVM are as follows: Step 1: Import all the necessary libraries such as numpy, pandas, matplotlib. Step 2: Importing the dataset Step 3: Performing exploratory data analysis Step 4: Performing data pre-processing Step 5: Splitting the data into train and test. Step 6: Import SVM, create a classifier and train the model. Step 6: Making predictions Step 7: Evaluating the algorithm  Step 8: Results \n\n",
-        "my_section": " \n\nnlwekndw lweuidwe nulei nameiude includewe oiu wede oiuwe dn eiuqwend\n\n",
-        "Results": " \nPSVM first loads the training data onto \u00e2\u0080\u0098m\u00e2\u0080\u0099 number of machines. This done in round robin fashion. The memory required by each memory is Big-Oh of nd/m. In the next step, PSVM performs a row-based ICF parallely on the data which has been loaded. At the end of this step, only a small portion of the factorized matrix is stored on each machine, which has a space complexity of Big-Oh of np/m. For the quadratic optimization problem, IPM is performed hereafter. Let, n- no. of training instances d-initial no. of dimensions p- After factorization, reduced matrix dimension (p<<<n) m- number of machines With the help of PSVM, the memory requirements reduce from a complexity of Big-Oh of n^2 to Big-Oh of np/m.  \n \nHDF5 which is a file format, data model, and a set of software allows users to store data and associated metadata. \n\nSeveral of its features make it useful for high performance computing and big data applications, such as data compression, which along with binary value storage can greatly reduce file sizes. It also allows parallel file access, which enables one or more processors to read/write data from a single file, and features customizable data \u00e2\u0080\u009cchunking\u00e2\u0080\u009d, which allows users to define how the data is internally arranged into subsets. This can be used to tune parallel performance. \n\nAn open source cluster computing system intended for the analysis and processing of big data is Apache Spark. Spark clusters have the ability to scale up to thousands of distributed nodes that can work cooperatively to process very large datasets in parallel, this makes them an example of an HTC system. It also supports real-time streaming processing of data as well as real-time streaming processing of data. 
Some of its main components are Spark streaming, Spark SQL, Spark core which are used for stream processing, to enable structured data processing and handle task distribution and scheduling respectively. (mllib) Machine Learning Library supports many tasks such as linear SVM training,linear regression and clustering. However some extra code needs to be written to handle multi-class problems since it only supports binary classification. \n \nThere maybe a period of CPU idle time if one segment finishes training before it concatenation counterpart. The issue can be cumulative in severely distributed databases. The benefits of attempting to balance the segments after the processing has started are likely to be outweighed by the overhead in MPI communication. This difference should be mitigated by proper randomization of the input vector order. \n \nApplications SVMs have been found particularly useful in earth observation and satellite imagery data analysis. Some of its other applications include: 1. Detetction of faces 2. Categorization of text and as well as hypertext 3. Image classifier 4. Fields related to biology 5. Remote homo-logy detection 6. Recognition of hand-written characters 7. GPC 8. Geo and Environmental Sciences \n\n",
-        "Conclusions": " \n\nMany algorithmic approaches are found to be more effective when kernels are not used or memory is not a constraint. Also, other approaches can be used to achieve good speedup by dividing a serial algorithm into subtasks. These subtasks are basically subsets of the training data. If no. of machines continue to expand and cross the data-size independent threshold, PSVM cannot achieve linear speedup in such cases. There are 2 types of overheads encountered while implementing parallel SVM- communication and synchronization overheads. During message passing, communication time is accounted for.  Computation, communication and synchronization together form the running time. To increase the accuracy, PSVM must select the correct no. of machines.  Hence, we can conclude by saying that when the parallelism of modern hardware is leveraged, massive speedups are possible a satisfactory performance is achieved\n\n",
-        "acknowledgement": "\nI am grateful to my college professors for providing me this wonderful oppurtunity to present this research paper.\n\n"
+        "Introduction": " \n \n The name \u00e2\u0080\u0098support vectors\u00e2\u0080\u0099 (data points) used to define this dividing plane. Since we only require the SVs to create a classifier, the non SVs can be discarded. However, it becomes a problem when the points are not separable by a simple linear plane. Hence, to handle this problem, SVM uses what is known as the \u00e2\u0080\u009ckernel trick\u00e2\u0080\u009d on the training data and the mapping to a higher dimensional space is done by it, where such a dividing plane can be found more easily. Every improve accuracy. The role of Kernel is to transform the problem using some linear algebra for linear SVM and this is how the learning of the hyperplane happens. To avoid misclassifying each training example is given by the regularisation parameter. Lower is the regularisation value, more is the misclassification. To decide how far the influence of each training parameter reaches, the gamma parameter is used. Low gamma value means points which are at a far distance from the separation line are considered for calculation whereas a high gamma value implies that only the points nearby to the separation line are considered for calculation. Lastly, the separation of the line/hyperplane to the point which closest to it is called as margin. A larger separation for both the classes means a good margin and also no crossing into other class.\n The steps to implement SVM are as follows: Step 1: Import all the necessary libraries such as numpy, pandas, matplotlib. Step 2: Importing the dataset Step 3: Performing exploratory data analysis Step 4: Performing data pre-processing Step 5: Splitting the data into train and test. Step 6: Import SVM, create a classifier and train the model. Step 6: Making predictions Step 7: Evaluating the algorithm  Step 8: Results \n \n ",
+        "my_section": " \n \n nlwekndw lweuidwe nulei nameiude includewe oiu wede oiuwe dn eiuqwend\n \n ",
+        "Results": " \n PSVM first loads the training data onto \u00e2\u0080\u0098m\u00e2\u0080\u0099 number of machines. This done in round robin fashion. The memory required by each memory is Big-Oh of nd/m. In the next step, PSVM performs a row-based ICF parallely on the data which has been loaded. At the end of this step, only a small portion of the factorized matrix is stored on each machine, which has a space complexity of Big-Oh of np/m. For the quadratic optimization problem, IPM is performed hereafter. Let, n- no. of training instances d-initial no. of dimensions p- After factorization, reduced matrix dimension (p<<<n) m- number of machines With the help of PSVM, the memory requirements reduce from a complexity of Big-Oh of n^2 to Big-Oh of np/m.  \n  \n HDF5 which is a file format, data model, and a set of software allows users to store data and associated metadata. \n \n Several of its features make it useful for high performance computing and big data applications, such as data compression, which along with binary value storage can greatly reduce file sizes. It also allows parallel file access, which enables one or more processors to read/write data from a single file, and features customizable data \u00e2\u0080\u009cchunking\u00e2\u0080\u009d, which allows users to define how the data is internally arranged into subsets. This can be used to tune parallel performance. \n \n An open source cluster computing system intended for the analysis and processing of big data is Apache Spark. Spark clusters have the ability to scale up to thousands of distributed nodes that can work cooperatively to process very large datasets in parallel, this makes them an example of an HTC system. It also supports real-time streaming processing of data as well as real-time streaming processing of data. 
Some of its main components are Spark streaming, Spark SQL, Spark core which are used for stream processing, to enable structured data processing and handle task distribution and scheduling respectively. (mllib) Machine Learning Library supports many tasks such as linear SVM training,linear regression and clustering. However some extra code needs to be written to handle multi-class problems since it only supports binary classification. \n  \n There maybe a period of CPU idle time if one segment finishes training before it concatenation counterpart. The issue can be cumulative in severely distributed databases. The benefits of attempting to balance the segments after the processing has started are likely to be outweighed by the overhead in MPI communication. This difference should be mitigated by proper randomization of the input vector order. \n  \n Applications SVMs have been found particularly useful in earth observation and satellite imagery data analysis. Some of its other applications include: 1. Detetction of faces 2. Categorization of text and as well as hypertext 3. Image classifier 4. Fields related to biology 5. Remote homo-logy detection 6. Recognition of hand-written characters 7. GPC 8. Geo and Environmental Sciences \n \n ",
+        "Conclusions": " \n \n Many algorithmic approaches are found to be more effective when kernels are not used or memory is not a constraint. Also, other approaches can be used to achieve good speedup by dividing a serial algorithm into subtasks. These subtasks are basically subsets of the training data. If no. of machines continue to expand and cross the data-size independent threshold, PSVM cannot achieve linear speedup in such cases. There are 2 types of overheads encountered while implementing parallel SVM- communication and synchronization overheads. During message passing, communication time is accounted for.  Computation, communication and synchronization together form the running time. To increase the accuracy, PSVM must select the correct no. of machines.  Hence, we can conclude by saying that when the parallelism of modern hardware is leveraged, massive speedups are possible a satisfactory performance is achieved\n \n ",
+        "acknowledgement": "\n I am grateful to my college professors for providing me this wonderful oppurtunity to present this research paper.\n \n "
     },
     {
         "author": "Pratik Merchant",
         "title": "Prediction of human behaviour with the aid of sentiment analysis using social media datasets.",
-        "Introduction": " \n\nSocial media data like Facebook, Twitter, Instagram blogs, etc. is currently growing in an exploding speed. Sentiment analysis\u00e2\u0080\u0093also called opinion mining\u00e2\u0080\u0093is the process of defining and categorizing opinions in a given piece of text as positive, negative, or neutral. The main purpose of conducting this research is to understand the sentiments which in turn can help us mine knowledge and capture the ideas without necessarily going through all data, which will save us a huge amount of time. Also, this analysis can further be used for a variety of purposes such as identifying influencers, competitive benchmarking, consumer opinion and brand sentiment, etc.\nThe already existing models lack accuracy. Also, they predict on the basis of one or 2 factors which is too less a number considering the amount of thought processes a human brain goes through before coming on to a decision. Also, the inaccuracy occurring due to the automated bots need to be taken into consideration. Since, they can largely tilt the dataset to a particular side (positive or negative). \nThe main goal is to improve accuracy and also to remove the input of the bots from the datasets using appropriate filtering techniques. And also, to merge the prediction of all the various datasets together to obtain a cumulative prediction of all the social media accounts a person uses.\nThe Government or the common public can largely benefit from this since any negative event(protests) if predicted by the model may help in taking adequate protective measures and hence in turn maybe avoid or reduce the magnitude of the same. This issue if addressed before could have prevented the negative impacts of a lot of events such as the Muzaffarnagar riots, FTII Agitation, Pro-Jallikattu protests which took place in Tamil Nadu, etc. Hence, any such events if again predicted in the future, can very well be avoided by taking appropriate advance action.   \n\n",
-        "Results": " \nThere are three machine learning classification algorithms that are predominantly used for sentiment analysis in social media and they are as follows:\na.\tSupport Vector Machines (SVMs)\nb.\tNaive-bayes\nc.\tDecision Trees\nEach has it\u00e2\u0080\u0099s own advantages and drawbacks; however, a few different studies have concluded that the Naive-Bayes classifier is the more accurate of the three.[1]\n      \nNaive-Bayes classifier is a machine learning classification algorithm that asserts an independent value for each feature within a dataset. In other words, each element is valued individually to determine a probability that the sum of these values will constitute a pre-defined label or outcome. Effective sentiment analysis of social media datasets using Naive Bayesian Classification involves extraction of subjective information from textual data. A normal human can easily understand the sentiment of a document written in natural language based on its knowledge of understanding the polarity of words and in some cases the general semantics used to describe the subject. The project aims to make the machine extract the polarity (positive, negative or neutral) of social media dataset with respect to the queried keyword.\nThis project introduces an approach for automatically classifying the sentiment of social media data by using the following procedure: First the training data is fed to the sentiment analysis engine for learning by using machine learning algorithm. The next step is to filter misleading data(mostly encountered because of bots).The next step involved is the training of the dataset by mathematical formulations. 
After the learning is complete with qualified accuracy, the machine starts accepting individual social data with respect to keyword that it analyses and interprets, and then classifies it as positive, negative or neutral with respect to the query term.[2] The prediction of an individual once obtained from the different social media datasets may then be cumulated and then compared with the prediction of other individuals to see if there is anything in common. Common predictions if found any may indicate the mass sentiment of the people and will also hint about their future course of actions if any.\nWhen talking about textual sentiment analysis, this usually comes in the form of a training set bag-of-words already sorted into positive or negative categories. A positive word may have a +1 scoring while a negative word will have a -1 scoring. You can also assign higher values to certain words that may be more negative in degree. Regardless, if the final score of a mention is positive, then the mention is positive and vice versa for negative final score.\nIf word only appears once, we don\u00e2\u0080\u0099t need a frequency table. If we assign each positive and negative value a \u00e2\u0080\u009c1\u00e2\u0080\u009d, then we can simply divide the positive and negative words by the amount of words in the entire mention and then the subtract the negative words score from the positive one and  if the total of our mention comes out as positive, we can say the sentiment of the mention above is positive and vice versa for a negative result.\nSince the total of our mention comes out as positive, we can say the sentiment of the mention above is positive. 
This is a pretty clear-cut case as we didn\u00e2\u0080\u0099t encounter polarizing words that might skew the result if a computer can\u00e2\u0080\u0099t understand which category the word belongs to.[1] \n\nNow, the maxim that more data will lead to better predictive models is not always true, because noise in the data can overwhelm predictive models. The ability to deal with noisy, incomplete, and inconsistent data will be at the heart of next-generation predictive models. For instance, when identifying \u00e2\u0080\u009cbots\u00e2\u0080\u009d on Twitter that are seeking to sway opinion to be positive about a political candidate, we needed to ignore the huge numbers of bots that were seeking to achieve other ends- such as spreading spam or seeking to influence opinions about other topics or to deceive users into clicking on links that generate revenue for the person who included that link in their tweet. Moreover, data about many Twitter handles are limited and, in some cases, intentionally misleading. Bot developers go to considerable effort to ensure that their bots elude detection.\n\nThe generation and reduction to practice of robust multistage predictive modeling for emergent phenomena is an important step. For instance, social movements have been classified into five stages: genesis of the movement, increase in social unrest, enthusiastic mobilization to develop an organization, maintenance of the organization, and termination (when the movement starts to die down). When the protest is in an early stage (for example, of people expressing grievances on Twitter), some stakeholders would benefit from a prediction of the likelihood of violence occurring in any of the future stages. In such extreme cases, identifying bots is a very important part.\nIn this way, the above proposed methodology if implemented, can be of great help in a variety of applications as seen above.\n\n\n",
-        "Conclusions": " \n\nUltimately, once can say that sentiment analysis isn\u00e2\u0080\u0099t perfect, but neither are we when trying to decipher what someone means. Within social media monitoring, we need sentiment analysis as a starting point to understand general public sentiment in aggregate. \nHence, we can say that social media is perhaps the largest pool from which we can mine for public opinion and begin to gather informative data for prediction purposes. \nIn this way, I plan to complete the above mentioned process as soon as possible once begun. If done correctly, the process would be completed within a stipulated period of time. If I am successful in meeting my objectives then, this shall be largely benefical to the Government authorities, the Police authorities as well as the common people at large. \n\n",
-        "acknowledgement": "\nI would like to thank my college professors for supporting me immensely in this endeavor.\n\n"
+        "Introduction": " \n \n Social media data like Facebook, Twitter, Instagram blogs, etc. is currently growing in an exploding speed. Sentiment analysis\u00e2\u0080\u0093also called opinion mining\u00e2\u0080\u0093is the process of defining and categorizing opinions in a given piece of text as positive, negative, or neutral. The main purpose of conducting this research is to understand the sentiments which in turn can help us mine knowledge and capture the ideas without necessarily going through all data, which will save us a huge amount of time. Also, this analysis can further be used for a variety of purposes such as identifying influencers, competitive benchmarking, consumer opinion and brand sentiment, etc.\n The already existing models lack accuracy. Also, they predict on the basis of one or 2 factors which is too less a number considering the amount of thought processes a human brain goes through before coming on to a decision. Also, the inaccuracy occurring due to the automated bots need to be taken into consideration. Since, they can largely tilt the dataset to a particular side (positive or negative). \n The main goal is to improve accuracy and also to remove the input of the bots from the datasets using appropriate filtering techniques. And also, to merge the prediction of all the various datasets together to obtain a cumulative prediction of all the social media accounts a person uses.\n The Government or the common public can largely benefit from this since any negative event(protests) if predicted by the model may help in taking adequate protective measures and hence in turn maybe avoid or reduce the magnitude of the same. This issue if addressed before could have prevented the negative impacts of a lot of events such as the Muzaffarnagar riots, FTII Agitation, Pro-Jallikattu protests which took place in Tamil Nadu, etc. Hence, any such events if again predicted in the future, can very well be avoided by taking appropriate advance action.   
\n \n ",
+        "Results": " \n There are three machine learning classification algorithms that are predominantly used for sentiment analysis in social media and they are as follows:\n a.\tSupport Vector Machines (SVMs)\n b.\tNaive-bayes\n c.\tDecision Trees\n Each has it\u00e2\u0080\u0099s own advantages and drawbacks; however, a few different studies have concluded that the Naive-Bayes classifier is the more accurate of the three.[1]\n       \n Naive-Bayes classifier is a machine learning classification algorithm that asserts an independent value for each feature within a dataset. In other words, each element is valued individually to determine a probability that the sum of these values will constitute a pre-defined label or outcome. Effective sentiment analysis of social media datasets using Naive Bayesian Classification involves extraction of subjective information from textual data. A normal human can easily understand the sentiment of a document written in natural language based on its knowledge of understanding the polarity of words and in some cases the general semantics used to describe the subject. The project aims to make the machine extract the polarity (positive, negative or neutral) of social media dataset with respect to the queried keyword.\n This project introduces an approach for automatically classifying the sentiment of social media data by using the following procedure: First the training data is fed to the sentiment analysis engine for learning by using machine learning algorithm. The next step is to filter misleading data(mostly encountered because of bots).The next step involved is the training of the dataset by mathematical formulations. 
After the learning is complete with qualified accuracy, the machine starts accepting individual social data with respect to keyword that it analyses and interprets, and then classifies it as positive, negative or neutral with respect to the query term.[2] The prediction of an individual once obtained from the different social media datasets may then be cumulated and then compared with the prediction of other individuals to see if there is anything in common. Common predictions if found any may indicate the mass sentiment of the people and will also hint about their future course of actions if any.\n When talking about textual sentiment analysis, this usually comes in the form of a training set bag-of-words already sorted into positive or negative categories. A positive word may have a +1 scoring while a negative word will have a -1 scoring. You can also assign higher values to certain words that may be more negative in degree. Regardless, if the final score of a mention is positive, then the mention is positive and vice versa for negative final score.\n If word only appears once, we don\u00e2\u0080\u0099t need a frequency table. If we assign each positive and negative value a \u00e2\u0080\u009c1\u00e2\u0080\u009d, then we can simply divide the positive and negative words by the amount of words in the entire mention and then the subtract the negative words score from the positive one and  if the total of our mention comes out as positive, we can say the sentiment of the mention above is positive and vice versa for a negative result.\n Since the total of our mention comes out as positive, we can say the sentiment of the mention above is positive. 
This is a pretty clear-cut case as we didn\u00e2\u0080\u0099t encounter polarizing words that might skew the result if a computer can\u00e2\u0080\u0099t understand which category the word belongs to.[1] \n \n Now, the maxim that more data will lead to better predictive models is not always true, because noise in the data can overwhelm predictive models. The ability to deal with noisy, incomplete, and inconsistent data will be at the heart of next-generation predictive models. For instance, when identifying \u00e2\u0080\u009cbots\u00e2\u0080\u009d on Twitter that are seeking to sway opinion to be positive about a political candidate, we needed to ignore the huge numbers of bots that were seeking to achieve other ends- such as spreading spam or seeking to influence opinions about other topics or to deceive users into clicking on links that generate revenue for the person who included that link in their tweet. Moreover, data about many Twitter handles are limited and, in some cases, intentionally misleading. Bot developers go to considerable effort to ensure that their bots elude detection.\n \n The generation and reduction to practice of robust multistage predictive modeling for emergent phenomena is an important step. For instance, social movements have been classified into five stages: genesis of the movement, increase in social unrest, enthusiastic mobilization to develop an organization, maintenance of the organization, and termination (when the movement starts to die down). When the protest is in an early stage (for example, of people expressing grievances on Twitter), some stakeholders would benefit from a prediction of the likelihood of violence occurring in any of the future stages. In such extreme cases, identifying bots is a very important part.\n In this way, the above proposed methodology if implemented, can be of great help in a variety of applications as seen above.\n \n \n ",
+        "Conclusions": " \n \n Ultimately, once can say that sentiment analysis isn\u00e2\u0080\u0099t perfect, but neither are we when trying to decipher what someone means. Within social media monitoring, we need sentiment analysis as a starting point to understand general public sentiment in aggregate. \n Hence, we can say that social media is perhaps the largest pool from which we can mine for public opinion and begin to gather informative data for prediction purposes. \n In this way, I plan to complete the above mentioned process as soon as possible once begun. If done correctly, the process would be completed within a stipulated period of time. If I am successful in meeting my objectives then, this shall be largely benefical to the Government authorities, the Police authorities as well as the common people at large. \n \n ",
+        "acknowledgement": "\n I would like to thank my college professors for supporting me immensely in this endeavor.\n \n "
     },
     {
-        "author": "\nAmeya Keskar                                                         Priya Mane\nChinmay Lotankar                                                     Jeet Mehta\n",
+        "author": "\n Ameya Keskar                                                         Priya Mane\n Chinmay Lotankar                                                     Jeet Mehta\n ",
         "title": "C4.5 CLASSIFICATION ALGORITHM",
-        "Introduction": " \n\nData mining is the process of analyzing large data and getting valuable information from it. There are various algorithms and techniques done to do so. One such data structure is Decision tree; it is a flowchart-like structure with nodes and arrows directing from one node to another. At each node, one attribute is considered and further split branches equal to the number of unique values the attribute can take. Each of this branch is connected to other node where the value of next attribute is defined. Hence in going from one node to another, we fix or determine the value of each attribute. Each leaf node consists of one of the values of classification variable.\nNow the question is which attribute must be placed at what level in a tree.C4.5 Algorithm is used to choose the attributes to be placed at each level of the tree. The main advantage of C4.5 algorithm can deal with attributes having numeric data/non-categorical data which is difficult to classify per say and also deal with missing value data.C4.5 algorithm makes a decision tree by using the concept of information entropy. The parameter used here is normalized information gain. Normalized information gain is calculated for each attribute and the one with maximum value is chosen as the first attribute/root node of the decision tree.\n\n\nliterature review\n\nFor the construction of a decision tree, we can use the C4.5 algorithm. The algorithm is based on Information gain entropy. We can say that, if an event is highly probable, there is no surprise if it occurs. This means that it gives very little information. This means amount of information gained is inversely proportional to the probability of the event. Entropy is proportional to the probability of an event; hence we can also say that information gain and entropy are inversely proportional.\nIn decision trees, it is necessary that with each split the entropy decreases. 
Hence, if the splitting is done accurately, we may arrive to a very definite decision. So, we check each node for all possible splitting. cases for First, we calculate the entropy difference and the case for which difference is least is considered..\n\nALGORITHM:\nCalculate Information gain for each parameter.\nDetermine the attribute with maximum Information gain entropy.\nChoose this attribute as next splitting node.\nContinue in similar manner for all attributes.\n \nLet us consider an example \n\n \n\n",
-        "Results": " \nC4.5 Vs C5.0 C4.5 was superseded in 1997 by a commercial system See5/C5.0 (C5.0 for Unix / Linux, See5 pour Windows).  \nThe changes hold within new capabilities as well as much improved efficiency, and include: \n  A variant of boosting that constructs an ensemble of classifiers which are then voted to give a final classification. This often leads to a dramatic improvement in predictive accuracy.   New data types (e.g., dates), \u00e2\u0080\u009cnot applicable\u00e2\u0080\u009d values, variable misclassification costs, and mechanisms to pre-filter attributes. \n Unordered rule set when a case is classified, all applicable rules are found and voted. \n This improves both their predictive accuracy and the interpretability of rule sets.  \nMulti-threading enhances scalability. C5.0 have the ability to take advantage of computers with multiple CPUs and/or cores\n\n\n",
-        "Conclusions": " \n\nThe decision tree is a usual algorithm in data mining.C4.5 algorithm is a wide application scope, high frequency decision tree algorithm. It constructs and prunes the decision tree analysis and estimates, completes the classified data mining by data preprocessing and choosing parameters or catalog.The article analyzes the C4.5 and improved methods for the calculation speed of C4.5 algorithm in detail. At least, it is proved by experiment data set that the improved C4.5 algorithm is well-performed on the training speed classify and accuracy. In this Paper C4.5 algorithm was improved the experiment proved that it has minimal impact on the classification accuracy, but the efficiency increased a lot. We can not only speed up the growing of the decision tree, so that better information of rules can be generated. In this paper the algorithm was verified by different large datasets which are publicly available on UCI\nmachine learning repository. With the improved algorithm ,we can get faster and more effective results without the change of the final decision and the presented algorithm constructs the decision tree more clear and understandable .Efficiency and classification is greatly improved and the disadvantages of low efficiency and memory consumption while dealing with large amount of data were overcome as it was in C4.5.If the amount of data is small original C4.5 is\nused because of its higher accuracy.\n\n\n",
-        "thebibliography": " \n\nhttps://towardsdatascience.com/what-is-the-c4-5-algorithm-and-how-does-it-work-2b971a9e7db0\nhttps://medium.com/greyatom/decision-trees-a-simple-way-to-visualize-a-decision-dc506a403aeb\nhttps://sefiks.com/2018/05/13/a-step-by-step-c4-5-decision-tree-example/\nhttps://arxiv.org/abs/1310.2071\nhttps://www.sciencedirect.com/science/article/pii/S0925231298000903\nhttps://www.ncbi.nlm.nih.gov/pmc/articles/PMC4466856/\n\n\n\n",
-        "acknowledgement": "\nThe authors are grateful to K.J Somaiya college of Engineering faculty.\n\n"
+        "Introduction": " \n \n Data mining is the process of analyzing large data and getting valuable information from it. There are various algorithms and techniques done to do so. One such data structure is Decision tree; it is a flowchart-like structure with nodes and arrows directing from one node to another. At each node, one attribute is considered and further split branches equal to the number of unique values the attribute can take. Each of this branch is connected to other node where the value of next attribute is defined. Hence in going from one node to another, we fix or determine the value of each attribute. Each leaf node consists of one of the values of classification variable.\n Now the question is which attribute must be placed at what level in a tree.C4.5 Algorithm is used to choose the attributes to be placed at each level of the tree. The main advantage of C4.5 algorithm can deal with attributes having numeric data/non-categorical data which is difficult to classify per say and also deal with missing value data.C4.5 algorithm makes a decision tree by using the concept of information entropy. The parameter used here is normalized information gain. Normalized information gain is calculated for each attribute and the one with maximum value is chosen as the first attribute/root node of the decision tree.\n \n \n literature review\n \n For the construction of a decision tree, we can use the C4.5 algorithm. The algorithm is based on Information gain entropy. We can say that, if an event is highly probable, there is no surprise if it occurs. This means that it gives very little information. This means amount of information gained is inversely proportional to the probability of the event. Entropy is proportional to the probability of an event; hence we can also say that information gain and entropy are inversely proportional.\n In decision trees, it is necessary that with each split the entropy decreases. 
Hence, if the splitting is done accurately, we may arrive to a very definite decision. So, we check each node for all possible splitting. cases for First, we calculate the entropy difference and the case for which difference is least is considered..\n \n ALGORITHM:\n Calculate Information gain for each parameter.\n Determine the attribute with maximum Information gain entropy.\n Choose this attribute as next splitting node.\n Continue in similar manner for all attributes.\n  \n Let us consider an example \n \n  \n \n ",
+        "Results": " \n C4.5 Vs C5.0 C4.5 was superseded in 1997 by a commercial system See5/C5.0 (C5.0 for Unix / Linux, See5 pour Windows).  \n The changes hold within new capabilities as well as much improved efficiency, and include: \n   A variant of boosting that constructs an ensemble of classifiers which are then voted to give a final classification. This often leads to a dramatic improvement in predictive accuracy.   New data types (e.g., dates), \u00e2\u0080\u009cnot applicable\u00e2\u0080\u009d values, variable misclassification costs, and mechanisms to pre-filter attributes. \n  Unordered rule set when a case is classified, all applicable rules are found and voted. \n  This improves both their predictive accuracy and the interpretability of rule sets.  \n Multi-threading enhances scalability. C5.0 have the ability to take advantage of computers with multiple CPUs and/or cores\n \n \n ",
+        "Conclusions": " \n \n The decision tree is a usual algorithm in data mining.C4.5 algorithm is a wide application scope, high frequency decision tree algorithm. It constructs and prunes the decision tree analysis and estimates, completes the classified data mining by data preprocessing and choosing parameters or catalog.The article analyzes the C4.5 and improved methods for the calculation speed of C4.5 algorithm in detail. At least, it is proved by experiment data set that the improved C4.5 algorithm is well-performed on the training speed classify and accuracy. In this Paper C4.5 algorithm was improved the experiment proved that it has minimal impact on the classification accuracy, but the efficiency increased a lot. We can not only speed up the growing of the decision tree, so that better information of rules can be generated. In this paper the algorithm was verified by different large datasets which are publicly available on UCI\n machine learning repository. With the improved algorithm ,we can get faster and more effective results without the change of the final decision and the presented algorithm constructs the decision tree more clear and understandable .Efficiency and classification is greatly improved and the disadvantages of low efficiency and memory consumption while dealing with large amount of data were overcome as it was in C4.5.If the amount of data is small original C4.5 is\n used because of its higher accuracy.\n \n \n ",
+        "thebibliography": " \n \n https://towardsdatascience.com/what-is-the-c4-5-algorithm-and-how-does-it-work-2b971a9e7db0\n https://medium.com/greyatom/decision-trees-a-simple-way-to-visualize-a-decision-dc506a403aeb\n https://sefiks.com/2018/05/13/a-step-by-step-c4-5-decision-tree-example/\n https://arxiv.org/abs/1310.2071\n https://www.sciencedirect.com/science/article/pii/S0925231298000903\n https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4466856/\n \n \n \n ",
+        "acknowledgement": "\n The authors are grateful to K.J Somaiya college of Engineering faculty.\n \n "
     },
     {
-        "author": "\nArman Cohan\nFranck Dernoncourt\nDoo Soon Kim\nTrung Bui\nSeokhwan Kim \nWalter Chang\nNazli Goharian\u00e2\u0080\u00a0\n",
-        "title": "A Discourse-Aware Attention Model for\nAbstractive Summarization of Long Documents",
-        "Introduction": " \nExisting large-scale summarization datasets\nconsist of relatively short documents. For exam\u0002ple, articles in the CNN/Daily Mail dataset (Her\u0002mann et al., 2015) are on average about 600 words\nlong. Similarly, existing neural summarization\nmodels have focused on summarizing sentences\nand short documents. In this work, we propose a\nmodel for effective abstractive summarization of\nlonger documents. Scientific papers are an ex\u0002ample of documents that are significantly longer\nthan news articles (see Table 1). They also fol\u0002low a standard discourse structure describing the\nproblem, methodology, experiments/results, and\nfinally conclusions (Suppe, 1998).\nMost summarization works in the literature\nfocus on extractive summarization. Examples\nof prominent approaches include frequency-based\nmethods (Vanderwende et al., 2007), graph-based\nmethods (Erkan and Radev, 2004), topic mod\u0002eling (Steinberger and Jezek, 2004), and neural\nmodels (Nallapati et al., 2017). Abstractive sum\u0002marization is an alternative approach where the\ngenerated summary may contain novel words and\nphrases and is more similar to how humans sum\u0002marize documents (Jing, 2002). Recently, neu\u0002ral methods have led to encouraging results in\nabstractive summarization (Nallapati et al., 2016;\nSee et al., 2017; Paulus et al., 2017; Li et al.,\n2017). These approaches employ a general frame\u0002work of sequence-to-sequence (seq2seq) models\n(Sutskever et al., 2014) where the document is\nfed to an encoder network and another (recurrent)\nnetwork learns to decode the summary. While\npromising, these methods focus on summarizing\nnews articles which are relatively short. Many\nother document types, however, are longer and\nstructured. 
Seq2seq models tend to struggle with\nlonger sequences because at each decoding step,\nthe decoder needs to learn to construct a context\nvector capturing relevant information from all the\ntokens in the source sequence (Shao et al., 2017).\nOur main contribution is an abstractive model\nfor summarizing scientific papers which are an\nexample of long-form structured document types.\nOur model includes a hierarchical encoder, captur\u0002ing the discourse structure of the document and a\ndiscourse-aware decoder that generates the sum\u0002mary. Our decoder attends to different discourse\nsections and allows the model to more accurately\nrepresent important information from the source\nresulting in a better context vector. We also in\u0002troduce two large-scale datasets of long and struc\u0002tured scientific papers obtained from arXiv and\nPubMed to support both training and evaluating\nmodels on the task of long document summariza\u0002tion. Evaluation results show that our method out\u0002performs state-of-the-art summarization models1.\n\n\n",
-        "Results": " \nOur main results are shown in Tables 2\nand 3. Our model significantly outperforms the\nstate-of-the-art abstractive methods, showing its\neffectiveness on both datasets. We observe that\nin our ROUGE-1 score is respectively about 4 and\n3 points higher than the abstractive model PntrGen-Seq2Seq for the arXiv and PubMed datasets,\nproviding a significant improvement. Our method\nalso outperforms most of the extractive methods\nexcept for LexRank in one of the ROUGE scores.\nWe note that since extractive methods copy salient\nsentences from the document, it is usually easier for them to achieve higher ROUGE scores.\nFigure 2 illustrates the effectiveness of our\nmodel extensions in capturing various discourse\ninformation from the papers. It can be observed\nthat the state-of-the-art Pntr-Gen-Seq2Seq model\ngenerates a summary that mostly focuses on introducing the problem, whereas our model generates\na summary that includes more information about\nthe methodology and impacts of the target paper.\nThis indicates that the context vector in our model\ncompared with Pntr-Gen-Seq2Seq is better able to\ncapture important information from the source by\nattending to various discourse sections.\n\n\n",
-        "Conclusions": " \nThis work was the first attempt at addressing\nneural abstractive summarization of single, long\ndocuments. We presented a neural sequence-tosequence model that is able to effectively summarize long and structured documents such as scientific papers. While our results are encouraging,\nthere is still much room for improvement for this\nchallenging task; our new datasets can help the\ncommunity to further explore this problem.\nWe note that following the convention in the\nsummarization research, our quantitative evaluation is performed by ROUGE automatic metric.\nWhile ROUGE is an effective evaluation framework, nuances in the coherence or coverage of the\nsummaries are not captured with it. It is non-trivial\nto evaluate such qualities especially for long doc\u0002ument summarization; future work can design expert human evaluations to explore these nuances.\n\n",
-        "thebibliography": " \nDzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. 2014. Neural machine translation by jointly\nlearning to align and translate. arXiv preprint\narXiv:1409.0473 .\nSumit Chopra, Michael Auli, Alexander M Rush, and\nSEAS Harvard. 2016. Abstractive sentence summarization with attentive recurrent neural networks. In\nHLT-NAACL. pages 93\u00e2\u0080\u009398.\nArman Cohan and Nazli Goharian. 2015. Scientific article summarization using citation-context\nand article\u00e2\u0080\u0099s discourse structure. In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing. Association for Computational Linguistics, Lisbon, Portugal, pages 390\u00e2\u0080\u0093400. http://aclweb.org/\nanthology/D15-1045.\nArman Cohan and Nazli Goharian. 2017a. Contextu\u0002alizing citations for scientific summarization using\nword embeddings and domain knowledge. arXiv\npreprint arXiv:1705.08063 .\n\n",
-        "acknowledgement": "\nWe thank the three anonymous reviewers for\ntheir comments and suggestions.\n\n"
+        "author": "\n Arman Cohan\n Franck Dernoncourt\n Doo Soon Kim\n Trung Bui\n Seokhwan Kim \n Walter Chang\n Nazli Goharian\u00e2\u0080\u00a0\n ",
+        "title": "A Discourse-Aware Attention Model for\n Abstractive Summarization of Long Documents",
+        "Introduction": " \n Existing large-scale summarization datasets\n consist of relatively short documents. For exam\u0002ple, articles in the CNN/Daily Mail dataset (Her\u0002mann et al., 2015) are on average about 600 words\n long. Similarly, existing neural summarization\n models have focused on summarizing sentences\n and short documents. In this work, we propose a\n model for effective abstractive summarization of\n longer documents. Scientific papers are an ex\u0002ample of documents that are significantly longer\n than news articles (see Table 1). They also fol\u0002low a standard discourse structure describing the\n problem, methodology, experiments/results, and\n finally conclusions (Suppe, 1998).\n Most summarization works in the literature\n focus on extractive summarization. Examples\n of prominent approaches include frequency-based\n methods (Vanderwende et al., 2007), graph-based\n methods (Erkan and Radev, 2004), topic mod\u0002eling (Steinberger and Jezek, 2004), and neural\n models (Nallapati et al., 2017). Abstractive sum\u0002marization is an alternative approach where the\n generated summary may contain novel words and\n phrases and is more similar to how humans sum\u0002marize documents (Jing, 2002). Recently, neu\u0002ral methods have led to encouraging results in\n abstractive summarization (Nallapati et al., 2016;\n See et al., 2017; Paulus et al., 2017; Li et al.,\n 2017). These approaches employ a general frame\u0002work of sequence-to-sequence (seq2seq) models\n (Sutskever et al., 2014) where the document is\n fed to an encoder network and another (recurrent)\n network learns to decode the summary. While\n promising, these methods focus on summarizing\n news articles which are relatively short. Many\n other document types, however, are longer and\n structured. 
Seq2seq models tend to struggle with\n longer sequences because at each decoding step,\n the decoder needs to learn to construct a context\n vector capturing relevant information from all the\n tokens in the source sequence (Shao et al., 2017).\n Our main contribution is an abstractive model\n for summarizing scientific papers which are an\n example of long-form structured document types.\n Our model includes a hierarchical encoder, captur\u0002ing the discourse structure of the document and a\n discourse-aware decoder that generates the sum\u0002mary. Our decoder attends to different discourse\n sections and allows the model to more accurately\n represent important information from the source\n resulting in a better context vector. We also in\u0002troduce two large-scale datasets of long and struc\u0002tured scientific papers obtained from arXiv and\n PubMed to support both training and evaluating\n models on the task of long document summariza\u0002tion. Evaluation results show that our method out\u0002performs state-of-the-art summarization models1.\n \n \n ",
+        "Results": " \n Our main results are shown in Tables 2\n and 3. Our model significantly outperforms the\n state-of-the-art abstractive methods, showing its\n effectiveness on both datasets. We observe that\n in our ROUGE-1 score is respectively about 4 and\n 3 points higher than the abstractive model PntrGen-Seq2Seq for the arXiv and PubMed datasets,\n providing a significant improvement. Our method\n also outperforms most of the extractive methods\n except for LexRank in one of the ROUGE scores.\n We note that since extractive methods copy salient\n sentences from the document, it is usually easier for them to achieve higher ROUGE scores.\n Figure 2 illustrates the effectiveness of our\n model extensions in capturing various discourse\n information from the papers. It can be observed\n that the state-of-the-art Pntr-Gen-Seq2Seq model\n generates a summary that mostly focuses on introducing the problem, whereas our model generates\n a summary that includes more information about\n the methodology and impacts of the target paper.\n This indicates that the context vector in our model\n compared with Pntr-Gen-Seq2Seq is better able to\n capture important information from the source by\n attending to various discourse sections.\n \n \n ",
+        "Conclusions": " \n This work was the first attempt at addressing\n neural abstractive summarization of single, long\n documents. We presented a neural sequence-tosequence model that is able to effectively summarize long and structured documents such as scientific papers. While our results are encouraging,\n there is still much room for improvement for this\n challenging task; our new datasets can help the\n community to further explore this problem.\n We note that following the convention in the\n summarization research, our quantitative evaluation is performed by ROUGE automatic metric.\n While ROUGE is an effective evaluation framework, nuances in the coherence or coverage of the\n summaries are not captured with it. It is non-trivial\n to evaluate such qualities especially for long doc\u0002ument summarization; future work can design expert human evaluations to explore these nuances.\n \n ",
+        "thebibliography": " \n Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. 2014. Neural machine translation by jointly\n learning to align and translate. arXiv preprint\n arXiv:1409.0473 .\n Sumit Chopra, Michael Auli, Alexander M Rush, and\n SEAS Harvard. 2016. Abstractive sentence summarization with attentive recurrent neural networks. In\n HLT-NAACL. pages 93\u00e2\u0080\u009398.\n Arman Cohan and Nazli Goharian. 2015. Scientific article summarization using citation-context\n and article\u00e2\u0080\u0099s discourse structure. In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing. Association for Computational Linguistics, Lisbon, Portugal, pages 390\u00e2\u0080\u0093400. http://aclweb.org/\n anthology/D15-1045.\n Arman Cohan and Nazli Goharian. 2017a. Contextu\u0002alizing citations for scientific summarization using\n word embeddings and domain knowledge. arXiv\n preprint arXiv:1705.08063 .\n \n ",
+        "acknowledgement": "\n We thank the three anonymous reviewers for\n their comments and suggestions.\n \n "
     }
 ]
\ No newline at end of file

From 0242699ece6dccfd63d15a9b9769156ffe1ae6aa Mon Sep 17 00:00:00 2001
From: priya-mane <priya.hm@somaiya.edu>
Date: Mon, 26 Oct 2020 19:47:42 +0530
Subject: [PATCH 3/3] Added comments to the code and requirements.txt

---
 .../Research_paper_latex_parser/README.md     |  6 +--
 .../get_details.py                            | 39 +++++++++++++++++++
 .../requirements.txt                          |  5 +++
 3 files changed, 45 insertions(+), 5 deletions(-)
 create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/requirements.txt

diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/README.md b/Scripts/Miscellaneous/Research_paper_latex_parser/README.md
index 136d5a395..a50fd67fb 100644
--- a/Scripts/Miscellaneous/Research_paper_latex_parser/README.md
+++ b/Scripts/Miscellaneous/Research_paper_latex_parser/README.md
@@ -21,11 +21,7 @@ The script returns a json object containing following items for each research pa
 ### Prerequisites
 
 ```
-pip install os
-pip install json
-pip install re
-pip install argparse
-pip install tqdm
+pip install -r requirements.txt
 ```
 
 ***
diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py b/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py
index 46673f5a2..e48f5d21f 100644
--- a/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py
+++ b/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py
@@ -20,6 +20,11 @@ def get_elements(self):
         data = self.tex_data
         data_dict = {}
 
+        """
+        The next few lines extract data for the section tags specified in the latex.
+        Regular expressions are used to separate headings(h) and the content(c).
+        The heading and content are then added to a dictionary object.
+        """
         sections = re.findall(r'section{(.*?)\\', data, re.S)
         for obj in sections:
             h = re.findall(r'(.*?)}', obj, re.S)
@@ -27,6 +32,11 @@ def get_elements(self):
             data_dict['%s' % (h[0])] = '%s' % (c)
             data = data.replace("section{" + obj, " ")
 
+        """
+        The next few lines extract data for the begin tags specified in the latex.
+        Regular expressions are used to separate headings(h) and the content(c).
+        The heading and content are then added to a dictionary object.
+        """
         begins = re.findall(r'\\begin{(.*?)\\end', data, re.S)
         for obj in begins:
             h = re.findall(r'(.*?)}', obj, re.S)
@@ -39,14 +49,34 @@ def get_elements(self):
         return data_dict
 
     def get_author(self):
+        """
+        The Author tag is a specially mentioned tag in latex format.
+        Hence the Author name is extracted from this tag.
+        The user can choose to specify the tag as 'Author' or 'author'.
+        Hence the `[Aa]` is included in the regex.
+        """
         author = re.findall(r'[Aa]uthor(s?){(.*?)}', self.tex_data, re.S)
         return author[0][1]
 
     def get_title(self):
+        """
+        The Title tag is a specially mentioned tag in latex format.
+        Hence the title is extracted from this tag.
+        The user can choose to specify the tag as 'Title' or 'title'.
+        Hence the `[Tt]` is included in the regex.
+        """
         title = re.findall(r'[Tt]itle{(.*?)}', self.tex_data, re.S)
         return title[0]
 
     def get_ack(self):
+        """
+        The acknowledgments tag is a specially mentioned tag in latex format.
+        Hence the acknowledgments text is extracted from this tag (spelled without an 'e' after the 'g').
+        The user can choose to specify the tag as 'acknowledgments' or 'Acknowledgments'.
+        Hence the `[Aa]` is included in the regex.
+        The user can also choose to specify it in the singular, like 'Acknowledgment' or 'acknowledgment'.
+        Hence the s is made optional at the end by writing `(s?)` in the regex.
+        """
         acknowledgments = re.findall(
             r'\\[Aa]cknowledgment(s?)(.*?)\\', self.tex_data, re.S)
         return acknowledgments[0][1]
@@ -101,23 +131,28 @@ def purge_equations(self):
 
 if __name__ == '__main__':
 
+    # Define description of the script.
     parser = argparse.ArgumentParser(
         description="extract title,author,abstract,introduction,results,conclusions and acknowledgments from given set of research papers.")
 
+    # Define inputs required for the script to run.
     parser.add_argument("-parent", help="enter path to parent directory containing all research papers",
                         dest="parent_path", type=str, required=True)
     parser.add_argument("-output", help="enter path of output file",
                         dest="op", type=str, required=True)
 
+    # Parse the arguments received from the command.
     args = parser.parse_args()
     directory_path = args.parent_path
     op_file = args.op
 
     all_data = []
 
+    # Store all files from the mentioned directory.
     all_files = [f for f in listdir(
         directory_path) if isfile(join(directory_path, f))]
 
+    # Read all the files and extract information from each file.
     for tex_file in tqdm(all_files):
 
         p = os.path.join(directory_path, tex_file)
@@ -125,11 +160,14 @@ def purge_equations(self):
         with open(p, 'r', encoding='latin-1') as f:
             data_lst = f.readlines()
             data = ' '.join([str(elem) for elem in data_lst])
+
+            # Use clean_data class methods to remove images, tables and equations.
             cd = clean_data(data)
             cd.purge_images()
             cd.purge_tables()
             cd.purge_equations()
 
+            # Use essential_data class methods to extract the required data and store in json object.
             ed = essential_data(cd.tex_data)
             d = {}
             d.update({"author": ed.get_author()})
@@ -138,5 +176,6 @@ def purge_equations(self):
             d.update({"acknowledgement": ed.get_ack()})
             all_data.append(d)
 
+            # Dump the json output object to the output file.
             with open(op_file, "w") as outfile:
                 json.dump(all_data, outfile, indent=4)
diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/requirements.txt b/Scripts/Miscellaneous/Research_paper_latex_parser/requirements.txt
new file mode 100644
index 000000000..617464885
--- /dev/null
+++ b/Scripts/Miscellaneous/Research_paper_latex_parser/requirements.txt
@@ -0,0 +1,5 @@
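+# os, json, re and argparse are part of the Python standard library.
+# They must not be listed here: pip cannot install them, and the PyPI
+# package "os-sys" is an unrelated third-party project.
+# Only third-party dependencies belong in this file.
+tqdm==4.29.1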
\ No newline at end of file