From 695a9fc42b85451e3527aa2bac545a2dc7ceb9aa Mon Sep 17 00:00:00 2001 From: priya-mane Date: Sun, 25 Oct 2020 21:38:50 +0530 Subject: [PATCH 1/3] added python script for a research paper latex parser --- .../Research_paper_latex_parser/README.md | 76 +++ .../get_details.py | 141 +++++ .../Research_paper_latex_parser/op_json.json | 47 ++ .../Research_paper_latex_parser/papers/p1.tex | 91 ++++ .../Research_paper_latex_parser/papers/p2.tex | 50 ++ .../Research_paper_latex_parser/papers/p3.tex | 63 +++ .../Research_paper_latex_parser/papers/p4.tex | 87 +++ .../Research_paper_latex_parser/papers/p5.tex | 146 ++++++ .../Research_paper_latex_parser/parser.ipynb | 494 ++++++++++++++++++ .../results/Capture.JPG | Bin 0 -> 24998 bytes 10 files changed, 1195 insertions(+) create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/README.md create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/op_json.json create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/papers/p1.tex create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/papers/p2.tex create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/papers/p3.tex create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/papers/p4.tex create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/papers/p5.tex create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/parser.ipynb create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/results/Capture.JPG diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/README.md b/Scripts/Miscellaneous/Research_paper_latex_parser/README.md new file mode 100644 index 000000000..136d5a395 --- /dev/null +++ b/Scripts/Miscellaneous/Research_paper_latex_parser/README.md @@ -0,0 +1,76 @@ +# Research paper parser + +The script reads latex files for research paper from the given directory and extracts 
essential information from the latex format. + +The script purges unwanted items like - +* Images +* Tables +* Equations + +The script returns a json object containing following items for each research paper - +* title +* author +* abstract +* introduction +* conclusions +* results +* acknowledgments +* Scrapes any other title defined by the user too. + +*** +### Prerequisites + +``` +pip install os +pip install json +pip install re +pip install argparse +pip install tqdm +``` + +*** + +### How to run the script + +``` +python get_details.py -p -o +``` + +Example : + +``` +python get_details.py -p ./papers -o op_json.json +``` + +![Output](results/Capture.JPG) + +*** + + +### Results + +A sample json object created for one research paper. + +``` + { + "author": "I.M. Great and So.R. Yu", + "title": "A Sample Research Paper", + "Introduction": " \n\nUsing latex is pretty easy if you have a sample document you can follow.Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed volutpat ornare odio et faucibus. Donec fringilla massa eget auctor viverra. Mauris a imperdiet est. Cras tincidunt nulla ut elit tristique ultricies. Phasellus nec orci vel mi suscipit maximus at vitae tortor. Vivamus sed libero vel lacus aliquam rhoncus. Ut in lacinia nunc. Nullam quis mauris leo. Phasellus vitae nisl condimentum quam congue volutpat. Quisque et dapibus ipsum. Curabitur fringilla pellentesque elit, non posuere purus malesuada id. Pellentesque rutrum vitae urna eu mattis.\n\nMaecenas ac congue massa. Quisque a sem turpis. Duis et diam ex. Suspendisse et enim interdum, sodales risus eu, ultrices est. Suspendisse eu odio enim. In vulputate odio porttitor tincidunt vestibulum. Praesent tincidunt ullamcorper purus, quis semper felis volutpat quis.\n\n", + "Results": " \nIncluding figures, tables, and equations is easy. Latex also permits easy reference to document elements (figures, tables, sections). Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n\n \n\n \n\n \n\n", + "Conclusions": " \n\nMan, latex is great! Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. 
Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n\n", + "Some_title": " \n\nTest title for user defined section.\n\n", + "user_defined_title_for_begin": " \n\nwjlrhfwer ljqr flwuer j rlferfurl u airlf aiurf uoiruf iuoqir oiuqr iuq woe\n", + "acknowledgement": "\nThe author is grateful to Donald Knuth for inventing tex, and making publication quality typesetting a reality for scientists around the world.\n\n" + } +``` + + +*** + +## *Author Name* + +Priya Mane + +*** + + diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py b/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py new file mode 100644 index 000000000..6ad39f48f --- /dev/null +++ b/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py @@ -0,0 +1,141 @@ +from os.path import isfile, join +import re +import json +from os import listdir +import argparse +import os +from tqdm import tqdm + + +class essential_data: + """ + Extract essential data from the tex document. + Essential data includes - title, author, abstract, introduction, conclusions, results, acknowledgments. 
+ """ + + def __init__(self, tex_data): + self.tex_data = tex_data + + def get_elements(self): + data = self.tex_data + data_dict = dict() + + sections = re.findall(r'section{(.*?)\\', data, re.S) + for obj in sections: + h = re.findall(r'(.*?)}', obj, re.S) + c = obj.replace(h[0]+"}", ' ') + data_dict['%s' % (h[0])] = '%s' % (c) + data = data.replace("section{" + obj, " ") + + begins = re.findall(r'\\begin{(.*?)\\end', data, re.S) + for obj in begins: + h = re.findall(r'(.*?)}', obj, re.S) + if len(h) > 1: + continue + c = obj.replace(h[0]+"}", ' ') + data_dict['%s' % (h[0])] = '%s' % (c) + data = data.replace("\\begin{" + obj + "\\end", " ") + + return data_dict + + def get_author(self): + author = re.findall(r'[Aa]uthor(s?){(.*?)}', self.tex_data, re.S) + return author[0][1] + + def get_title(self): + title = re.findall(r'[Tt]itle{(.*?)}', self.tex_data, re.S) + return title[0] + + def get_ack(self): + acknowledgments = re.findall( + r'\\[Aa]cknowledgment(s?)(.*?)\\', self.tex_data, re.S) + return acknowledgments[0][1] + + +class clean_data: + """ + Contains functions to purge all unwanted elements from the tex file. 
+ """ + + def __init__(self, tex_data): + self.tex_data = tex_data + + def purge_images(self): + """ + Purges images from the tex data using tag the '\begin{figure}' + """ + imgs = re.findall( + r'begin{figure}(.*?)end{figure}', self.tex_data, re.S) + start = "\\begin{figure}" + end = "end{figure}" + imgs = [start + img + end for img in imgs] + for img in imgs: + self.tex_data = self.tex_data.replace(img, " ") + + def purge_tables(self): + """ + Purges tables from the tex data using tag the '\begin{table}' + """ + tables = re.findall( + r'begin{table}(.*?)end{table}', self.tex_data, re.S) + start = "\\begin{table}" + end = "end{table}" + tables = [start + table + end for table in tables] + for table in tables: + self.tex_data = self.tex_data.replace(table, " ") + + def purge_equations(self): + """ + Purges equation from the tex data using tag the '\begin{equation}' + """ + equations = re.findall( + r'begin{equation}(.*?)end{equation}', self.tex_data, re.S) + start = "\\begin{equation}" + end = "end{equation}" + equations = [start + equation + end for equation in equations] + for equation in equations: + self.tex_data = self.tex_data.replace(equation, " ") + + +# python get_details.py -p papers -o op_json.json + +if __name__ == '__main__': + + parser = argparse.ArgumentParser( + description="extract title,author,abstract,introduction,results,conclusions and acknowledgments from given set of research papers.") + + parser.add_argument("-parent", help="enter path to parent directory containing all research papers", + dest="parent_path", type=str, required=True) + parser.add_argument("-output", help="enter path of output file", + dest="op", type=str, required=True) + + args = parser.parse_args() + directory_path = args.parent_path + op_file = args.op + + all_data = [] + + all_files = [f for f in listdir( + directory_path) if isfile(join(directory_path, f))] + + for tex_file in tqdm(all_files): + + p = os.path.join(directory_path, tex_file) + + data = open(p, 
encoding='latin-1').read() + + cd = clean_data(data) + cd.purge_images() + cd.purge_tables() + cd.purge_equations() + + ed = essential_data(cd.tex_data) + d = {} + d.update({"author": ed.get_author()}) + d.update({"title": ed.get_title()}) + d.update(ed.get_elements()) + d.update({"acknowledgement": ed.get_ack()}) + all_data.append(d) + + with open(op_file, "w") as outfile: + json.dump(all_data, outfile, indent=4) diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/op_json.json b/Scripts/Miscellaneous/Research_paper_latex_parser/op_json.json new file mode 100644 index 000000000..73d6b1145 --- /dev/null +++ b/Scripts/Miscellaneous/Research_paper_latex_parser/op_json.json @@ -0,0 +1,47 @@ +[ + { + "author": "I.M. Great and So.R. Yu", + "title": "A Sample Research Paper", + "Introduction": " \n\nUsing latex is pretty easy if you have a sample document you can follow.Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed volutpat ornare odio et faucibus. Donec fringilla massa eget auctor viverra. Mauris a imperdiet est. Cras tincidunt nulla ut elit tristique ultricies. Phasellus nec orci vel mi suscipit maximus at vitae tortor. Vivamus sed libero vel lacus aliquam rhoncus. Ut in lacinia nunc. Nullam quis mauris leo. Phasellus vitae nisl condimentum quam congue volutpat. Quisque et dapibus ipsum. Curabitur fringilla pellentesque elit, non posuere purus malesuada id. Pellentesque rutrum vitae urna eu mattis.\n\nMaecenas ac congue massa. Quisque a sem turpis. Duis et diam ex. Suspendisse et enim interdum, sodales risus eu, ultrices est. Suspendisse eu odio enim. In vulputate odio porttitor tincidunt vestibulum. Praesent tincidunt ullamcorper purus, quis semper felis volutpat quis.\n\n", + "Results": " \nIncluding figures, tables, and equations is easy. Latex also permits easy reference to document elements (figures, tables, sections). Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. 
Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n\n \n\n \n\n \n\n", + "Conclusions": " \n\nMan, latex is great! Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n\n", + "Some_title": " \n\nTest title for user defined section.\n\n", + "user_defined_title_for_begin": " \n\nwjlrhfwer ljqr flwuer j rlferfurl u airlf aiurf uoiruf iuoqir oiuqr iuq woe\n", + "acknowledgement": "\nThe author is grateful to Donald Knuth for inventing tex, and making publication quality typesetting a reality for scientists around the world.\n\n" + }, + { + "author": "Pratik Merchant, Smit Moradiya, Jignesh Nagda, Niket Mehta", + "title": "Parallel Implementation of Support Vector Machine", + "Introduction": " \n\nThe name \u00e2\u0080\u0098support vectors\u00e2\u0080\u0099 (data points) used to define this dividing plane. Since we only require the SVs to create a classifier, the non SVs can be discarded. 
However, it becomes a problem when the points are not separable by a simple linear plane. Hence, to handle this problem, SVM uses what is known as the \u00e2\u0080\u009ckernel trick\u00e2\u0080\u009d on the training data and the mapping to a higher dimensional space is done by it, where such a dividing plane can be found more easily. Every improve accuracy. The role of Kernel is to transform the problem using some linear algebra for linear SVM and this is how the learning of the hyperplane happens. To avoid misclassifying each training example is given by the regularisation parameter. Lower is the regularisation value, more is the misclassification. To decide how far the influence of each training parameter reaches, the gamma parameter is used. Low gamma value means points which are at a far distance from the separation line are considered for calculation whereas a high gamma value implies that only the points nearby to the separation line are considered for calculation. Lastly, the separation of the line/hyperplane to the point which closest to it is called as margin. A larger separation for both the classes means a good margin and also no crossing into other class.\nThe steps to implement SVM are as follows: Step 1: Import all the necessary libraries such as numpy, pandas, matplotlib. Step 2: Importing the dataset Step 3: Performing exploratory data analysis Step 4: Performing data pre-processing Step 5: Splitting the data into train and test. Step 6: Import SVM, create a classifier and train the model. Step 6: Making predictions Step 7: Evaluating the algorithm Step 8: Results \n\n", + "my_section": " \n\nnlwekndw lweuidwe nulei nameiude includewe oiu wede oiuwe dn eiuqwend\n\n", + "Results": " \nPSVM first loads the training data onto \u00e2\u0080\u0098m\u00e2\u0080\u0099 number of machines. This done in round robin fashion. The memory required by each memory is Big-Oh of nd/m. 
In the next step, PSVM performs a row-based ICF parallely on the data which has been loaded. At the end of this step, only a small portion of the factorized matrix is stored on each machine, which has a space complexity of Big-Oh of np/m. For the quadratic optimization problem, IPM is performed hereafter. Let, n- no. of training instances d-initial no. of dimensions p- After factorization, reduced matrix dimension (p<<)W#4KPLh z0oO~nW)(f5Rsf)`4%`C(02jbSlLXLFHEh%u)mKme6IDk;)iqL5f1UwC0Jzn&MFZfV zuA%CH45|P$Tc{7*pS9n|Zv=iL@Ed{O2>eFiHv<1ZBA}rO(vt@OthX3{@CZ^AUr2k4 z`PX_SJo=A1hPW62z~O(Z|BX}N!hhoSn^XM%s6D@B{*AzI1b!p%8-d>l{0#vCJ^?-n zK2ZriF$O+<2|*DFVL{+Oy9NLnfF=L}^Z*nCK}9zRC~Vx^oFsU89b9?LAdcpiJQj{H zUQaV8UVa`vUO-CL)5*-j&eDy++|nBAAkDtt(8A6Dg-EmO2&?m{JKeXmfvR}BSU&N7 zq-Ei4XCV$@mzBAJFXbuW33Gy3x|uO}!t5PfB|N2>{&>0ss{ZpbFO!Uv3&cu7^MTS| zTA;p3GyP>O9v&V%9)diMF4nyK;^N}Gd;+`z0^Fz;+^${@Zf2g`4zA39dB6ipR|^-Y zlN;30f#K%^&CDI$-K3e^-JuW(D>Ex$a|lG3+uTeL!p+YQG2=Fa2=a471O*|Y!u(s+++FPd=o@0eYiVx@vvhECMWMmZ%f$QVp#Qu2L_ze|PXAN*Q8J?1 zNZfa^G;_0jfGRRSr<9vdl$%dX>))oCM_g1)iuadxQoKLG_;&;S?;ZJn2`w2Zh=qj3 z&#^nY{Mr2zOXvTy*|vwu{6U`o6c6?7&kE|fD5b;0!;|9u_g(%y27g>Z=^X05ziKGz zPKN&r$L~)5C6Ir^^&75#34wp9@o(+=4cEVfz`xY^w|4zM2G`$MuB8Kt+IpZk?DY(A zAHc!F!p6eH!N$VI#lgYFCni8~41BViw+M+T$fzhO$SCem)3GyB)3Vaup+0tI%)`^mJNU)RkXNrm!{9ORV&mcy5|c8s zvU76t@(T(ps}R*S$lAL45A7YDUEMt&dxu9x$Hpfnr>2*dS60{7H@<9c9UdKjJvse$ zc7E|wE;In+PqF?g*)MVtqvS%z#KgeF`6(9~x(BLY5MyF7@?(?8KgKb0zRe^Mh)b#v zol((-$1JFMKxXbTgip>Qw8VP&Q?x%M`>zQW^uHz9Uj_S{T+;v{1{!MeFo*$J;NXIz zh+O-QcVrK8WY@YOCFp^MlF@70M>s-g#$oW~c^|b`$c7LVwuoKNHO=C@&%r$fmgRN9 zU}O_q%3!vt_z?w7Y}cZkI3~osTOTM<)PsqfTYKvFE~-X*^;3o-xqLhvnQ3_DT}Nof zYypnUFo2_oPuFlxZ_Y@b?hHnzZ&8b$k!&M2pz;wTDv`3F7{P7w1PT2*ckgtM%)Eq! 
ze4i;eV(~413b7n1N73OIeZ;i$DC6)xr1B}3gZA7fQ=+_Q{DHxm-Aj>vc97av_eZmX z6ba!a*)MhJUeK5Z%Tc;%^+R3CB&r)TN?Im@UhzL~^K%3z1P;1j>4z(fu8(pT`*m`8 zh}u%&uA6Z7KlvOqiAm$UJslwUrRbT2c2=S4;bz<&9k`OfrXzuhmiJq<&in3m&K54G z6Xj}-Qr0&~^B>2zFdT89-{UCKp@PljrbB(4$@_dhuj9-T{m8R(B2v0p0zA#k@qsQQ z!@SI*8IxtSB3^JMh>1xbcrRzZM-QtoGjlCi=(Mz;fG=Cr2=Ux;{`!$YQ{K|ut%}+J zntO{8Zb4(&SWT-}@!2N{2nwro+a3$}WWR^0z<>8`$BY8`G_}|lFy&_2Jv(j!ozdNC zUs2VjhL^fUAgqR-_bXy7gw~iESoj>70e>q+qEzl$+|QfT=6 z)hzA(#i$nWSdsKEMICA;H7M0t$*=M1I2*1M;8ZrH5qpk;#)ic zR<;0G(AwUa(MpU`3}aFY)wPb>F4LvQWRTgf7kRYyUEkP;6kV_z#D$PDB7ccscVZw^|iOuOUvq>toN zQSI&O5Usa2DQOmu`vL9$*!16St8>ntpN2m3j_i=+c2o=Z5-nILU#i7n#+e{Rd-NxC z>#5oETV1QrgY(>R?Aqjwr!!PP)>k7=zKFlacYCY|kp4FovI$QYg7AF|yEq9xFX6jv zfvZ6@iB?GJK>VyYv@kf%ieV(FTeo60IiRbZDTJtn_X_qKMQ%hc$VYtn;tj$bF7a%3LOIDxV-Xgb_ z8(yhHg!vf!5RtTUQW`(n6Z2BWOr2~M-!;UXmdUecLaoJF1)FKz#vV$K%*f6VWdSL&62!xW-~1<@eN^pRyI{Sb3nWT~JO|~? zZ28e8$j;=n_h)unI}CX}dMKAw9uuos?Bay?vD89oY;oLO&E1iJrtX9JeCNE zjF5cR66bmTabPJXdZ6VyN7-%v!PA&)z(4yMSaI|J5Oyf~{ThJEe!Bo)iE1G3Zdrnm z?ak6rw3?o2BN{cjrNfe1H8#t!eA+7UYs$tHQC`5eq9OemPoEDOouz{wm`NV5AQQgs$6JC!w z+N^jtc3n*NR_H;b9O8o|3C@2uqk^sM}Lsn zsD2H|PJ@43p#+Z{8k)HnUz)hsH3y}-Z{N5r&*%OB2Zp&90BdZg+r4? 
zo`F0Izizj`5W#w{uGgbha-bvoNQ*95oNQq+J)8v@g)=5J9>UZm*58q%Q6nICbRZa* z{yzU#urnk%dAI$K%k@sK5Ncm*x2bV?t9?~x9v-Znrdc<><{rIZGGuM(5zXpmWFniL zIl^Of?gEPVp$Eu7M1DzD^{9IfQzJQFWvJOZyq@SGm5zuzv6}o5DI`E)`mbRAiZ)N& zRSnhWB3IXZ+cfmg^VDMORS~m_`D%{29~P{P7B?7gzI&FkYMR!VG-1TXyBnV1_!TBY z%gVbm0!lDo`jPRlm#-2nF<7xKnYGEY(Bzly#v&ybyTcH3kH#+u`e}1&60PKpdbMI1 zB97 z(+|~~Rn9fqHfBP+)bG|DS#CHA=zTkqIY_)a;n|%*YV`c}q59nFgn;oT+In=hX=Psk z=wtWZCjCE}i#5H7fPJjxV{$d$VKyFwzDGLe!`^K zR)oqZF2>XiCe`Vc*=3d9V2A5+@SN@{s*@#L`U*)ryN98rhx{!tTZys^;?k%U!Yo;|7Z}Y5C+)>#XK7Z}p z!|qIj(WyF3TQcz)vE6X#R)I)bcCQ)UcT0e=@-WX)EvcS=Y_MHRfE<2*YVyn4k^PTL zE0)#yjl~5&0|3Z@r&_e?{@jFEGbUT%24r%e9oXW;t8~)6=wj zMN8>7r#}{|c}Dl!(PkR{NMpQp4rj^+1}HWhX$7~7a>UYZ;%!RAr9zpE2|DWZO73|3 z9f4zAWIcih`-0y}^K1NKXqA68#Az)Pu`P>1Lv*$+p1yWtCv!I-&11!RLx>+DBfHua zP*>^v01$IeM)?ZimR21D34lSdT- zTq%Q6yE{~|bMs*_h9VofIqx{cx+xWWyvL0D{f%Di4SCbwm_8pjaU9M{b06hlcp9$~ zHP{cX>X9SPBH6A((-w17{Zu);yAd*OWbqJ*n7Hb8JDlvy?Ox|23Z&lY$ZWt8SQKU9 zahfFKd2+*d=4x{&RcdoX*%p;e^(!!Pd{I}E;($47O^ad{z(#ZO z#0Xf`UicNof&gK3LH6+1AxZU4w;LHJ9NGuKwBxOe?~UwRgu8!u{94e{`(s4GS0h zSN@ztT^*DJnbIXb$pM9NUl zc>-wHmfxYC%+|?#7LGi4(3R@?l~Y z_Vgql*22Aybdv30Xyc`t3m>L4XN6y=9{ZbSQRK|KvQOzIe5fxKj1Hm*{FwJowAo!} zQk$*f^X}KxCIUk5{vC-T*2R&po@KJnNXfulmFKD;CsiWcE~R;UPrG&zxVq&*`%yr9n7!peE5r?puuRdTdJg5rO|D$Tf-1>PJGYf zyRoA$V}4pV91O}0>54q**tzsbp?pLW6U)JovA%EGg(Rt?5V-=yEQ|}p9^EiQi*lp? 
z*xGL9LD|Ipi8&-0gL3ip-_eXN=3MXWW+hJrNjhNT^2CLI87#wWCgssr!iLVfo^^rHWsFC@!q;I?46(Ubt zDmAY}hHJcfRC5U{sfnQ-EfQ=X;{HgSF`2LX7I#oSMqeT-gqtUi3g=bvUcxp9yh=B_ zk5rxTQF}@u8)zdl$Vn0MtH$aWEa8M3$f+~TeQB{)i*cdukM~>7a86C=!pSl4Z7F%e zHfVPZXqSiP+BnI$nhqTL2wdGVtk>K%Dp6H*IWl# zMIPNz`rzhTft-#L51(jyGGL7(G}97%G1gzyNtC{ntZZ*?qE571odAdqho&0yh ziMy)DK{;e2H3_3$=sf>tCJ9K5=#%pyu=Hv8jkoCx=VrIa(E`o0ONU+FuU8_B?&vkt zjXDTR3&ToxGeZ^YZHPgfoONh%=*0YOebLbYXhr{&OR+F@%+tL)%j?wE_j?foUG7zW z-JMtTqLwcUOBsk2w|}YKEu~Eq_}=cuVme9Trf6r5E(srZI?qSZ0UVmxw!vKS631W7 z@2}1-pd#(U!r1c`h$!tsRev(nGVWOIO<#lLalK@w=$>gizSs(|oQn+WT6ZMsTz(Lx z{!FdNU3Xa#cG~%lqmEV-wmDX_;$d6;D_*)I&4ivs>Z7}k?;D4v4WMz5n_M{{P27Ro zd1RncyYlgN9;T|1Ri#6bSC(eCxB8*z)WR<#vy^%VBOFZg`b;<2#oKbfG@FD-IcIu$ zDw^|T=wa>l$x{pVF;zSuYK;yoC!}9yZZbli$uH|xCkj(XFI`T+@3Y4^b6x40PYsaC z%POdxU>6yGzjOF3%U61BFDFM$Y7Y|qY1jq)qpEmpr|kHu`BZeYzq)?Rt%cw$3}&E; zL^eFJA)$^G5POS049sh{7JDDb|G*Y;=ceR7VY~)#YhGD+3DrmVJBIF(xFeOqM4nE! zo;$;bY1?V}qa&Wu|1-2pBt;fZkmq=APkJ~x!?R=tw2qE|yL-znGJ zhx75>C#1uX(s`FZyXc+b;(RfY^NYoO^k&f`j475S3dI8nBPhYijH~`L5|4-GA{R0W zioRSt4Og@D&6B4~6h6u3H{^LYi?O}P_vcGN zk*)Oah=P%1fxk{YD#B6L@V$jtsOGAr>z5nqo^PrePRT3!QfCbx_RDh(pBI~8{MXJ!p!+1b8UZo2#&U2hZKU+K zTk5~dddXASWV44M&;fe+D)@E|m;``FWE`%ekvt3u?-$oD{-J&^Sh0*qrw-Di{g z`b(q-O~vNx{7MV&V69?#i9yFv!=|?ln3q$ym zD3XVhU6ol%5wkejBGvI`w^n&@rKP)pQ?e?N*i2jf&XaOvC6Y>d)MLU}HU9?AyWC~| zs0;`NM$IyoZVhVP&KS{Msz>cB^35Y)%tDoWSa{6zY_hL6f;HO?;ybnLxh{N*lL*;X zvC=jzQ~K{=G?=lf5Tsb!P__=(G@xGKFZM)A2k$o38uTkHuQ5i=S;c1G>foygazzle zTB8@^BFc6auL1i;(S_soo$xhl+2fH_026a}g+uf7S`TgrXd!g<^#~#_O@P{ahS6|r9E`AIEhV_(14=6fv@N@_>hu?M z+!Mm#k*a(9il9 zVQ&lf9kkMlx^W7~klA35vMJ9jo$8_GQFpd9Y-D6{XWL8N+7AVorW_b=m4TH&eSXwi zsy)HIwnAtHUQFLx){ZRiRoXn_BLTGTYQ2AmGC1#K!K4+BTT0qt`o~yjY)*Duw?+uw zB64`8kr??dlEZw5oT8na(%3a!?(UNZ$eP*_({Kd@jA7O|H_MCry(14xk|K4kW39qh zh7d8DG%TyXBN2`=w>)dUsg3MdHVi$`#@{B^vz%?4tQh>bu zDs0Krbhy;f-ULJ#FH)~6!FA~Zv3q|xS`;K^(hvq=yC(lrmnwNVK>)R2?v`+6&6Wsjbvt9NgNq?9Q+YV=u29^F zF1OO6#Bko9Nm?x)9@=~Aj+YxmNIQthq)nH8*OI=(oBfwr{560EvJowa%$RJQ7AfKl 
zfwfehaIC$wP%9#ZD+X-%6l2!Pij1+RV9W@|tcY=!G#dGkrD!OGukDA;UhRoLw1qtV ziuR50Sr=#7KV6CCluJKE?o@sD8_b!Svl;2+w7U~yP#i+u=Xm2TULSkJ+5V2`*qA(LJ~Ym8w-jEzpbl4I zLh5!@y^Ln2Kr63Xb#J4a%zU){=o&!L@<20giEBXL&lk)dq8$Q>MUPV=#CY${;$47A zyyIDVDtKc)d5KnG<^C&;B!i-KnfVXBV$(|+rPw{A*sUMiD5N%m>@s4O=*qXyH)Y5o z_Ha1eebb1X`{-_hyCdjqr45!u!%hr*B+b}@t)W)Mj7VghZ=qi0C$%DemX(4XemTni z3?ZG{hF;{~^ox^RKjv*tP-$sbapH+o*jZv3)i_uVxCS{N4dp9iCoWxXp9@6_3w!Qz zMI1II5jzSZ-d}x*EbK%lsqK2?ob*pkddzg+#putF9$%ySQr+Hnv*O&B7F2Ce^tpZ- z`}_*VNWIsXQ3Bn8XZTW-Vok5K${VUHl5pLM60>+MiK7j9?t3MBtlsVxZur$%+Ub42 zcc_$}3$}wa(}vh3P|X{(ardbXRrC?^aR^`wkkULJdaqOOSU1p_!~HEd!>S@i30Z^C z%0am6% z4ptu>jqm+xiGI~#tsHA-;hdJT?Yih?d6@f>Pf=`nRW*D>-%cg6q`v!gW92irxPul= z4VXP?d3*TR;N!1x(APyw$#v`RbaH+*4u2N|>F~-S+w}h>;y3gwV8DOYYFy{_6`5Tc z#pm2H$34V3!;5^&5M?e8zQ+L?M0w)@VoCQpf4m$>^yNZf%`BjIo(N^sP9zUVWW zq69~F5+sZi%JK~Vbueq_7hul@E*hc}2C-RHy7#3WOJxgp#|91YSciRBn9~5atkmr? zAI8=l@61A2(X6CI5kV6M<8N@X8EJ7s?#g;dL83gnoK&QI4M$EQ!JuQ=$`k4a zUP0=seo5s9T0um_iO_hE?U()U{P#A~NJsWdcpNg|Fte;QP8K$`I>6)}yF)k~0fctW z^T~h&n5|p=5Ifew>Zml?$Jv8&WY?nOcDneuQsSH5ry7Jjc<25vOefyvy{^vRW3PRq z+X5<;yWN9)L}f^Lqg&jbNIc(kteV<@(!Qi@F9VS*V^934k0y~qT*JxpB%_J@2h0w8 z@2Vwr5Y$vRb|U-g*w9cU-P%h)2!4y@n2J2ynZEN?wCRA%)9I|-EiiBm^f2zsNihMW4<`qM-_N!dzOBn%iE9cD zeF0-IE&HGRzq`CSGj66tMPq3VuD*9)1Kk`LzZ?CA;I|h1UI@Q!!EbZ; z+avr-PH{W*tmztH8GMX#!1O<;s$;JKf{Aw7^;Ym^t{h@YXSGxLDg{ZxUe+)r>nkEhn>bx zj+a8abUSVX!-AtDA)zE=L)A+!seHDK8RG+UGsvf|IOBV}Pr5%{0|!(M877U_08yj5 z-P(sxQa*mjL#blxfenZ3z}6icc{pf&@1!5BQElh5J>~E?_jtB#$%Ly&!#8A2!(m=f z$^IDKyt+)k5&a&LJHR)zwG_Rky2N2br($GOPyNPRZ1J;P6GTYl5ZZ=St31m!;Q8b3 z&dBjKAfVqg5xpg$Ve~lF(ZQ;D)V})e%ci-n?DnqbBo$FCYc-4yOvS{zduun|9+Dm` zY+jMo+3;4!LpNTg?Q=T&g&n?a;aSIUl_=v<{vuAD>iD!N4R2pqvM&@iUxGcOErYDSw-`?@O7-3j)0&3c#et_c5t_x99^|wAP&BS z47Oh}k23m{kf`vQgZw@fe-k+%cbvP+s&XC_r2D2}0E|;S#`5N261PIf?4+7|q4H^w z0qLjs+X+mTAxcXMYw6GAKbk9~Ij@{fH|JRTu&$+LxI4P@JpZ_y7`wpE&PDVHEVOOp z__Px4#dRXt=TRq~nptZpkmh%Yla^OjkfFLTk+)%HaPq^SxqQ?uRian+mdTgX&$c>m 
z=M6i?J7v@*Xm>Kfyk}=$7Tm=K#k!R%2e_vnRt@qb^8~P|39HwX=f$@SSjl?d=hy-+ z7kp&7y~vtAomP3(RDlQ+IGaW8IVs`xoM67`%9E(cbZg6_zRWTwQY=fE*LF>>OL5=G z;b`zTem^$8k#p7-?KViZIo#*$Qlsv0pft3Dte`V5hT@8g^H)IvvVyVMy=t zcFBdi)x|%AnqDfMV;;YrEM8Lih#3*^{!M(?DvvNs>1cpjt>Lm@_kDLPW|bg|Ck3@u z3X#}y(naJ>Oq<}mp-@e45oU-s_brop(v3BY8VQZG^a-;juzr;%Z=*m`lJPwCr5$xn z_|y7u_mKycicMqkEX2)b?#5xcqL?vad$KDEB0-BBOM=|16Z*Z(?sbTU zSfa}#5Mf!AM!f*bahlXrLKV3M{1K?=*+p{>)Pg&!4KR;KtsrtyQ*Ho5j z!OR~Fro_jpn+b+L`}&Y@#c93q@`^;-bw}3xYHC#GE4fm*(n2r-*34HKZcv{1_&t1`gEctu zkXityy%fkCao`-*RA?I`ut{~XoyJgQuz*n5`9}A=0`nd?PCkOvycM>Ho>$5VkAKpV zw2ZgjTnD$ad>W*j82;c+EWYbo9DBDS`h)0gf5%IY)2@uu7T(@V<>~ZKQz^T1DJxRT zX+e@b`r^$TP#BA`?UpuQqlb)r`E20YEu#qg+URoEZy`6Ij8{kr`(hen98$!nR@d|U zCZ_a`SZP?w`q)LKR=jyWA&K+ay~r+c4_57SW?jqB&*i~Wa+g;0mWw}?z4u-Tc^2z; z{hTf1-UdjW!7CE6m7uAQG&g&+A|^&C-3D*i)-S}B_j`AJ=oMzirLfWz0JT2k5SR5& zN_5*5e0)$KK}zmwJUx}T-~4%`q?gP;F(w+%kgWA`a*(pr@Azr+C_@nSOh_UF&(L0% z%Whq$G>E-GTL@kzkw2@BJwfRFkcYHjK(@ioch~l zC-p?FpFSNC`hnD5N4u?{XG#GGd5IJ8e+y+tnUXApv4O~Zq5hlG3`uu_mTz1>pPV&~ zSX`;V8O5h-8{Kqhl+Nj1E>Y;`cE*&THp=vh?8qFI+1ZfwNpU;veA068AZ`RYUNbyr z2=-^;@06W9ofeOlKW8KIu#lajy_uuse`l^Kxyyd1b7U%e?+ zbXcgd0U6?htcE&%T}Lv!o*EldqwP@;702`-ONTDeN@_OMyi%KJ9%WGraWgPw`y_X( zfYLwR&(AQn2PZ)3UWzg2B~Jx;o>}H*P5k4AI-3_WCN6K~pD+T?r#=KmNjmF^h-yqf zg4C8HQ7H^ZdsK4oFTO|Q$wrA67+cCRXAZj+Sxq>87v~bhb7Y)?v7ThM2|dZ54|dSa zH^)C&e)eZ-IQJG zGU*fU7bZtHw+|#a8t*QB9a(;>KQjx>l-#Ht^&4GZWO0#DWMXQj@coqf(rbfku%rH= z$<6m;*`K-A>Zx8$(I(jiHes3}O+f)27lc0Amhs`dJ4y*mVRJE$qI6?v@$}#dYQ@1) z!L?ER_dSz6LX=+ph&JVJKmVX%bu2a4@O!PhZ598V) z&wC&7w{|IUVvlsj*Z5D)8;bP*h=AC^wr1mDQ;x|gbP3(fw$uxxc2sHaru_>1rFYaN z9+kLqs`T_0*J^@+SK@RZ(v4)xwoMxKf6~Z|mOHO@gp%Lf{!u#2ttj+`Icadfi_C~l zak|^$uDCMo#c0W5AIyrP+DKsfVthc2BiF}(Q>XE*>afbo#~>W*y>QFqh`<3w%k~uz z(=c;oym~dtTPt4D^gA7y>&D3)Mlj#SdZgsq)1AlPx1u(aF2Om&Nu5l|93@=gIc5Gr z38bE^D-m5+H#e}7GL)%(RUcAjvTb=WJLLFk5UVSXwzGeB{e#WQ5#o|w#=JDSBa+gc zqqbGr$wkttSbeF;pzPHJTZy2unKhvYy={XEh8kz0N+kNKqebv&03h{sP}*#n!focz 
z3zh+$f9}flxk|e2NTmz?jc9aqda|^ySdraIlh%wP%$2sJzg92K#!_jsw6CF~sQUnY}4oVYU(q8Q)!Rtf@p8g!g$>Q`a*23pi7U^xfznQd%ZfQYj46`I+CE z-7qzCaaDflMZK1oLt_} z5in>&rdmK1_Q{-DE0#tDc_u)tO%r!e9Mg&aF^*gE72+C@4!bmwo%oIv zslG|5JNe}fSU4}<$1%HI@HX@FD(c+GD8V=N{88_k?|f!xKBz8P_Vl?l_Hus$lZ;RB z()eS=8-vahh`047=wwED^#-ymEIccxl&$aeve21l#S^O5N`|8{^rVY;z8S;!Rj44^ z&uZer-5lb?p0`(rVKVc4az5HfTRmMaLA^wlwyZvUQk{O1cOL7m%U$B0JmZ7cc&S^; zqGH)2V~)*ez#JSMY=n75JhF>2sxU9j)BQ)6ZX#U^_?|rxb^eW$j7~%c7H~;;k1x@rO3x{a&nQ(?g>yl5%tI29%9^NYW(wxDhl?qwv@gy~3kjnq zOJiNh7S5crhDxV!g?JL9DIxL=A1pR;kmQF70DdP*$yV|Wno1fwa_D`G#sV{_-X~Hl zQ$R_;n7g#=ae#A#HRpEMql{(YF)PtK(;}&?GnI`KoEyv!l?3#(<$|Aye~8LP?T>wu z5pJvso4z5?)zvjlnPycJ35jOoN~Vxa*iqq(4p25RywyEg=eh{uF8zAYU+}ujA~ueF zc-AQ74dr88b0UwsI8GFU4^8`B53Z;&o6;YTSt4qx5kX(PY92iNpxIa6WwPeft0efw zS&ux8gZ$2I>8S8LdD_mZ8AG`ealOZ{*;#sZ>Eq8-^0tfxnnsJQtL)S`J=u^U3lQ9P z^9MV95fytKB(CXYLh11*E#CoS|M7(b(4G`$p5Bb%!tAz{IzO8V_eOOMN#ov7DDI>o z)-*K^ciVEc#X(nm@fTxSB(m|6dw1Fh7d+V}iUo1W)xL3b0Ic{4e&LKCIoYThULA4Z zf(a9dzI()2M-GrcPxsj$_&9rwqe0KwmEfJM8zLR4P#kjX8`R*mN6!90^>byYa z(X5bt)t35(a!>2%DK5MeibSo#4vIaNdW&@$l1Kd7ZnAXUKg#g8M6}68jGs@q3gHP- zvf4nv3Gr5KyipQ9uDaVRlC{HuIY~B~c3WgPIza5O0(#D_moQ%YnxtKrX&yy*NY5%(eOh z2{yKB8ZZEDZkg~u!$74fbqpO^#k}9FnRbQ7%;Rm0ETu5d0F`kAM#=rGAdI~LWm-GV zc5l(|EHaGM!&jfVn?LJ%eQF3=QgiG1T6ClMhSl&OcrpLNIwAihbM2x)Im><|w)l=KTx|QvtOOOgA3@a^t%35` zTFXnOB(;}89f-~H!pI72*5G(*5NE5obMO=TS8%p0AreEQST&A)<{(KDM;0C5m(PP% zuvEm`>G-_*UA-Swh(vR(i)GY?tTPjye#2K)eU(CUOCA#)mmx@Ew4i8;yRQ$G%yyd8 z+l$@-shfUb9eug8x>3g*7~z^UB5w<`xx)0-rex;2Km-W7%U<;Sqr7SmezY_;F6pc%rdb`+gwTpTf3OZ$oAp*ifENyu#W^9&R+hJGVco#PALVmH%7QK6r>v+k@p6}P!H?)GHcNzd ziaS+o+IJ1fOdlQ3P!5upHe%T2s@>a~9%(J|8%S zjP&$)bNczXKqsM^RUbo?@N0Mg%&9xh+=Kt4)Ob#J)N=ihV*gxJx@#dVc$Wz$>M@OK zc_wE-G1{8UCX0`G$&CLtif2yz^Ur%sqhI%LK_^7DE4eBZNj<0&Lg*Jlb!Jzi1a*|( z@Mc7D6ua$;4B+ljJqw}GEzWsYRJ3QVT3JD{!ylos>$u%Reho~df5)&^Jz#x_d@b~D zJXu4eKB#|fW_B{!%R?#8$$8yU3BUZFfiw|k>Ij0G*7(8(F%zl3MUhGPfJ!G3mG830 z^#Y52o=0MZ43mygM4~e{CkGm*eOtf6w1UzIGKk3+JreGX$$9L>|-RwwsJ 
z(@Ppn>?MzYm!yk6^K(e-#t6IPY-_LY`Hc3C_90uMRQc@OH1t@4Ap9mjzP%0+o|8ok z<}-WO@hUfXJRQa%y1Ju%zfH|9VT;w9euWWWp|NXEnSw196*B8@xAnMqaiNqb-1%Jz zFY!e$rX?9ZPzhk)+5Wg*IOUnjh3M;}_$n>D)W|$AKS60q?-IpIf$!~sBKueJWy>J1 zCHKmMrkc_bWAN$g7l9=^?K4Vhv6IJ+=Y|&1$O4S{`uz|;I*^oE)^#V%cXJyW!U(tN~1Fxxv{Egennee&XR0e#21*>{1m{UVJ6`zvMNUyu8r5`%x- wVfCc-+2}SoCoaAQc9_yHcvKH0{znE+{wD@>evj#YW}xl&y!>nPay|9G0P%4$5&!@I literal 0 HcmV?d00001 From b3388d2d1e285f0b7dcb46b1c8283114eef53980 Mon Sep 17 00:00:00 2001 From: priya-mane Date: Sun, 25 Oct 2020 22:22:23 +0530 Subject: [PATCH 2/3] replace open for file access with open with, replace dict literal --- .../get_details.py | 37 ++++++------ .../Research_paper_latex_parser/op_json.json | 56 +++++++++---------- 2 files changed, 47 insertions(+), 46 deletions(-) diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py b/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py index 6ad39f48f..46673f5a2 100644 --- a/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py +++ b/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py @@ -18,7 +18,7 @@ def __init__(self, tex_data): def get_elements(self): data = self.tex_data - data_dict = dict() + data_dict = {} sections = re.findall(r'section{(.*?)\\', data, re.S) for obj in sections: @@ -122,20 +122,21 @@ def purge_equations(self): p = os.path.join(directory_path, tex_file) - data = open(p, encoding='latin-1').read() - - cd = clean_data(data) - cd.purge_images() - cd.purge_tables() - cd.purge_equations() - - ed = essential_data(cd.tex_data) - d = {} - d.update({"author": ed.get_author()}) - d.update({"title": ed.get_title()}) - d.update(ed.get_elements()) - d.update({"acknowledgement": ed.get_ack()}) - all_data.append(d) - - with open(op_file, "w") as outfile: - json.dump(all_data, outfile, indent=4) + with open(p, 'r', encoding='latin-1') as f: + data_lst = f.readlines() + data = ' '.join([str(elem) for elem in data_lst]) + cd = clean_data(data) + 
cd.purge_images() + cd.purge_tables() + cd.purge_equations() + + ed = essential_data(cd.tex_data) + d = {} + d.update({"author": ed.get_author()}) + d.update({"title": ed.get_title()}) + d.update(ed.get_elements()) + d.update({"acknowledgement": ed.get_ack()}) + all_data.append(d) + + with open(op_file, "w") as outfile: + json.dump(all_data, outfile, indent=4) diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/op_json.json b/Scripts/Miscellaneous/Research_paper_latex_parser/op_json.json index 73d6b1145..43b88c6f9 100644 --- a/Scripts/Miscellaneous/Research_paper_latex_parser/op_json.json +++ b/Scripts/Miscellaneous/Research_paper_latex_parser/op_json.json @@ -2,46 +2,46 @@ { "author": "I.M. Great and So.R. Yu", "title": "A Sample Research Paper", - "Introduction": " \n\nUsing latex is pretty easy if you have a sample document you can follow.Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed volutpat ornare odio et faucibus. Donec fringilla massa eget auctor viverra. Mauris a imperdiet est. Cras tincidunt nulla ut elit tristique ultricies. Phasellus nec orci vel mi suscipit maximus at vitae tortor. Vivamus sed libero vel lacus aliquam rhoncus. Ut in lacinia nunc. Nullam quis mauris leo. Phasellus vitae nisl condimentum quam congue volutpat. Quisque et dapibus ipsum. Curabitur fringilla pellentesque elit, non posuere purus malesuada id. Pellentesque rutrum vitae urna eu mattis.\n\nMaecenas ac congue massa. Quisque a sem turpis. Duis et diam ex. Suspendisse et enim interdum, sodales risus eu, ultrices est. Suspendisse eu odio enim. In vulputate odio porttitor tincidunt vestibulum. Praesent tincidunt ullamcorper purus, quis semper felis volutpat quis.\n\n", - "Results": " \nIncluding figures, tables, and equations is easy. Latex also permits easy reference to document elements (figures, tables, sections). Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. 
Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n\n \n\n \n\n \n\n", - "Conclusions": " \n\nMan, latex is great! Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n\n", - "Some_title": " \n\nTest title for user defined section.\n\n", - "user_defined_title_for_begin": " \n\nwjlrhfwer ljqr flwuer j rlferfurl u airlf aiurf uoiruf iuoqir oiuqr iuq woe\n", - "acknowledgement": "\nThe author is grateful to Donald Knuth for inventing tex, and making publication quality typesetting a reality for scientists around the world.\n\n" + "Introduction": " \n \n Using latex is pretty easy if you have a sample document you can follow.Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed volutpat ornare odio et faucibus. Donec fringilla massa eget auctor viverra. Mauris a imperdiet est. Cras tincidunt nulla ut elit tristique ultricies. Phasellus nec orci vel mi suscipit maximus at vitae tortor. 
Vivamus sed libero vel lacus aliquam rhoncus. Ut in lacinia nunc. Nullam quis mauris leo. Phasellus vitae nisl condimentum quam congue volutpat. Quisque et dapibus ipsum. Curabitur fringilla pellentesque elit, non posuere purus malesuada id. Pellentesque rutrum vitae urna eu mattis.\n \n Maecenas ac congue massa. Quisque a sem turpis. Duis et diam ex. Suspendisse et enim interdum, sodales risus eu, ultrices est. Suspendisse eu odio enim. In vulputate odio porttitor tincidunt vestibulum. Praesent tincidunt ullamcorper purus, quis semper felis volutpat quis.\n \n ", + "Results": " \n Including figures, tables, and equations is easy. Latex also permits easy reference to document elements (figures, tables, sections). Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n \n \n \n \n \n \n \n ", + "Conclusions": " \n \n Man, latex is great! Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam tincidunt lorem luctus eros dictum faucibus. Fusce euismod libero et erat pretium dapibus. Pellentesque faucibus hendrerit est, ac fringilla urna. In porta, ante eu dictum vestibulum, nisl nulla euismod purus, ac bibendum nibh ante vel elit. Fusce diam ante, tincidunt id eleifend a, hendrerit vitae tellus. Duis pretium urna ac vestibulum eleifend. Suspendisse potenti. Aliquam varius odio in pretium semper. Ut faucibus lobortis mauris vel sollicitudin. 
Nullam condimentum, lacus quis mattis pellentesque, massa nulla cursus nisi, aliquet eleifend est tellus ut libero.\n \n ", + "Some_title": " \n \n Test title for user defined section.\n \n ", + "user_defined_title_for_begin": " \n \n wjlrhfwer ljqr flwuer j rlferfurl u airlf aiurf uoiruf iuoqir oiuqr iuq woe\n ", + "acknowledgement": "\n The author is grateful to Donald Knuth for inventing tex, and making publication quality typesetting a reality for scientists around the world.\n \n " }, { "author": "Pratik Merchant, Smit Moradiya, Jignesh Nagda, Niket Mehta", "title": "Parallel Implementation of Support Vector Machine", - "Introduction": " \n\nThe name \u00e2\u0080\u0098support vectors\u00e2\u0080\u0099 (data points) used to define this dividing plane. Since we only require the SVs to create a classifier, the non SVs can be discarded. However, it becomes a problem when the points are not separable by a simple linear plane. Hence, to handle this problem, SVM uses what is known as the \u00e2\u0080\u009ckernel trick\u00e2\u0080\u009d on the training data and the mapping to a higher dimensional space is done by it, where such a dividing plane can be found more easily. Every improve accuracy. The role of Kernel is to transform the problem using some linear algebra for linear SVM and this is how the learning of the hyperplane happens. To avoid misclassifying each training example is given by the regularisation parameter. Lower is the regularisation value, more is the misclassification. To decide how far the influence of each training parameter reaches, the gamma parameter is used. Low gamma value means points which are at a far distance from the separation line are considered for calculation whereas a high gamma value implies that only the points nearby to the separation line are considered for calculation. Lastly, the separation of the line/hyperplane to the point which closest to it is called as margin. 
A larger separation for both the classes means a good margin and also no crossing into other class.\nThe steps to implement SVM are as follows: Step 1: Import all the necessary libraries such as numpy, pandas, matplotlib. Step 2: Importing the dataset Step 3: Performing exploratory data analysis Step 4: Performing data pre-processing Step 5: Splitting the data into train and test. Step 6: Import SVM, create a classifier and train the model. Step 6: Making predictions Step 7: Evaluating the algorithm Step 8: Results \n\n", - "my_section": " \n\nnlwekndw lweuidwe nulei nameiude includewe oiu wede oiuwe dn eiuqwend\n\n", - "Results": " \nPSVM first loads the training data onto \u00e2\u0080\u0098m\u00e2\u0080\u0099 number of machines. This done in round robin fashion. The memory required by each memory is Big-Oh of nd/m. In the next step, PSVM performs a row-based ICF parallely on the data which has been loaded. At the end of this step, only a small portion of the factorized matrix is stored on each machine, which has a space complexity of Big-Oh of np/m. For the quadratic optimization problem, IPM is performed hereafter. Let, n- no. of training instances d-initial no. 
of dimensions p- After factorization, reduced matrix dimension (p<< Date: Mon, 26 Oct 2020 19:47:42 +0530 Subject: [PATCH 3/3] Added comments to the code and requirements.txt --- .../Research_paper_latex_parser/README.md | 6 +-- .../get_details.py | 39 +++++++++++++++++++ .../requirements.txt | 5 +++ 3 files changed, 45 insertions(+), 5 deletions(-) create mode 100644 Scripts/Miscellaneous/Research_paper_latex_parser/requirements.txt diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/README.md b/Scripts/Miscellaneous/Research_paper_latex_parser/README.md index 136d5a395..a50fd67fb 100644 --- a/Scripts/Miscellaneous/Research_paper_latex_parser/README.md +++ b/Scripts/Miscellaneous/Research_paper_latex_parser/README.md @@ -21,11 +21,7 @@ The script returns a json object containing following items for each research pa ### Prerequisites ``` -pip install os -pip install json -pip install re -pip install argparse -pip install tqdm +pip install -r requirements.txt ``` *** diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py b/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py index 46673f5a2..e48f5d21f 100644 --- a/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py +++ b/Scripts/Miscellaneous/Research_paper_latex_parser/get_details.py @@ -20,6 +20,11 @@ def get_elements(self): data = self.tex_data data_dict = {} + """ + The next few lines extract data for the section tags specified in the latex. + Regular expressions are used to separate headings(h) and the content(c). + The heading and content are then added to a dictionary object. + """ sections = re.findall(r'section{(.*?)\\', data, re.S) for obj in sections: h = re.findall(r'(.*?)}', obj, re.S) @@ -27,6 +32,11 @@ def get_elements(self): data_dict['%s' % (h[0])] = '%s' % (c) data = data.replace("section{" + obj, " ") + """ + The next few lines extract data for the begin tags specified in the latex. 
+ Regular expressions are used to separate headings(h) and the content(c). + The heading and content are then added to a dictionary object. + """ begins = re.findall(r'\\begin{(.*?)\\end', data, re.S) for obj in begins: h = re.findall(r'(.*?)}', obj, re.S) @@ -39,14 +49,34 @@ def get_elements(self): return data_dict def get_author(self): + """ + The Author tag is a specially mentioned tag in latex format. + Hence the Author name is extracted from this tag. + The user can choose to specify the tag as 'Author' or 'author'. + Hence the `[Aa]` is included in the regex. + """ author = re.findall(r'[Aa]uthor(s?){(.*?)}', self.tex_data, re.S) return author[0][1] def get_title(self): + """ + The Title tag is a specially mentioned tag in latex format. + Hence the title is extracted from this tag. + The user can choose to specify the tag as 'Title' or 'title'. + Hence the `[Tt]` is included in the regex. + """ title = re.findall(r'[Tt]itle{(.*?)}', self.tex_data, re.S) return title[0] def get_ack(self): + """ + The Acknowledgments tag is a specially mentioned tag in latex format. + Hence the acknowledgments are extracted from this tag. + The user can choose to specify the tag as 'acknowledgments' or 'Acknowledgments'. + Hence the `[Aa]` is included in the regex. + The user can also choose to specify it in the singular, like 'Acknowledgment' or 'acknowledgment'. + Hence the s is made optional at the end by writing `(s?)` in the regex. + """ acknowledgments = re.findall( r'\\[Aa]cknowledgment(s?)(.*?)\\', self.tex_data, re.S) return acknowledgments[0][1] @@ -101,23 +131,28 @@ def purge_equations(self): if __name__ == '__main__': + # Define description of the script. parser = argparse.ArgumentParser( description="extract title,author,abstract,introduction,results,conclusions and acknowledgments from given set of research papers.") + # Define inputs required for the script to run.
parser.add_argument("-parent", help="enter path to parent directory containing all research papers", dest="parent_path", type=str, required=True) parser.add_argument("-output", help="enter path of output file", dest="op", type=str, required=True) + # Parse the arguments received from the command. args = parser.parse_args() directory_path = args.parent_path op_file = args.op all_data = [] + # Store all files from the mentioned directory. all_files = [f for f in listdir( directory_path) if isfile(join(directory_path, f))] + # Read all the files and extract information from each file. for tex_file in tqdm(all_files): p = os.path.join(directory_path, tex_file) @@ -125,11 +160,14 @@ def purge_equations(self): with open(p, 'r', encoding='latin-1') as f: data_lst = f.readlines() data = ' '.join([str(elem) for elem in data_lst]) + + # Use clean_data class methods to remove images, tables and equations. cd = clean_data(data) cd.purge_images() cd.purge_tables() cd.purge_equations() + # Use essential_data class methods to extract the required data and store in json object. ed = essential_data(cd.tex_data) d = {} d.update({"author": ed.get_author()}) @@ -138,5 +176,6 @@ def purge_equations(self): d.update({"acknowledgement": ed.get_ack()}) all_data.append(d) + # Dump the json output object to the output file. with open(op_file, "w") as outfile: json.dump(all_data, outfile, indent=4) diff --git a/Scripts/Miscellaneous/Research_paper_latex_parser/requirements.txt b/Scripts/Miscellaneous/Research_paper_latex_parser/requirements.txt new file mode 100644 index 000000000..617464885 --- /dev/null +++ b/Scripts/Miscellaneous/Research_paper_latex_parser/requirements.txt @@ -0,0 +1,5 @@ +os-sys +json==2.0.9 +re==2.2.1 +argparse==1.1 +tqdm==4.29.1 \ No newline at end of file