diff --git a/doc/srcml.md b/doc/srcml.md
index cf462ec..3002501 100644
--- a/doc/srcml.md
+++ b/doc/srcml.md
@@ -179,23 +179,25 @@ TRANSFORMATIONS:
For instance, the XML markup can be enhanced with line and column coordinates.
Notice also that srcML has built-in capabilities to query and manipulate
the XML. Queries can be done with XPath expressions. General transformations can
-be executed with XSLT.
+be executed with XSLT. Note that there is no need to first convert the
+source to XML; the built-in query capabilities also work directly on the
+source code.

Here are a few examples of useful operations:

- Get all function and method definition names:

```console
-$ srcml --xpath="//src:function/src:name" program.xml
+$ srcml --xpath="//src:function/src:name" program.java
```

- Count the number of conditions:

```console
-$ srcml --xpath='count(//src:condition)' program.xml
+$ srcml --xpath='count(//src:condition)' program.c
```

- Output all line comments:

```console
-$ srcml --xpath='//src:comment[@type="line"]' program.xml
+$ srcml --xpath='//src:comment[@type="line"]' program.cpp
```

Much more versatile and powerful tools to process any XML are
diff --git a/tools/json-graph/src/jsonml2jgf.c b/tools/json-graph/src/jsonml2jgf.c
index 73a0bf8..c1395ac 100644
--- a/tools/json-graph/src/jsonml2jgf.c
+++ b/tools/json-graph/src/jsonml2jgf.c
@@ -94,7 +94,7 @@ static void det_coords_adjust_labels(Graph g)
    for (p = start; p < end; ++p)
      if (isspace(*p))
        n->label->start++;
-     else if (*p == '\\' && (*p == 'n' || *p == 't')) {
+     else if (*p == '\\' && (*(p+1) == 'n' || *(p+1) == 't')) {
        n->label->start += 2;
        ++p;
      }
diff --git a/tools/tokenizer/Makefile b/tools/tokenizer/Makefile
index b6ca860..9851522 100644
--- a/tools/tokenizer/Makefile
+++ b/tools/tokenizer/Makefile
@@ -3,16 +3,31 @@
INCLUDES =
CPPFLAGS = $(INCLUDES)
-CFLAGS = -O2
+CFLAGS = -g -O2 -fPIC
LDFLAGS =

-PROGS = tokenize antlr4tojson pytokenize jstokenize
+PROGS = tokenize antlr4tojson pytokenize jstokenize tokml libtoken.so

.PHONY: all
all: $(PROGS)

-tokenize: tokenize.o
-tokenize.o: tokenize.c
+tokenize: tokenize.o libtoken.a
+tokenize.o: tokenize.c libtoken.h
+
+tokml: tokml.o libtoken.a
+tokml.o: tokml.c libtoken.h
+
+libtoken.o: libtoken.c libtoken.h
+
+.PHONY: lib
+lib: libtoken.a libtoken.so
+
+libtoken.a: libtoken.o
+	ar r $@ $^
+	ranlib $@
+
+libtoken.so: libtoken.o
+	$(CC) -shared -Wl,-soname,$@.1 -o $@ $^

antlr4tojson: antlr4tojson.o
antlr4tojson.o: antlr4tojson.c
@@ -27,5 +42,5 @@ token_common.o: token_common.c token_common.h
.PHONY: clean
clean:
-	@-rm -f *.o
+	@-rm -f *.o *.a
	@-rm -f $(PROGS)
diff --git a/tools/tokenizer/README.md b/tools/tokenizer/README.md
index 6008eec..e82940f 100644
--- a/tools/tokenizer/README.md
+++ b/tools/tokenizer/README.md
@@ -8,29 +8,35 @@ This same repository also offers separate programs for a Python tokenizer
of the command-line options and have the same output formats.

Here we focus on the C/C++/Java tokenizer (`tokenize`), but most of this
-documentation equally applies to the other tokenizer program. The `Makefile`
-builds them all.
+documentation equally applies to the other tokenizer programs.
+The `Makefile` builds them all.
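
Since the `Makefile` now also builds the tokenizer core as a library
(`libtoken.a` and `libtoken.so`), other programs can link against it directly.
The following is a minimal sketch of such a client, assuming that `libtoken.h`
declares `open_as_stdin` and `C_tokenize` as they are defined in `libtoken.c`
later in this change; it mimics the plain output mode of `tokenize`:

```c
/* Minimal libtoken client (sketch); link with libtoken.a.
   Assumes libtoken.h declares the API defined in libtoken.c. */
#include <stdio.h>
#include "libtoken.h"

int main(int argc, char *argv[])
{
  const char *token, *type;
  unsigned line, col, pos;

  /* open_as_stdin() reopens the file as stdin and detects the language;
     without an argument the tokenizer reads stdin (default language C). */
  if (argc > 1 && open_as_stdin(argv[1]) == -1)
    return 1;

  /* C_tokenize() returns the token length in bytes; 0 signals end-of-file. */
  while (C_tokenize(&token, &type, &line, &col, &pos))
    printf("(%4u,%3u) %s: %s\n", line, col, type, token);
  return 0;
}
```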
The following lexeme classes are recognized:

- identifier
-- reserved word/keyword
+- reserved word/keyword of the language of the input source
- binary, octal, decimal, hexadecimal and floating-point numbers
- double-quoted string literal
- single-quoted character literal
- all single, double, and triple operator and punctuation symbols
- the preprocessor tokens # and ##
+- a number of pseudo tokens depending on selected options

For each correctly recognized token, the program determines its class/type
and the exact coordinates (line number and column) in the input text of its
starting character. All token literals are output exactly as they appear in
-the source text, without any interpretation of escaped characters.
+the source text, without any interpretation of possibly escaped characters.

-A newline is defined as a single linefeed character `\n` or the combination
-carriage return `\r` followed by linefeed `\n`.
-Line continuations (a backslash immediately followed by a newline) are handled
+A newline is defined as a single linefeed character `\n`, a carriage return
+`\r`, or the combination carriage return `\r` followed by linefeed `\n`.
+Line continuations, i.e., a backslash immediately followed by a newline, are handled
at the character input level, so the token recognizers will only see logical
-lines. Line and column reflect positions in the physical line structure, not the logical one.
+lines. Line and column coordinates, however, reflect positions in the physical
+line structure, not the logical one. When so requested, logical line endings are
+output as `newline` pseudo tokens and will be represented by a linefeed
+character. Similarly, when requested, continuations are output as
+`continuation` pseudo tokens and will be represented by a backslash-escaped
+linefeed `\\n`.

For instance, the appearance of a line continuation inside a string literal:

@@ -45,8 +51,8 @@ upon output as a token becomes:

```
"A long string literal that is broken here to stretch over two lines."
```

-Moreover, white-space, control characters and comments are skipped and
-anything left over is flagged as illegal characters.
+White-space (SPACE and TAB characters), certain control characters, and comments are
+normally skipped and anything left over is flagged as illegal characters.

Since Java at the lexical level is very close to C and C++, this tokenizer
can also be used for Java, albeit that some literal peculiarities are not
@@ -54,9 +60,7 @@ recognized.
The program looks at the file name extension to determine the language. This
can be overridden (and must be specified when using standard input) by
the `-l` option.
Depending on the language setting, the proper set of keywords will be
-recognized. For C and C++ their
-combined set of (95) keywords is recognized, assuming that a C program will not
-inadvertently use C++ keywords as regular identifiers.
+recognized.

## Program options

@@ -68,22 +72,24 @@ A tokenizer for C/C++ (and Java) source code with output in 6 formats.
Recognizes the following token classes: keyword, identifier, integer,
floating, string, character, operator, and preprocessor.

-usage: tokenize [ -1acdhjl:m:no:rsvw ] [ FILES ]
+usage: tokenize [ -1acdhjkl:m:nNo:rsvwW ] [ FILES ]

Command line options are:
-a : append to output file instead of create or overwrite.
-c : treat a # character as the start of a line comment.
-d : print debug info to stderr; implies -v.
-h : print just this text to stderr and stop.
--j : assume input is Java (deprecated: use -l Java or .java).
+-k : output line and block comments as tokens.
-l : specify language explicitly (C, C++, Java).
-m : output mode either plain (default), csv, json, jsonl, xml, or raw.
-n : output newlines as a special pseudo token.
+-N : output line continuations as a special pseudo token.
-o : write output to this file (instead of stdout).
-s : enable a special start token specifying the filename.
-1 : treat all filename arguments as a continuous single input.
-v : print action summary to stderr.
-w : suppress all warning messages.
+-W : output adjacent white-space as a token.
```

The program reads multiple files. Depending on the `-1` option, the files
@@ -95,7 +101,7 @@ the mode setting.

## Multiple output modes

The tokenizer has multiple output modes. They are plain text, CSV, JSON, JSONL,
-and XML. A sample of plain text output looks like this:
+XML, and raw. A sample of plain text output looks like this:

```text
( 62, 0) preprocessor: #
@@ -139,26 +145,52 @@ and XML. A sample of plain text output looks like this:
Line numbers are 1 based, columns start at 0 (Emacs-style).
The token classes are:

-| Class: | Description:
-|--------------|------------
-| identifier | any identifier
-| keyword | a reserved word
-| integer | integer number irrespective of notation
-| floating | a floating-point number
-| string | a double-quoted string (maybe empty)
-| character | a single-quoted character
-| operator | any operator or punctuator symbol
-| preprocessor | either # or ##
-| filename | pseudo token: start of a new file
-| newline | pseudo token: end of logical line
+| Class:        | Description:
+|---------------|------------
+| identifier    | any identifier
+| keyword       | a reserved word
+| integer       | integer number irrespective of notation
+| floating      | a floating-point number
+| string        | a double-quoted string (maybe empty)
+| character     | a single-quoted character
+| operator      | any operator or punctuator symbol
+| preprocessor  | either `#` or `##`
+
+The following classes are only recognized when the appropriate switch has been set:
+
+| Class:        | Description:                           | Switch:
+|---------------|----------------------------------------|---------
+| line_comment  | treat `#` till end of line as comment  | -c -k
+| line_comment  | a comment that starts with `//`        | -k
+| block_comment | a comment enclosed in `/*` and `*/`    | -k
+| filename      | pseudo token: start of a new file      | -s
+| newline       | pseudo token `\n`: end of logical line | -n
+| continuation  | pseudo token `\\n`: line continuation  | -N
+| whitespace    | adjacent white-space                   | -W

The `filename` token is optional. It will be included when the `-s` option is
provided. It is a pseudo token that provides the filename of the input as the
first token. Similarly, the `newline` is a pseudo token and appears only with
the `-n` option. It signals the end of a logical line. Mind that multiple
-newlines occurring in sequence are not suppressed. The `newline` token has no
-textual representation, e.g. in XML mode output it will appear as an empty
-text element.
+newlines occurring in sequence are not suppressed nor aggregated but appear as
+separate newline tokens (the same holds for continuations).
+The `newline` token will
+be represented by a linefeed character (LF). Depending on the output mode this
+will be escaped appropriately. The `-W` option would normally also collect any
+newlines (except when `-n` is also set) and continuations (except when `-N` is
+set), in which case they are treated as separate tokens.
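
These switches map directly onto globals in the shared tokenizer code
(`libtoken.c`, added later in this change): `newline_token`,
`continuation_token`, and `whitespace_token`. As a minimal sketch, assuming
`libtoken.h` declares these globals and `C_tokenize` as defined in
`libtoken.c`, the equivalent of `tokenize -W -n -N` as a library call is:

```c
/* Sketch: request whitespace, newline, and continuation pseudo-tokens
   programmatically, like `tokenize -W -n -N` (declarations assumed to
   be in libtoken.h). */
#include <stdio.h>
#include "libtoken.h"

int main(void)
{
  const char *token, *type;
  unsigned line, col, pos;

  whitespace_token   = 1; /* -W: whitespace tokens */
  newline_token      = 1; /* -n: newline pseudo-tokens */
  continuation_token = 1; /* -N: continuation pseudo-tokens */

  while (C_tokenize(&token, &type, &line, &col, &pos))
    printf("(%u,%u) %s\n", line, col, type);
  return 0;
}
```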
To summarize, the valid
+combinations of these options and their effect are:
+
+| Switches: | Effect on output:
+|-----------|------------------
+|           | all white-space, including line endings, discarded
+| -n        | newline tokens for logical lines
+| -N        | continuation tokens
+| -W        | whitespace tokens including all physical line endings
+| -n -N     | newline and continuation tokens
+| -W -n     | whitespace tokens and newline tokens separately
+| -W -N     | whitespace tokens and continuation tokens separately
+| -W -n -N  | whitespace, newline, and continuation all separately

### CSV output

@@ -177,7 +209,10 @@ line,column,class,token
```

The operator token `,` is escaped with double quotes, like so `","`.
-String tokens are escaped as well and any original double quote is doubled.
+String tokens are always escaped and any original double quote is doubled.
+A newline on its own or as part of whitespace will appear escaped as `\n`.
+The text of a whitespace token will appear inside double quotes. A continuation token
+will appear as `"\\n"`.

### JSON output

@@ -222,6 +257,18 @@ tokens.
(An alternative would be to use the CDATA construct.)
```

+## tokML
+
+Recently a new program has been added: `tokml`. As the name suggests, the
+output is in XML format, but unlike the `-mxml` option to `tokenize`, `tokml`
+outputs the original source code annotated with XML elements that supply the
+token information. This approach is identical to what `srcML` does for a
+parse tree. The precise XML syntax used is defined by the RelaxNG schema in
+the file `tokml-schema.rnc`.
+
+The XML annotation makes it very convenient to apply XPath and XQuery queries
+to the token stream, e.g. by using tools like `xidel` and `xmlstarlet`.
+
## References

> [1]
diff --git a/tools/tokenizer/antlr4tojson.c b/tools/tokenizer/antlr4tojson.c
index f2d2b9c..7cdb254 100644
--- a/tools/tokenizer/antlr4tojson.c
+++ b/tools/tokenizer/antlr4tojson.c
@@ -45,9 +45,9 @@
#include
#include
#include
-#include <unistd.h> /* getopt() */
-#include <libgen.h> /* basename() */
-#include <ctype.h> /* tolower() */
+#include <unistd.h> /* getopt() */
+#include <libgen.h> /* basename() */
+#include <ctype.h>  /* tolower() */

// POSIX Extended Regular Expressions for all parts of token output.
@@ -87,15 +87,15 @@
class_RE "),(channel=(" posint_RE "),)?(" line_RE "):(" column_RE ")\\]$"

// Program option settings:
-static int debug = 0; // when 1 debug output to stderr
+static int debug = 0;           // when 1 debug output to stderr
static int verbose = 0;         // when 1 info output to stderr
-static int nowarn = 0; // when 1 warnings are suppressed
-static int start_token = 0; // when 1 start filename pseudo-token
+static int nowarn = 0;          // when 1 warnings are suppressed
+static int start_token = 0;     // when 1 start filename pseudo-token
static int continuous_files = 0;// when 1 do not reset after each file

// Program globals:
static char *filename = "stdin";// current file being parsed
-static unsigned num_files = 0; // number of files read
+static unsigned num_files = 0;  // number of files read
static unsigned linenr = 1;     // line number counted from 1

static enum { CSV, JSON, JSONL, RAW } mode = JSON;
@@ -179,10 +179,10 @@ static void JSON_escape(FILE *out, const char *p, unsigned len)
      const char peek = len ? *(p+1) : anything_but_valid_escape; // look ahead
      fputc('\\', out);
      if (strchr("\\\"bfnrt", peek)) {
-       // An valid JSON escape. Output it and skip peek:
-       c = peek;
-       p++;
-       len--;
+       // A valid JSON escape. Output it and skip peek:
+       c = peek;
+       p++;
+       len--;
      }
      //else Not a correct JSON escape, a standalone backslash; double it.
} @@ -214,7 +214,7 @@ static unsigned get(char const *text) if (regexec(re, text, nmatch, pmatch, REG_NOTEOL) == REG_NOMATCH) { // Warn about the failed match: fprintf(stderr, "(W) [%s:%u] not a valid token; skipped.\n", - filename, linenr); + filename, linenr); // Cannot recover; no more input. return 0; } @@ -256,22 +256,22 @@ static unsigned get(char const *text) case CLASS_IDENT: // CSV output does not need the quoting. if (mode == JSON || mode == JSONL) - fputc('"', stdout); + fputc('"', stdout); // Undo the capitalization? fputc(tolower(*p), stdout); fwrite(p+1, 1, len-1, stdout); if (mode == JSON || mode == JSONL) - fputc('"', stdout); + fputc('"', stdout); break; case TEXT: // CSV output benefits from quoting; must escape the " fputc('"', stdout); // Strip off the enclosing single quotes. if (mode == JSON || mode == JSONL) - JSON_escape(stdout, p+1, len-2); + JSON_escape(stdout, p+1, len-2); else if (mode == CSV) - CSV_escape(stdout, p+1, len-2); + CSV_escape(stdout, p+1, len-2); fputc('"', stdout); break; case CLASS_STRING: @@ -281,10 +281,10 @@ static unsigned get(char const *text) // Keep the enclosing single quotes! fputc('"', stdout); if (mode == JSON || mode == JSONL) - JSON_escape(stdout, p, len); + JSON_escape(stdout, p, len); else if (mode == CSV) - CSV_escape(stdout, p, len); + CSV_escape(stdout, p, len); fputc('"', stdout); break; case CHANNEL: @@ -333,7 +333,7 @@ main(int argc, char *argv[]) case 'h': fputs( -"A converter for the ANTLR4 token output format.\n\n", stdout); +"A converter for the ANTLR4 token output format.\n\n", stderr); fprintf(stderr, usage_str, basename(argv[0])); fputs( "\nCommand line options are:\n" @@ -358,7 +358,7 @@ fputs( else if (!strcmp(optarg, "raw")) mode = RAW; else { - if (!nowarn) + if (!nowarn) fprintf(stderr, "(W): Invalid mode %s (using csv).\n", optarg); mode = CSV; } @@ -419,10 +419,10 @@ fputs( break; case CSV: if (!continuous_files || num_files == 1) - fputs("seqnr,start,stop,text,class,channel,line,column\n", stdout); + fputs("seqnr,start,stop,text,class,channel,line,column\n", stdout); else { - fputc('\n', stdout); - first_time = 1; + fputc('\n', stdout); + first_time = 1; } if (start_token) { fprintf(stdout, "0,0,0,%s,File,0,1,0\n", filename); @@ -431,20 +431,20 @@ fputs( case JSON: case JSONL: if (!continuous_files || num_files == 1) { - if (mode == JSON) fputs("[\n", stdout); + if (mode == JSON) fputs("[\n", stdout); } else { - if (mode == JSON) fputc(',', stdout); - fputc('\n', stdout); - first_time = 1; + if (mode == JSON) fputc(',', stdout); + fputc('\n', stdout); + first_time = 1; } if (start_token) { - // Must quote filename: + // Must quote filename: fprintf(stdout, - "{\"seqnr\":0, \"start\":0, \"stop\":0, \"text\":\"%s\"," - " \"class\":\"File\", \"line\":1, \"column\":0}", - filename); - first_time = 0; + "{\"seqnr\":0, \"start\":0, \"stop\":0, \"text\":\"%s\"," + " \"class\":\"File\", \"line\":1, \"column\":0}", + filename); + first_time = 0; } break; } @@ -452,19 +452,19 @@ fputs( while (getline(&line, &len, stdin) != -1) { // If already did some output must close that previous line: if (first_time) - first_time = 0; + first_time = 0; else { - switch (mode) { - case RAW: - break; - case JSON: - fputc(',', stdout); - /*FALL THROUGH*/ - case CSV: - case JSONL: - fputc('\n', stdout); - break; - } + switch (mode) { + case RAW: + break; + case JSON: + fputc(',', stdout); + /*FALL THROUGH*/ + case CSV: + case JSONL: + fputc('\n', stdout); + break; + } } get(line); // no , and/or \n output yet linenr++; @@ -476,15 +476,15 
+476,15 @@ fputs(
  // Trailer:
  switch (mode) {
  case RAW:
-   break;
+   break;
  case JSON:
-   // no last comma!
-   fputs("\n]", stdout);
-   /*FALL THROUGH*/
+   // no last comma!
+   fputs("\n]", stdout);
+   /*FALL THROUGH*/
  case CSV:
  case JSONL:
-   fputc('\n', stdout);
-   break;
+   fputc('\n', stdout);
+   break;
  }
  first_time = 1;
}
diff --git a/tools/tokenizer/filter6.awk b/tools/tokenizer/filter6.awk
new file mode 100755
index 0000000..3105a63
--- /dev/null
+++ b/tools/tokenizer/filter6.awk
@@ -0,0 +1,263 @@
+#!/usr/bin/awk -f
+
+# Copyright (c) 2021 International Business Machines Corporation
+# Prepared by: Geert Janssen
+
+# Expects a C/C++ tokenizer generated CSV file as input with explicit
+# whitespace and separate newline (and continuation) tokens.
+# (tokenize -W -n [-N] -mcsv)
+# Outputs one possibly modified token (class or literal) per line.
+# Tries to use some context to better discriminate the meaning of some
+# otherwise ambiguous tokens.
+
+# Should use yacc/bison or lemon?
+
+# Ambiguous tokens in C/C++:
+# < > delimiters of filename in preprocessor include directive
+#     Resolved by using preceding #include context
+# < > delimiters of template parameters
+# < less than operator
+#   Resolve: preceding context keyword template, template <
+# > greater than operator
+#   Resolve: preceding context keyword template <
+# " " delimiters of filename in preprocessor include directive
+# " " delimiters of string literal
+#     Resolved by using preceding #include context
+# ( ) expression grouping
+# ( ) argument list
+# { } block
+# { } initializer
+# [ ] indexing
+# [ ] lambda capture
+# ~ destructor
+# ~ unary operator
+# - unary operator
+# - binary operator
+#   Resolve: no white-space after - then unary?
+# * unary operator (dereference pointer)
+# * binary operator (multiplication)
+# * pointer declarator
+# & bitwise and operator
+# & address of operator
+# Can of worms: overloaded operator symbols
+
+# Simplistic CPP line syntax:
+# "#" directive-name (token)* newline
+
+# #include <...>
+# #include "local"
+# #define identifier-macro-def
+# #define identifier-macro-const val
+# #define identifier-macro-func( ... )
+
+# Using a stack to remember CSV token lines whose output is temporarily
+# suppressed. That way we can have unbounded lookahead.
+# Use function to empty and print stack from bottom to top.
+
+function push(record) {
+  stack[sp++]=record
+}
+
+function empty_out() {
+  for (i=0; i < sp; i++)
+    print stack[i]
+  sp=0
+}
+
+BEGIN {
+  FS=","
+  next_state=-1
+  split("include define undef if ifdef ifndef elif else endif line error pragma", d, " ")
+  for (i in d) directive[d[i]]=1
+}
+
+# A # preprocessor token at state 0 starts a directive.
+(state == 0 && $3 == "preprocessor" && $4 == "#") {
+  push($0)
+  next_state=1
+}
+
+# Seen template keyword => template <...> disambiguation.
+(state == 0 && $4 == "template") {
+  print $0
+  next_state=0 # switched off for now
+}
+
+# # seen; expect directive or identifier.
+(state == 1 && $3 == "identifier") {
+  push($0)
+  if ($4 in directive) {
+    if ($4 == "include")
+      next_state=2
+    else
+    if ($4 == "define")
+      next_state=7
+    else {
+      empty_out()
+      next_state=0
+    }
+  }
+  else { # #ident => stringize to "ident"
+    empty_out()
+    next_state=0
+  }
+}
+
+# Handle #include <...
+(state == 2 && $4 == "<") {
+  # Note: suppressing this token.
+  next_state=3
+}
+
+# Handle #include "...".
+(state == 2 && $3 == "string") {
+  # $4 has enclosing " doubled!
+  filename=substr($4,3,length($4)-4)
+  empty_out()
+  print $1 "," $2 ",string-local-filename," filename
+  next_state=0
+}
+
+# Collect all tokens after the < till >.
+# Treat first specially to get its coordinates.
+(state == 3 && ($3 == "identifier" || $3 == "keyword")) {
+  id_lin=$1
+  id_col=$2
+  filename=$4
+  # Note: modifying this token.
+  next_state=4
+}
+
+# Keep collecting tokens till > or newline.
+(state == 4 && $3 != "newline" && $4 != ">") { # eats up anything
+  filename=filename $4
+  # Note: suppressing this token.
+  next_state=4
+}
+
+# Handling #include <...>, or #include <...newline.
+(state == 4 && ($3 == "newline" || $4 == ">")) {
+  # When newline it's an error, but act as if > was present:
+  empty_out()
+  print id_lin "," id_col ",string-sys-filename,\"" filename "\""
+  if ($3 == "newline")
+    print $0
+  # else suppressing the > token.
+  next_state=0
+}
+
+# Handle template <.
+(state == 5 && $4 == "<") {
+  $3="start-template-paramlist"
+  print $0
+  next_state=6
+}
+
+# Handle template < >, explicit specialization.
+(state == 6 && $4 == ">") {
+  $3="end-template-paramlist"
+  print $0
+  next_state=0
+}
+
+# Handle #define name.
+(state == 7 && ($3 == "identifier" || $3 == "keyword")) {
+  id_lin=$1
+  id_col=$2
+  macro_name=$4
+  # Note: modifying this token later.
+  next_state=8
+}
+
+# Handle #define name(.
+(state == 8 && $4 == "(") {
+  empty_out()
+  print id_lin "," id_col ",identifier-macro-func," macro_name
+  print $0
+  next_state=0
+}
+
+# Handle #define name whitespace
+(state == 8 && $3 == "whitespace") {
+  # Note: suppressing this token.
+  next_state=9
+}
+
+# Handle #define name whitespace? newline
+((state == 8 || state == 9) && $3 == "newline") {
+  empty_out()
+  print id_lin "," id_col ",identifier-macro-def," macro_name
+  print $0
+  next_state=0
+}
+
+# Handle #define name whitespace !newline.
+(state == 9 && $3 != "newline") {
+  empty_out()
+  print id_lin "," id_col ",identifier-macro-const," macro_name
+  print $0
+  next_state=0
+}
+
+# Default rule; always executed:
+# 1. no prior rule matched:
+#    - stay in same state only for whitespace, newline, and continuation;
+#      this allows for their presence without explicit mention in rules
+#    - output any previously suppressed tokens (to not lose them)
+#    - print current token except for whitespace
+#    - back to state 0 to quickly recover for any errors in input
+# 2. some rule matched:
+#    - simply move on to next state as stated in that rule
+#    - reset next_state to -1
+{
+  if (next_state == -1) {
+    # Echo the current token as is (ignore whitespace though):
+    if ($3 != "whitespace") {
+      if ($3 != "newline" && $3 != "continuation") {
+        empty_out()
+        state=0
+      }
+      print $0
+    }
+    # otherwise: Do not change state!
+  }
+  else {
+    state=next_state
+    next_state=-1
+  }
+}
+
+END {}
diff --git a/tools/tokenizer/jstokenize.c b/tools/tokenizer/jstokenize.c
index c935837..e9604a8 100644
--- a/tools/tokenizer/jstokenize.c
+++ b/tools/tokenizer/jstokenize.c
@@ -108,20 +108,20 @@ static int tokenize(char *token, const char **type,
      // Skip till end-of-line (\n exclusive):
      while ((cc = get()) != EOF && cc != '\n' && cc != '\r')
        ;
-     // cc == '\n' || cc == '\r' || cc == EOF
-     if (cc == '\r') {
-       if (!nowarn)
-         fprintf(stderr,
-                 "(W): Unexpected continuation in line comment.\n");
-       // Effectively ignore any \ and terminate logical line:
-       cc == '\n';
-     }
+     // cc == '\n' || cc == '\r' || cc == EOF
+     if (cc == '\r') {
+       if (!nowarn)
+         fprintf(stderr,
+                 "(W): Unexpected continuation in line comment.\n");
+       // Effectively ignore any \ and terminate logical line:
+       cc = '\n';
+     }
      goto restart;
    }

    if (cc == '*') {
-     // Remember start position:
-     unsigned lin = linenr;
+     // Remember start position:
+     unsigned lin = linenr;
      // Skip till */ inclusive:
      int nc = get(); // if EOF next get will be EOF too
@@ -130,9 +130,9 @@
        nc = get();
        if (nc == EOF) { // Error!
fprintf(stderr,
-               "(E): [%s:%u] Unexpected end-of-file in /* comment.\n",
-               filename, lin);
-         unexpect_eof++;
+               "(E): [%s:%u] Unexpected end-of-file in /* comment.\n",
+               filename, lin);
+         unexpect_eof++;
          return 0;
        }
      } while (cc != '*' || nc != '/');
@@ -153,13 +153,13 @@ static int tokenize(char *token, const char **type,
      // Skip till end-of-line (\n exclusive):
      while ((cc = get()) != EOF && cc != '\n' && cc != '\r')
        ;
-     if (cc == '\r') {
-       if (!nowarn)
-         fprintf(stderr,
-                 "(W): Unexpected continuation in hashbang comment.\n");
-       // Effectively ignore any \ and terminate logical line:
-       cc == '\n';
-     }
+     if (cc == '\r') {
+       if (!nowarn)
+         fprintf(stderr,
+                 "(W): Unexpected continuation in hashbang comment.\n");
+       // Effectively ignore any \ and terminate logical line:
+       cc = '\n';
+     }
      goto restart;
    }
    // seen # but not #!
@@ -201,37 +201,37 @@ static int tokenize(char *token, const char **type,
      int pc;
      do {
        token_add(cc);
-       pc = cc;
-       cc = get();
-       if (cc == '\r') {
-         if (!nowarn)
-           fprintf(stderr,
-                   "(W): Unexpected continuation in regex literal.\n");
-         // Effectively ignore:
-         cc = get();
-       }
-
-       if (cc == '\n') {
+       pc = cc;
+       cc = get();
+       if (cc == '\r') {
+         if (!nowarn)
+           fprintf(stderr,
+                   "(W): Unexpected continuation in regex literal.\n");
+         // Effectively ignore:
+         cc = get();
+       }
+
+       if (cc == '\n') {
          if (!nowarn)
-           fprintf(stderr,
-                   "(W): Unexpected newline in regular expression literal.\n");
-         // discard:
-         cc = get();
-       }
+           fprintf(stderr,
+                   "(W): Unexpected newline in regular expression literal.\n");
+         // discard:
+         cc = get();
+       }

-       if (cc == EOF) {
+       if (cc == EOF) {
          if (!nowarn)
-           fprintf(stderr,
-                   "(W): Unexpected EOF in regular expression literal.\n");
+           fprintf(stderr,
+                   "(W): Unexpected EOF in regular expression literal.\n");
          unexpect_eof++;
-         break;
-       }
+         break;
+       }
      } while (cc != '/' || pc == '\\');
      token_add(cc); // the /
      cc = get();
      while (strchr("gimsuy", cc)) {
        token_add(cc);
-       cc = get();
+       cc = get();
      }
      unget(cc);
      *type = "regex";
@@ -259,15 +259,15 @@ static int tokenize(char *token, const char **type,
      int nesting = 0; // keep track of ${} nesting
      do {
        token_add(cc);
-       // For template can have nesting inside placeholder ${...}
-       // FIXME: no check for nested paired ``; same for {}
-       if (qc == '`') {
-         if (pc == '$' && cc == '{')
-           nesting++;
-         else
-         if (cc == '}')
-           nesting--;
-       }
+       // For template can have nesting inside placeholder ${...}
+       // FIXME: no check for nested paired ``; same for {}
+       if (qc == '`') {
+         if (pc == '$' && cc == '{')
+           nesting++;
+         else
+         if (cc == '}')
+           nesting--;
+       }
        // Assume \ is not escaped itself.
if (pc != '\\' && cc == qc && !nesting) { // unescaped quote @@ -283,16 +283,16 @@ static int tokenize(char *token, const char **type, if (cc == '\n' && qc != '`') { // Ok in template if (!nowarn) - fprintf(stderr, - "(W): Unexpected unescaped newline in string.\n"); + fprintf(stderr, + "(W): Unexpected unescaped newline in string.\n"); // discard cc = get(); } if (cc == EOF) { if (!nowarn) - fprintf(stderr, - "(W): Unexpected EOF in string/template.\n"); + fprintf(stderr, + "(W): Unexpected EOF in string/template.\n"); unexpect_eof++; break; } @@ -312,11 +312,11 @@ static int tokenize(char *token, const char **type, unget(cc); token[len] = '\0'; if (is_keyword(token, keywords, num_keywords)) { - *type = "keyword"; - regex_ok = !!is_keyword(token, regex_preceders, num_preceders); + *type = "keyword"; + regex_ok = !!is_keyword(token, regex_preceders, num_preceders); } else - *type = "identifier"; + *type = "identifier"; break; } @@ -340,16 +340,16 @@ static int tokenize(char *token, const char **type, } int_lit = DEC; // assume decimal number /* BIN: 0[bB][01](_?[01])* - LEGACY_OCT: 0[0-7]+ - OCT: 0[oO][0-7](_?[0-7])* - DEC: 0|[1-9](_?[0-9])* - HEX: 0[xX][0-9a-fA-F](_?[0-9a-fA-F])* + LEGACY_OCT: 0[0-7]+ + OCT: 0[oO][0-7](_?[0-7])* + DEC: 0|[1-9](_?[0-9])* + HEX: 0[xX][0-9a-fA-F](_?[0-9a-fA-F])* - EXP: [eE][+-]?[0-9](_?[0-9])* + EXP: [eE][+-]?[0-9](_?[0-9])* - FLOATING: .[0-9][_0-9]*EXP? - | DEC.([0-9][_0-9]*)?EXP? - | DEC EXP + FLOATING: .[0-9][_0-9]*EXP? + | DEC.([0-9][_0-9]*)?EXP? + | DEC EXP */ if (cc == '0') { @@ -368,14 +368,14 @@ static int tokenize(char *token, const char **type, int_lit = HEX; break; default: - if ('0' <= nc && nc <= '7') { - token_add(cc); // the 0 - int_lit = LEGACY_OCT; - } - else { - unget(nc); - nc = cc; - } + if ('0' <= nc && nc <= '7') { + token_add(cc); // the 0 + int_lit = LEGACY_OCT; + } + else { + unget(nc); + nc = cc; + } break; } cc = nc; @@ -454,9 +454,9 @@ static int tokenize(char *token, const char **type, } if (cc == 'n') // BigInt - token_add(cc); + token_add(cc); else - unget(cc); + unget(cc); *type = "integer"; break; @@ -492,28 +492,28 @@ static int tokenize(char *token, const char **type, if (strchr("*+-<>&|?.=", cc) && c2 == cc) { // double or triple // ** ++ -- << >> && || ?? .. == - // special case ++ and -- - if (c2 == '+' || c2 == '-') { + // special case ++ and -- + if (c2 == '+' || c2 == '-') { token_add(c2); *type = "operator"; break; - } + } // ** << >> && || ?? .. == int c3 = get(); - // special case . and ... + // special case . and ... if (c2 == '.') { if (c3 == '.') { // ... token_add(c2); token_add(c3); } - else { - // ..x - unget(c3); - unget(c2); - } + else { + // ..x + unget(c3); + unget(c2); + } // . *type = "operator"; break; @@ -530,18 +530,18 @@ static int tokenize(char *token, const char **type, // ** << >> && || ?? == - if (c2 == '>' && c3 == c2) { - // >>> - int c4 = get(); + if (c2 == '>' && c3 == c2) { + // >>> + int c4 = get(); token_add(c3); - if (c4 == '=') - // >>>= - token_add(c4); - else - unget(c4); - } - else - unget(c3); + if (c4 == '=') + // >>>= + token_add(c4); + else + unget(c4); + } + else + unget(c3); // ** << >> && || ?? == *type = "operator"; @@ -552,7 +552,7 @@ static int tokenize(char *token, const char **type, // also missing => ?. !== <= >= == != += -= *= %= &= |= ^= /= if (cc == '?' && c2 == '.' || - cc == '=' && c2 == '>') { + cc == '=' && c2 == '>') { // ?. 
=> token_add(c2);
      *type = "operator";
      break;
    }
@@ -562,20 +562,20 @@ static int tokenize(char *token, const char **type,
    // still missing !== <= >= == != += -= *= %= &= |= ^= /=
    if (c2 == '=') {
-     // <= >= == != += -= *= %= &= |= ^= /=
-     token_add(c2);
-     if (cc == '!') {
-       // !=
-       int c3 = get();
-       if (c3 == '=')
-         // !==
-         token_add(c3);
-       else
-         unget(c3);
-     }
+     // <= >= == != += -= *= %= &= |= ^= /=
+     token_add(c2);
+     if (cc == '!') {
+       // !=
+       int c3 = get();
+       if (c3 == '=')
+         // !==
+         token_add(c3);
+       else
+         unget(c3);
+     }
    }
    else
-     unget(c2);
+     unget(c2);
    *type = "operator";
    break;
  }
@@ -711,7 +711,7 @@ int main(int argc, char *argv[])
fputs(
"A tokenizer for JavaScript source code with output in 6 formats.\n"
"Recognizes the following token classes: keyword, identifier, integer,\n"
-"floating, string, regex, and operator.\n\n", stdout);
+"floating, string, regex, and operator.\n\n", stderr);
fprintf(stderr, usage_str, basename(argv[0]));
fputs(
"\nCommand line options are:\n"
diff --git a/tools/tokenizer/libtoken.c b/tools/tokenizer/libtoken.c
new file mode 100644
index 0000000..2fa4563
--- /dev/null
+++ b/tools/tokenizer/libtoken.c
@@ -0,0 +1,1282 @@
+/* Copyright (c) 2021, 2022 International Business Machines Corporation
+   Prepared by: Geert Janssen
+
+   Code functionality shared by all tokenizers.
+   This obviously avoids code duplication and associated maintenance problems.
+*/
+
+#include "libtoken.h"
+
+// Program globals:
+const char *filename = "stdin"; // current file being parsed
+unsigned linenr = 1;            // physical line number counted from 1
+unsigned column = 0;            // byte position in physical line, from 0
+unsigned char_count = 0;        // total byte count
+unsigned utf8_count = 0;        // total utf-8 encoded unicode codepoints
+
+int buffer[MAX_BUF];            // use buffer as multi-char lookahead.
+unsigned buffered = 0;          // number of buffered bytes
+unsigned saved_col = 0;         // one-place buf for last column on prev line
+
+// Program option settings:
+int debug = 0;                  // when 1 debug output to stderr
+int verbose = 0;                // when 1 info output to stderr
+int nowarn = 0;                 // when 1 warnings are suppressed
+
+unsigned illegals = 0;          // count number of illegal characters
+unsigned unexpect_eof = 0;      // encountered unexpected EOF
+int hash_as_comment = 0;        // when 1 treat # as line comment
+int newline_token = 0;          // when 1 output newline pseudo-token
+int comment_token = 0;          // when 1 output comments as tokens
+int whitespace_token = 0;       // when 1 output adjacent white-space as a token
+int continuation_token = 0;     // when 1 output line continuation pseudo-token
+
+static int logical_lines = 0;   // when 1 ignore line continuations in get()
+
+// Must be synced with enum TokenClass!
+const char *token_class[] = {
+  /* 0*/ "identifier",
+  /* 1*/ "keyword",
+  /* 2*/ "string",
+  /* 3*/ "character",
+  /* 4*/ "integer",
+  /* 5*/ "floating",
+  /* 6*/ "operator",
+  /* 7*/ "preprocessor",
+  /* 8*/ "line_comment",
+  /* 9*/ "block_comment",
+  /*10*/ "whitespace",
+  /*11*/ "newline",
+  /*12*/ "continuation",
+  /*13*/ "filename",
+  /*14*/ "endoffile"
+};
+
+/* No longer using perfect hash function but simple binary search.
*/ + +/* C11 n1570.pdf 6.4.1 (44) + C17 n2176.pdf 6.4.1 (A.1.2) (44) +*/ +static const char *C_keywords[] = { + "_Alignas", "_Alignof", "_Atomic", "_Bool", "_Complex", + "_Generic", "_Imaginary", "_Noreturn", "_Static_assert", + "_Thread_local", + + "auto", "break", "case", "char", "const", + "continue", "default", "do", "double", "else", + "enum", "extern", "float", "for", "goto", + "if", "inline", "int", "long", "register", + "restrict", "return", "short", "signed", "sizeof", + "static", "struct", "switch", "typedef", "union", + "unsigned", "void", "volatile", "while" +}; + +#if 0 +/* C++ 2014 n4296.pdf 2.11 (84) */ +static const char *CPP_keywords[] = { + "alignas", "alignof", "and", "and_eq", "asm", + "auto", "bitand", "bitor", "bool", "break", + "case", "catch", "char", "char16_t", "char32_t", + "class", "compl", "const", "const_cast", "constexpr", + "continue", "decltype", "default", "delete", "do", + "double", "dynamic_cast", "else", "enum", "explicit", + "export", "extern", "false", "float", "for", + "friend", "goto", "if", "inline", "int", + "long", "mutable", "namespace", "new", "noexcept", + "not", "not_eq", "nullptr", "operator", "or", + "or_eq", "private", "protected", "public", "register", + "reinterpret_cast", "return", "short", "signed", "sizeof", + "static", "static_assert", "static_cast", "struct", "switch", + "template", "this", "thread_local", "throw", "true", + "try", "typedef", "typeid", "typename", "union", + "unsigned", "using", "virtual", "void", "volatile", + "wchar_t", "while", "xor", "xor_eq" +}; +#endif + +/* C++23 n4885.pdf 5.11 (92) */ +static const char *CPP_keywords[] = { + "alignas", "alignof", "and", "and_eq", "asm", + "auto", "bitand", "bitor", "bool", "break", + "case", "catch", "char", "char16_t", "char32_t", + "char8_t", "class", "co_await", "co_return", "co_yield", + "compl", "concept", "const", "const_cast", "consteval", + "constexpr", "constinit", "continue", "decltype", "default", + "delete", "do", "double", "dynamic_cast", "else", + "enum", "explicit", "export", "extern", "false", + "float", "for", "friend", "goto", "if", + "inline", "int", "long", "mutable", "namespace", + "new", "noexcept", "not", "not_eq", "nullptr", + "operator", "or", "or_eq", "private", "protected", + "public", "register", "reinterpret_cast", "requires","return", + "short", "signed", "sizeof", "static", "static_assert", + "static_cast", "struct", "switch", "template", "this", + "thread_local", "throw", "true", "try", "typedef", + "typeid", "typename", "union", "unsigned", "using", + "virtual", "void", "volatile", "wchar_t", "while", + "xor", "xor_eq" +}; + +/* Java SE 8 (50) (false, true, null are literals) */ +/* https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.9 */ +static const char *Java_keywords[] = { + "abstract", "assert", "boolean", "break", "byte", "case", + "catch", "char", "class", "const", "continue", "default", + "do", "double", "else", "enum", "extends", "final", + "finally", "float", "for", "goto", "if", "implements", + "import", "instanceof", "int", "interface", "long", "native", + "new", "package", "private", "protected", "public", "return", + "short", "static", "strictfp","super", "switch", "synchronized", + "this", "throw", "throws", "transient", "try", "void", + "volatile", "while" +}; + +static const char *Python_keywords[] = { + "False", "None", "True", "and", "as", "assert", "async", + "await", "break", "class", "continue", "def", "del", "elif", + "else", "except", "finally", "for", "from", "global", "if", + "import", "in", "is", 
"lambda", "nonlocal", "not", "or", + "pass", "raise", "return", "try", "while", "with", "yield" +}; + +/* Includes future reserved keywords, strict mode reserved words and module + code reserved words, as well as all the older standards future reserved + words, and the literals null, false, and true. +*/ +static const char *JavaScript_keywords[] = { + "abstract", "await", "boolean", "break", "byte", + "case", "catch", "char", "class", "const", + "continue", "debugger", "default", "delete", "do", + "double", "else", "enum", "export", "extends", + "false", "final", "finally", "float", "for", + "function", "goto", "if", "implements", "import", + "in", "instanceof", "int", "interface", "let", + "long", "native", "new", "null", "package", + "private", "protected", "public", "return", "short", + "static", "super", "switch", "synchronized", "this", + "throw", "throws", "transient", "true", "try", + "typeof", "var", "void", "volatile", "while", + "with", "yield" +}; + +#define num_keywords(lang) sizeof(lang##_keywords)/sizeof(lang##_keywords[0]); + +/* Generic binary search lookup in some keyword table. + `word' to be searched must be NUL-terminated C string. + `table' is array of const char * of `size' sorted alphabetically. + Returns word found (i.e., pointer value in table) or 0. +*/ +#define lang_is_keyword(lang) \ + static const char *lang##_is_keyword(const char *word) \ + { \ + int i = 0, j = num_keywords(lang); \ + while (i < j) { \ + int k = (i + j) >> 1 /* / 2 */; \ + const char *kw = lang##_keywords[k]; \ + int cmp = strcmp(word, kw); \ + if (!cmp) \ + return kw; \ + if (cmp < 0) j = k; else i = k + 1; \ + } \ + return 0; \ + } + +/* Define individual is_keyword functions per language: */ +/* C_is_keyword */ +lang_is_keyword(C) +/* CPP_is_keyword */ +lang_is_keyword(CPP) +/* Java_is_keyword */ +lang_is_keyword(Java) +/* Python_is_keyword */ +lang_is_keyword(Python) +/* JavaScript_is_keyword */ +lang_is_keyword(JavaScript) + +const char *(*is_keyword)(const char *) = C_is_keyword; + +/* Conversion table from filename extension to language code. + To find language code, consider all entries and check each ext + against filename; matched language is langs[i].lang. + Invariant: langs[X].lang == X for every Language value. + String representation of language code is langs[X].name. + + Have certain config settings depend on the language. + Use 2 step: + 1. determine language from name/extension + 2. look up language config +*/ +static const struct { + const char *ext; + Language lang; + const char *name; +} + langs[] = { + { ".c", C, "C" }, + { ".cpp", CPP, "C++" }, + { ".java", JAVA, "Java" }, + { ".js", JAVASCRIPT, "JavaScript" }, + { ".py", PYTHON, "Python" }, + + // Alternatives: + { ".h", C, "" }, + { ".C", CPP, "" }, + { ".cc", CPP, "" }, + { ".hh", CPP, "" }, +}; + +const char *lang_name(Language lang) +{ + return langs[lang].name; +} + +static const struct { + //Language lang; implicit + const char *(*is_keyword)(const char *); +} + lang_configs[] = { + { C_is_keyword, }, + { CPP_is_keyword, }, + { Java_is_keyword, }, + { JavaScript_is_keyword, }, + { Python_is_keyword, }, +}; + +/* Must be called right after a file is opened as stdin. + Will attempt to remove any UTF-8 unicode signature (byte-order mark, BOM) + at the beginning of the file. + Unicode: U+FEFF + UTF-8: EF BB BF + + First bytes Encoding Must remove? 
+ 00 00 FE FF UTF-32 big endian Yes + FF FE 00 00 UTF-32 little endian Yes + FE FF UTF-16 big endian Yes + FF FE UTF-16 little endian Yes + 00 00 00 xx UTF-32 big endian No + xx 00 00 00 UTF-32 little endian No + 00 xx UTF-16 big endian No + xx 00 UTF-16 little endian No + otherwise UTF-8 No +*/ +static void remove_BOM(void) +{ + int c1 = getchar(); + if (c1 == 0xEF) { + int c2 = getchar(); + if (c2 == 0xBB) { + int c3 = getchar(); + if (c3 == 0xBF) { + return; + } + if (c3 != EOF) buffer[buffered++] = c3; + } + if (c2 != EOF) buffer[buffered++] = c2; + } + if (c1 != EOF) buffer[buffered++] = c1; +} + +int open_as_stdin(const char *file) +{ + filename = file; + if (!freopen(filename, "r", stdin)) { + if (!nowarn) + fprintf(stderr, "(W): Cannot read file %s.\n", filename); + return -1; + } + return set_or_detect_lang(0); +} + +/* Deal with DOS (\r \n) and classic Mac OS (\r) (physical) line endings. + In case of CR LF skip (but count) the CR and return LF. + In case of CR not followed by LF turns the CR into LF and returns that. + All other chars are returned as is. + Note: never returns a CR (\r). Line/column counts are not affected here. +*/ +static int normalize_newline(void) +{ + /* No need to recognize Unicode code points here. */ + int cc = getchar(); + + if (cc == '\r') { + // Maybe \r \n (CR NL) combination? + int nc = getchar(); + if (nc == '\n') { + char_count++; // counts the carriage return + utf8_count++; + // No use incrementing column. + return nc; // return \n; effectively skipping the \r + } + // Mind nc not \n. ungetc(EOF) is Okay. + ungetc(nc, stdin); + // cc == '\r'; consider a newline as well, so turn into \n: + cc = '\n'; + } + return cc; +} + +/* Detects escaped newlines (line continuations) and signals them with the + special '\r' character (that otherwise is not used). + Keeps track of physical coordinates and absolute location for each character. +*/ +int get(void) +{ + int cc; + + restart: + // Get the next character: + if (buffered) { // chars available in lookahead buffer + cc = buffer[--buffered]; // never EOF + char_count++; + // cc maybe '\r' (line continuation); act like '\n': + if (cc == '\n' || cc == '\r') { + linenr++; + saved_col = column; + column = 0; + return cc; + } + column++; + return cc; + } + + // Read a fresh char: + cc = normalize_newline(); // cc != '\r' + if (cc == EOF) return EOF; + char_count++; + if (utf8_start(cc)) utf8_count++; + + if (cc == '\n') { // a normalized end-of-line (\r|\r?\n) + linenr++; + saved_col = column; + column = 0; + return cc; // \n here signals a logical end-of-line + } + + // Deal with explicit \ line continuations! + if (cc == '\\') { + // Must look ahead (never maintained across get calls!): + int nc = normalize_newline(); // cc != '\r' + if (nc == '\n') { + char_count++; // counts the newline + utf8_count++; + linenr++; // on next physical line + saved_col = column+1; // +1 for backslash + column = 0; + + if (logical_lines) + // Still need to get a character. + // Could again start a line continuation! + goto restart; + + // Signal that this was an escaped newline (= line continuation): + return '\r'; + } + // Mind nc not \n. ungetc(EOF) is Okay. + ungetc(nc, stdin); + // cc == '\\' a regular backslash + } + column++; + return cc; +} + +/* Undo action of a get() lookahead call. + An attempt at undoing an EOF read has no effect. + Since get() encodes logical line endings with \n and continuation + line endings with \r, both could be subject to an unget(). 
+*/ +void unget(int cc) +{ + if (cc == EOF) return; + if (buffered < MAX_BUF) { + if (cc == '\n' || cc == '\r') { + linenr--; + // column was 0 right after getting the \n + // hopefully there are no multiple ungets of \n + column = saved_col; + } + else + column--; + char_count--; + buffer[buffered++] = cc; + } + else { + fprintf(stderr, "(F): Lookahead buffer overflow (MAX=%u).\n", MAX_BUF); + exit(2); + } +} + +/* Either set this file's input language explicitly via a string or + use the filename extension to determine the language. + If neither works out, use the default language C. + Uses global filename (maybe stdin). + Once the language is known, configs for that language are applied, + e.g. the correct keyword table to use. +*/ +Language set_or_detect_lang(const char *source) +{ + int i; + Language lang = C; // default language + + if (source) { + /* Check if explicit language is known: */ + for (i = 0; i < sizeof(langs)/sizeof(langs[0]); i++) + if (!strcmp(source, langs[i].name)) { + lang = langs[i].lang; + goto done; + } + fprintf(stderr, "(E): No support for language `%s'.\n", source); + } + + char *p; + if (p = strrchr(filename, '.')) { + for (i = 0; i < sizeof(langs)/sizeof(langs[0]); i++) + if (!strcmp(p, langs[i].ext)) { + lang = langs[i].lang; + goto done; + } + fprintf(stderr, "(E): Unknown filename extension `%s'.\n", p); + } + if (!nowarn) + fprintf(stderr, "(W): Assuming default language C.\n"); + + done: + is_keyword = lang_configs[lang].is_keyword; + return lang; +} + +// Dynamically sized token buffer: +static char *token_buf = 0; +static unsigned token_alloc = 0; +static unsigned token_len = 0; + +// Makes sure there is room in the token buffer. +static void token_buf_room(void) +{ + if (token_len == token_alloc) { // all space used up + if (!token_alloc) { // first time allocation + token_alloc = 65536; + if (!(token_buf = malloc(token_alloc))) { + fprintf(stderr, "(F): Allocation of token buffer failed.\n"); + exit(4); + } + token_buf[0] = '\0'; // for safety + return; + } + + token_alloc <<= 1; + if (!(token_buf = realloc(token_buf, token_alloc))) { + fprintf(stderr, "(F): Reallocation of token buffer failed.\n"); + exit(4); + } + //fprintf(stderr, "Realloc-ed token buf.\n"); + } +} + +// Appends a character to the token buffer, always making sure there is room. +static void token_buf_push(int cc) +{ + token_buf_room(); + // There is room: token_len < token_alloc + token_buf[token_len++] = cc; +} + +// Undoes the push action but only if there is some content. +static int token_buf_pop(void) +{ + return token_len ? token_buf[--token_len] : 0; +} + +// Adds a terminating NUL character which does not change the token length. +static void token_buf_close(void) +{ + token_buf_room(); + token_buf[token_len] = '\0'; // Note: no advance +} + +// Resets the token buffer cursor. +static void token_buf_reset(void) +{ + token_len = 0; +} + +/* Tokenization of C++ programming language source text. + Recognizes: + - identifier + - reserved word/keyword + - binary, octal, decimal, hexadecimal and floating-point numbers + - double-quoted string literal + - single-quoted character literal + - all single, double, and triple operator and punctuation symbols + - the preprocessor tokens # and ## + Optionally: + - filename start_token + - line_comment comment_token + - block_comment comment_token + - newline newline_token + - continuation continuation_token + - whitespace whitespace_token + + Normally skips white-space and comments and flags anything + left over as illegal characters. 
+
+   (Approximately 20 tests per single character worst-case.)
+
+   Returns 0 upon EOF else the token length in bytes.
+   (There are no 0-length tokens!)
+   EOF may be interpreted as a token. The function then returns:
+   token = "", type = endoffile, line and col correctly defined.
+
+   An unexpected EOF in the middle of a token will cause an error message
+   and the partial token to be output first before a next call returns 0
+   (to indicate the EOF condition).
+*/
+
+unsigned C_tokenize_int(const char **token, enum TokenClass *type,
+                        unsigned *line, unsigned *col, unsigned *pos)
+{
+  int cc;
+  *type = ENDOFFILE;
+
+  do { // infinite loop; after token recognized breaks out.
+    // Start collecting a token.
+    token_buf_reset();
+    *line = linenr;
+    *col = column;
+    *pos = char_count;
+    // white-space tokens see continuation lines:
+    logical_lines = 0;
+    cc = get();
+
+  restart:
+    // cc already read; coordinates for it are correct.
+
+    /*** WHITE-SPACE ***/
+
+    /* In principle all consecutive white-space including \n and \r (and some
+       other control chars) are collected and form a single whitespace token.
+       However, when newlines are requested to be reported as separate tokens,
+       they break this pattern. Note that we cannot issue multiple tokens
+       in a single call to this function.
+
+       Token buf will only hold some white-space chars when implicitly
+       requested via whitespace_token; otherwise it stays empty.
+       Same for the \n and \r requests.
+    */
+
+    if (cc == '\n' && newline_token) { // end of a logical line
+      // Here we assume the buf is empty.
+      token_buf_push(cc);
+      *type = NEWLINE;
+      break;
+    }
+
+    if (cc == '\r' && continuation_token) { // end of a physical line
+      // Here we assume the buf is empty.
+      token_buf_push('\\');
+      token_buf_push('\n');
+      *type = CONTINUATION;
+      break;
+    }
+
+    // Aggregate as much white-space as possible.
+    // FIXME: officially a NUL should be considered white-space.
+    while (isspace(cc)) { // i.e., cc in [ \f\n\r\t\v]
+      // Here: !newline_token (!continuation_token)
+      if (whitespace_token)
+        if (cc == '\r') { // line continuation
+          // Convert back to original char sequence:
+          token_buf_push('\\');
+          token_buf_push('\n');
+        }
+        else
+          token_buf_push(cc); // perhaps \n
+      //else: white-space is discarded
+
+      // Here: whitespace_token implies token_len > 0
+
+      cc = get();
+      if (cc == '\n' && newline_token ||
+          cc == '\r' && continuation_token) {
+        // Must issue whitespace token if so requested.
+        if (whitespace_token) {
+          // Undo lookahead (unget(EOF) has no effect!):
+          unget(cc); // next token will be newline/continuation
+          *type = WHITESPACE;
+          token_buf_close();
+          *token = token_buf;
+          return token_len;
+        }
+        // Issue newline/continuation token right away:
+        goto restart;
+      }
+    }
+    // Here: !isspace: must break or start real token.
+
+    if (whitespace_token && token_len) {
+      // Undo lookahead (unget(EOF) has no effect!):
+      unget(cc);
+      *type = WHITESPACE;
+      break;
+    }
+
+    if (cc == EOF) {
+      token_buf_reset();
+      break;
+    }
+
+    // Rest of tokens treat line continuations as non-existent:
+    logical_lines = 1;
+
+    // If white-space skipped must reset coordinates:
+    *line = linenr;
+    *col = column-1;
+    *pos = char_count-1;
+
+    /*** OPTIONAL # LINE COMMENT (to ignore preprocessor statements) ***/
+    // Java: no preprocessor directives.
+
+    // NULs (like many other chars) in comments are silently ignored!
+ + if (cc == '#' && hash_as_comment) { + if (comment_token) + token_buf_push(cc); + // Skip till end-of-line (\n exclusive): + while ((cc = get()) != '\n' && cc != EOF) + if (comment_token) + token_buf_push(cc); + // cc == '\n' || cc == EOF + // Don't consider \n part of comment. + if (comment_token) { + // Undo lookahead (unget(EOF) has no effect!): + unget(cc); + *type = LINE_COMMENT; + break; + } + *line = linenr-1; + *col = saved_col; + *pos = char_count; + goto restart; + } + + /*** LINE COMMENT AND BLOCK COMMENT (C/C++/Java) ***/ + + if (cc == '/') { + cc = get(); + if (cc == '/') { + if (comment_token) { + token_buf_push(cc); + token_buf_push(cc); + } + // Skip till end-of-line (\n exclusive): + while ((cc = get()) != '\n' && cc != EOF) + if (comment_token) + token_buf_push(cc); + // cc == '\n' || cc == EOF + // Don't consider \n part of comment. + if (comment_token) { + // Undo lookahead (unget(EOF) has no effect!): + unget(cc); + *type = LINE_COMMENT; + break; + } + *line = linenr-1; + *col = saved_col; + *pos = char_count; + goto restart; + } + + if (cc == '*') { + if (comment_token) { + token_buf_push('/'); + token_buf_push(cc); + } + // Skip till */ inclusive: + int nc = get(); // if EOF next get will be EOF too + if (comment_token && nc != EOF) + token_buf_push(nc); + do { + cc = nc; + nc = get(); + if (nc == EOF) { // Error! + fprintf(stderr, + "(E): [%s:%u] Unexpected end-of-file in /* comment.\n", + filename, *line); + unexpect_eof++; + if (comment_token) + // Better return partial comment as token and postpone EOF: + *type = BLOCK_COMMENT; + else + token_buf_reset(); + token_buf_close(); + *token = token_buf; + return token_len; + } + if (comment_token) + token_buf_push(nc); + } while (cc != '*' || nc != '/'); + // cc == '*' && nc == '/' + // Don't consider char right after */ as part of comment. + if (comment_token) { + *type = BLOCK_COMMENT; + break; + } + *line = linenr; + *col = column; + *pos = char_count; + cc = get(); + goto restart; + } + // seen / but not // or /* + unget(cc); // char after / + cc = '/'; // restore / + } + + // If white-space and/or comments skipped must reset coordinates: + *line = linenr; + *col = column-1; + *pos = char_count-1; + + /*** CHAR and STRING PREFIX (C/C++) ***/ + + // Allow u,U,L prefix for string and char + // FIXME: allow u8 as prefix for string + if (cc == 'L' || cc == 'u' || cc == 'U') { + token_buf_push(cc); + cc = get(); + if (cc == '"') + goto string_token; + if (cc == '\'') + goto char_token; + // u,U,L will be interpreted as (start of) identifier. + unget(cc); // char after u,U,L + cc = token_buf_pop(); // restore original and remove from token + } + + /*** IDENTIFIER (C/C++/Java) and KEYWORD (C/C++) ***/ + // Java: false, true, null are literals + // FIXME: Flag to allow .letter as part of identifier? + // (compound identifier) + + // Simplistic solution to allowing Unicode: allow any char >= 128 without + // actual checking for UTF-8. + if (isalpha(cc) || cc == '_' || cc == '$' || (cc & 0x80)) { + token_buf_push(cc); + while (isalnum(cc = get()) || cc == '_' || cc == '$' || + cc != EOF && (cc & 0x80)) + token_buf_push(cc); + unget(cc); + token_buf_close(); + *type = is_keyword(token_buf) ? 
KEYWORD : IDENTIFIER; + break; + } + + /*** INTEGER and FLOATING ***/ + // Java: uses _ in numbers as insignificant separator + // Java: decimal suffix: [lL], float suffix: [fFdD] + // Java: allows hex float + +#if 0 + // Examples: + int bin_num = 0B010101u; + int oct_num = 01234567L; + int hex_num = 0x123ABCLL; + int dec_num = 12345678; + + float flt_num1 = 077.; + float flt_num2 = 077.987; + float flt_num3 = 77.; + float flt_num4 = .77; +#endif + + // . digits ... floating + if (cc == '.') { + // Look ahead for a digit: + int nc; + if (isdigit(nc = get())) { + unget(nc); + goto start_fraction; + } + unget(nc); + // Could go immediately to operator: goto seen_period + } + + if (isdigit(cc)) { // binary, octal, decimal, or hexadecimal literal + // Types of integer literals: + enum { + BIN, OCT, DEC, HEX + } int_lit = cc == '0' ? OCT : DEC; + + // Lookahead: + int nc = get(); + if (int_lit == OCT && (nc == 'x' || nc == 'X')) { + int_lit = HEX; + token_buf_push(cc); // the 0 + cc = nc; // the x or X + } + else + if (int_lit == OCT && (nc == 'b' || nc == 'B')) { + int_lit = BIN; + token_buf_push(cc); // the 0 + cc = nc; // the b or B + } + else + unget(nc); // isdigit(cc) + + do { + token_buf_push(cc); + cc = get(); + + // Allow for ' between `digits': + if (cc == '\'') { + // Keep the ' in the token for now: + token_buf_push(cc); + int nc = get(); + if (isdigit(nc) || int_lit == HEX && isxdigit(nc)) + cc = nc; + else { // Error! + fprintf(stderr, + "(E): [%s:%u] C++14 only allows ' between digits.\n", + filename, linenr); + // what to do? + } + } + } while (isdigit(cc) || int_lit == HEX && isxdigit(cc)); + // !is[x]digit(cc) + + // FIXME: allow hex floats in C + if (int_lit == OCT || int_lit == DEC) { + int floating = 0; + // Seen digits-sequence. Maybe followed by . or e or E? + if (cc == '.') { // fractional part + start_fraction: + floating = 1; + token_buf_push(cc); + // digits? FIXME: again allow ' between digits + while (isdigit(cc = get())) + token_buf_push(cc); + // !isdigit(cc) + } + // cc != '.' || !isdigit(cc) + if (cc == 'e' || cc == 'E') { // exponent + floating = 1; + token_buf_push(cc); + if ((cc = get()) == '-' || cc == '+') { + token_buf_push(cc); + cc = get(); + } + // FIXME: no check for at least 1 digit + // FIXME: again allow ' between digits + while (isdigit(cc)) { + token_buf_push(cc); + cc = get(); + } + // !isdigit(cc) + } + if (floating) { + if (cc == 'f' || cc == 'F' || cc == 'l' || cc == 'L') + token_buf_push(cc); + else + unget(cc); + *type = FLOATING; + break; + } + } + + // optional integer suffix: l, ll, lu, llu, u, ul, ull, any case + if (cc == 'l' || cc == 'L') { + token_buf_push(cc); + // maybe another l + cc = get(); + if (cc == 'l' || cc == 'L') { + token_buf_push(cc); + // Here: token is digits[lL][lL] + cc = get(); + } + // maybe a u + if (cc == 'u' || cc == 'U') + // Here: token is digits[lL][lL]?[u|U] + token_buf_push(cc); + else + unget(cc); + } + else if (cc == 'u' || cc == 'U') { + token_buf_push(cc); + // maybe an l + cc = get(); + if (cc == 'l' || cc == 'L') { + token_buf_push(cc); + // Here: token is digits[uU][lL] + cc = get(); + } + // maybe another l + if (cc == 'l' || cc == 'L') + // Here: token is digits[uU][lL]?[lL] + token_buf_push(cc); + else + unget(cc); + } + else + unget(cc); + *type = INTEGER; + break; + } + + /*** STRING (C/C++/Java) ***/ + + if (cc == '"') { + string_token: + token_buf_push(cc); + // Watch out for escaped " inside string. + cc = get(); + while (cc != '"') { + if (cc == EOF) { // Error! 
+ fprintf(stderr, + "(E): [%s:%u] Unexpected end-of-file in string literal.\n", + filename, *line); + unexpect_eof++; + // Better return partial string as token and postpone EOF: + *type = STRING; + token_buf_close(); + *token = token_buf; + return token_len; + } + token_buf_push(cc); + int nc = get(); + + if (cc == '\\') { + // FIXME: No check on valid escape char! + // ' " ? \ a b f n r t v + token_buf_push(nc); + cc = get(); + } + else + cc = nc; + } + // cc == '"' + token_buf_push(cc); + *type = STRING; + break; + } + + /*** CHARACTER (C/C++/Java) ***/ + + if (cc == '\'') { + char_token: + token_buf_push(cc); + // Watch out for escaped ' inside char. + cc = get(); + // Cannot have empty char! + if (cc == '\'') { + fprintf(stderr, + "(E): [%s:%u] Cannot have an empty character literal.\n", + filename, linenr); + // Output as token anyway, but count as illegal: + token_buf_push(cc); + *type = CHARACTER; + illegals++; + break; + } + + // FIXME: Avoid including too many chars. + while (cc != '\'') { + if (cc == EOF) { // Error! + fprintf(stderr, + "(E): [%s:%u] Unexpected end-of-file in character literal.\n", + filename, linenr); + unexpect_eof++; + // Better return partial character as token and postpone EOF: + *type = CHARACTER; + token_buf_close(); + *token = token_buf; + return token_len; + } + if (cc == '\n') { // Error! + fprintf(stderr, + "(E): [%s:%u] Cannot have end-of-line in character literal.\n", + filename, linenr); + illegals++; + // Immediately terminate character literal as if ' present. + // cc = '\''; make into valid literal??? No! + break; + } + token_buf_push(cc); + int nc = get(); + if (cc == '\\') { + token_buf_push(nc); + cc = get(); + // FIXME: No check on valid escape char! + // ' " ? \ a b f n r t v 0[d[d]] xh* + } + else { + cc = nc; + // If first char then expect no more. + if (token_len == 2) { + if (nc != '\'') { + fprintf(stderr, + "(E): [%s:%u] Cannot have multi-character literal.\n", + filename, linenr); + illegals++; + // Immediately terminate character literal as if ' present. + // cc = '\''; make into valid literal??? + break; + } + } + } + } + if (cc == '\'') + token_buf_push(cc); + else + unget(cc); + *type = CHARACTER; + break; + } + + /*** OPERATOR (and PUNCTUATION) (C/C++/Java) ***/ + + // Operator and punctuation symbols. Longest match. + + /* Operator or punctuator Alternative representation + { <% + } %> + [ <: + ] :> + # %: (not supported here) + ## %:%: (not supported here) + */ + + // Single char operator or punctuator (C/C++/Java) + // { } [ ] ( ) ; : ? . ~ ! + - * / % ^ = & | < > , + // Double char operator or punctuator (C/C++) + // <: :> <% %> + // Double char operator or punctuator (C/C++/Java) + // += -= *= /= %= ^= &= |= == != <= >= && || << >> ++ -- -> + // Double char operator or punctuator (C++/Java) + // :: + // Double char operator or punctuator (C++) + // .* + // Triple char operator or punctuator (C/C++/Java) + // ... <<= >>= + // Triple char operator or punctuator (C++) + // ->* <=> + // Java: @ >>> >>>= + + //seen_period: + + token_buf_push(cc); + token_buf_close(); + //token=[cc,0];len=1 + + if (strstr("{}[]();?~,@", token_buf)) { // allow @ for Java + // Single char operator/punctuator. 
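+ // (token_buf holds exactly one char at this point, so strstr in effect
+ // performs a simple set-membership test.)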
+ *type = OPERATOR; + break; + } + + if (strstr("<:.-+*/%^&|=!>", token_buf)) { // single or start of double/triple + // Check second char: + int c2 = get(); + if (c2 != EOF) { + token_buf_push(c2); + //token=[cc,c2];len=2 + + // Check third char: + int c3 = get(); + if (c3 != EOF) { + token_buf_push(c3); + token_buf_close(); + //token=[cc,c2,c3,0];len=3 + if (!strcmp(">>>", token_buf)) { // allow >>> for Java + //token=[>,>,>,0];len=3 + // Look-ahead for =: + int c4 = get(); + if (c4 == '=') // >>>= for Java + token_buf_push(c4); + //token=[>,>,>,=];len=4 + else + unget(c4); + //token=[>,>,>,0];len=3 + *type = OPERATOR; + break; + } + //token=[cc,c2,c3,0];len=3 + + if (!strcmp("...", token_buf) || + !strcmp("<=>", token_buf) || + !strcmp("->*", token_buf) || + !strcmp("<<=", token_buf) || + !strcmp(">>=", token_buf)) { + // Triple char operator/punctuator. + *type = OPERATOR; + break; + } + + // Maybe double char. Undo the c3 token extension: + token_buf_pop(); + token_buf_close(); + //token=[cc,c2,0];len=2 + } + else + token_buf_close(); + //token=[cc,c2,0];len=2 + unget(c3); + + // Maybe double char. + static const char * const ops2[] = { + "<:", "<%", "<=", "<<", ":>", + "::", ".*", "->", "-=", "--", + "+=", "++", "*=", "/=", "%>", + "%=", "^=", "&=", "&&", "|=", + "||", "==", "!=", ">=", ">>" + }; + unsigned size = sizeof(ops2) / sizeof(ops2[0]); + unsigned i; + for (i = 0; i < size; i++) + if (!strcmp(ops2[i], token_buf)) + break; + if (i < size) { + *type = OPERATOR; + break; + } + //token=[cc,c2,0];len=2 + + // Must be single char. Undo the c2 token extension: + token_buf_pop(); + token_buf_close(); + //token=[cc,0];len=1 + } + //else token=[cc,0];len=1 + + // Must be single char. + unget(c2); + *type = OPERATOR; + break; + } + //token=[cc,0];len=1 + + /*** PREPROCESSOR (C/C++) ***/ + + if (cc == '#') { + int nc = get(); + if (nc != '#') + unget(nc); + else + token_buf_push(nc); + *type = PREPROCESSOR; + break; + } + + // What is left here? Illegal chars! + if (!nowarn) + // Mind non-printing chars! + fprintf(stderr, + "(W): [%s:%u] Illegal character `%s%c` (0x%02x) skipped.\n", + filename, linenr, cc<32?"CTRL-":"", cc<32?cc+64:cc, cc); + // Count them: + illegals++; + + } while (1); + token_buf_close(); + *token = token_buf; + return token_len; +} + +unsigned C_tokenize(const char **token, const char **type, + unsigned *line, unsigned *col, unsigned *pos) +{ + enum TokenClass typeid; + unsigned result = C_tokenize_int(token, &typeid, line, col, pos); + *type = token_class[typeid]; + return result; +} + +// Escape hard newlines in a string. +void RAW_escape(FILE *out, const char *token) +{ + const char *p; + for (p = token; *p; p++) { + if (*p == '\n') { + fputs("\\n", out); + continue; + } + fputc(*p, out); + } +} + +// Escape token for output as CSV string. +void CSV_escape(FILE *out, const char *token) +{ + const char *p; + // start CSV string: + fputc('"', out); + for (p = token; *p; p++) { + if (*p == '\n') { // escape embedded real new lines + fputs("\\n", out); + continue; + } + if (*p == '"') + fputc('"', out); + fputc(*p, out); + } + // end CSV string: + fputc('"', out); +} + +// Escape token for output as JSON string. +void JSON_escape(FILE *out, const char *token) +{ + // C/C++ has escapes: \' \" \? \a \b \f \n \r \t \v \x \0. 
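+ // (The goal: e.g. the 2-char source sequence \n must come out as \\n in
+ // JSON, so a reader recovers the literal backslash-n, not a newline.)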
+ // To preserve, simply escape the backslash and all ":
+ const char *p;
+ for (p = token; *p; p++) {
+ if (*p == '\n') { // escape embedded real new lines
+ fputs("\\n", out);
+ continue;
+ }
+ if (*p == '\t') { // escape embedded real TABs
+ fputs("\\t", out);
+ continue;
+ }
+ // FIXME: control characters from U+0000 through U+001F must be escaped
+ if (*p == '\\' || *p == '"')
+ fputc('\\', out);
+ fputc(*p, out);
+ }
+}
+
+// Escape token for output as XML text.
+void XML_escape(FILE *out, const char *token)
+{
+#if 1
+ // Alternative: escape every <, >, and &:
+ const char *p;
+ for (p = token; *p; p++) {
+ if (*p == '<')
+ fputs("&lt;", out);
+ else
+ if (*p == '>')
+ fputs("&gt;", out);
+ else
+ if (*p == '&')
+ fputs("&amp;", out);
+ else
+ fputc(*p, out);
+ }
+#else
+ // Use CDATA construct for escaping.
+ // Impossible to escape ]]> occurring in token!
+ // Must chop up the substring ]]> in ]] and >.
+ const char *p;
+ const char *q = token;
+ // "abc]]>hello" => <![CDATA[abc]]]]><![CDATA[>hello]]>
+ // "]]>]]>" => <![CDATA[]]]]><![CDATA[>]]]]><![CDATA[>]]>
+ while ((p = strstr(q, "]]>"))) {
+ int len = p - q; // always > 0
+ fprintf(out, "<![CDATA[%.*s]]>", len+2, q);
+ q = p+2; // q start at >...
+ }
+ if (q < token+strlen(token))
+ fprintf(out, "<![CDATA[%s]]>", q);
+#endif
+}
diff --git a/tools/tokenizer/libtoken.h b/tools/tokenizer/libtoken.h
new file mode 100644
index 0000000..0af2491
--- /dev/null
+++ b/tools/tokenizer/libtoken.h
@@ -0,0 +1,103 @@
+/* Copyright (c) 2021, 2022 International Business Machines Corporation
+ Prepared by: Geert Janssen
+
+ Code functionality shared by all tokenizers.
+*/
+
+#ifndef LIBTOKEN_H
+#define LIBTOKEN_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_BUF 8 // maximum lookahead in chars
+
+/* Let's assume UTF-8 encoding.
+ https://www.cprogramming.com/tutorial/unicode.html
+ https://opensource.apple.com/source/tidy/tidy-2.2/tidy/src/utf8.c.auto.html
+*/
+
+// Test for start of UTF-8 sequence.
+#define utf8_start(cc) (((cc)&0xC0)!=0x80)
+#define utf8_follow(cc) (((cc)&0xC0)==0x80)
+
+#define utf8_len(cc) \
+ (((cc)&0xF8)==0xF0 ? 4 : ((cc)&0xF0)==0xE0 ? 3 : ((cc)&0xE0)==0xC0 ? 2 : 1)
+
+typedef enum { C, CPP, JAVA, JAVASCRIPT, PYTHON } Language;
+
+// Program globals:
+extern const char *filename/*= "stdin"*/; // current file being parsed
+extern unsigned linenr/*= 1*/; // physical line number counted from 1
+extern unsigned column/*= 0*/; // char position in physical line, from 0
+extern unsigned saved_col/*= 0*/; // 1-place buf for last column on prev line
+extern unsigned char_count/*= 0*/; // total char/byte count
+extern unsigned utf8_count/*= 0*/; // total utf-8 char count
+extern unsigned buffered/*= 0*/; // number of buffered chars
+extern int buffer[MAX_BUF]; // use buffer as multi-char lookahead.
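+
+/* Minimal usage sketch (see tokenize.c for the full driver); assumes the
+   input has been set up, e.g., via open_as_stdin() and set_or_detect_lang():
+
+     const char *tok, *cls;
+     unsigned line, col, pos;
+     while (C_tokenize(&tok, &cls, &line, &col, &pos))
+       printf("(%u,%u) %s: %s\n", line, col, cls, tok);
+*/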
+ +// Program option settings: +extern int debug/*= 0*/; // when 1 debug output to stderr +extern int verbose/*= 0*/; // when 1 info output to stderr +extern int nowarn/*= 0*/; // when 1 warnings are suppressed + +extern unsigned illegals/*= 0*/; // count number of illegal characters +extern unsigned unexpect_eof/*= 0*/; // encountered unexpected EOF +extern int hash_as_comment/*= 0*/; // when 1 treat # as line comment +extern int newline_token/*= 0*/; // when 1 output newline pseudo-token +extern int comment_token/*= 0*/; // when 1 output comments as tokens +extern int whitespace_token/*= 0*/; // when 1 output adjacent white-space as a token +extern int continuation_token/*= 0*/; // when 1 output line continuation pseudo-token + +enum TokenClass { + /* 0*/ IDENTIFIER, + /* 1*/ KEYWORD, + /* 2*/ STRING, + /* 3*/ CHARACTER, + /* 4*/ INTEGER, + /* 5*/ FLOATING, + /* 6*/ OPERATOR, + /* 7*/ PREPROCESSOR, + /* 8*/ LINE_COMMENT, + /* 9*/ BLOCK_COMMENT, + /*10*/ WHITESPACE, + /*11*/ NEWLINE, + /*12*/ CONTINUATION, + /*13*/ FILENAME, + /*14*/ ENDOFFILE +}; + +extern const char *token_class[]; + +// keyword lookup function (pointer variable): +// (initialized by set_or_detect_lang()) +extern const char *(*is_keyword)(const char *); + +extern int get(void); +extern void unget(int cc); +extern Language set_or_detect_lang(const char *source); +extern const char *lang_name(Language lang); +extern int open_as_stdin(const char *file); + +extern unsigned C_tokenize_int(const char **token, enum TokenClass *type, + unsigned *line, unsigned *col, unsigned *pos); +extern unsigned C_tokenize(const char **token, const char **type, + unsigned *line, unsigned *col, unsigned *pos); + +extern void RAW_escape(FILE *out, const char *token); +extern void CSV_escape(FILE *out, const char *token); +extern void JSON_escape(FILE *out, const char *token); +extern void XML_escape(FILE *out, const char *token); + +#ifdef __cplusplus +} +#endif + +#endif /* LIBTOKEN_H */ diff --git a/tools/tokenizer/ntokenize.c b/tools/tokenizer/ntokenize.c index b5625fe..7adac51 100644 --- a/tools/tokenizer/ntokenize.c +++ b/tools/tokenizer/ntokenize.c @@ -56,62 +56,62 @@ #define ws_RE "[ \t\v\f\n]*" // 96 chars (omitted are e.g.: @ $ `) -// 3 5 67 8 9 9 -// 1234 5 6 7 3 9 9012345678901234567890123 4 56 -#define basic_char0_RE "[][ \t\v\f\na-zA-Z0-9_{}#()<>%:;.?*+/^&|~!=,\\\"'-]" +// 33 56 67 8 9 9 +// 1234 5 6 7 8 34 90 9012345678901234567890123 4 56 +#define basic_char0_RE "[][ \t\v\f\na-zA-Z0-9_{}#()<>%:;.?*+/^&|~!=,\\\"'-]" // all basic chars except \n and > -#define h_chars_RE "[][ \t\v\fa-zA-Z0-9_{}#()<%:;.?*+/^&|~!=,\\\"'-]+" +#define h_chars_RE "[][ \t\v\fa-zA-Z0-9_{}#()<%:;.?*+/^&|~!=,\\\"'-]+" // all basic chars except \n and \" -#define q_chars_RE "[][ \t\v\fa-zA-Z0-9_{}#()<>%:;.?*+/^&|~!=,\\'-]+" -#define header_RE "<"h_chars_RE">|\""q_chars_RE"\"" -#define pp_number_RE "\\.?[0-9]('?[a-zA-Z_0-9]|[eE][+-]|\\.)*" - -#define unichar_RE "\\\\u[0-9a-fA-F]{4}|\\\\U[0-9a-fA-F]{8}" - -//#define identifier_RE "[_a-zA-Z][_a-zA-Z0-9]*" -#define identifier_RE "([_a-zA-Z]|"unichar_RE")([_a-zA-Z0-9]|"unichar_RE")*" - -#define suffix_RE "([uU]ll?|[uU]LL?|ll?[uU]?|LL?[uU]?)?" 
-#define binary_RE "0[bB][01]('?[01])*"suffix_RE -#define octal_RE "0('?[0-7])*"suffix_RE -#define decimal_RE "[1-9]('?[0-9])*"suffix_RE -#define hexadecimal_RE "0[xX][0-9a-fA-F]('?[0-9a-fA-F])*"suffix_RE -#define integer_RE binary_RE"|"octal_RE"|"decimal_RE"|"hexadecimal_RE - -#define dec_part_RE "[0-9]('?[0-9])*" -#define exponent_RE "[eE][-+]?[0-9]('?[0-9])*" -#define floating_RE "(\\."dec_part_RE"("exponent_RE")?|"\ - dec_part_RE"\\.("dec_part_RE")?("exponent_RE")?|"\ - dec_part_RE exponent_RE")[fFlL]?" - -#define oct_char_RE "\\\\[0-7]{1,3}" -#define hex_char_RE "\\\\x[0-9a-fA-F]+" -#define escape_RE "\\\\['\"?abfnrtv\\]|"oct_char_RE"|"hex_char_RE -#define character_RE "[uUL]?'([^'\\\n]|"escape_RE"|"unichar_RE")'" -#define string_RE "[uUL]?\"([^\"\\\n]|"escape_RE"|"unichar_RE")*\"" +#define q_chars_RE "[][ \t\v\fa-zA-Z0-9_{}#()<>%:;.?*+/^&|~!=,\\'-]+" +#define header_RE "<"h_chars_RE">|\""q_chars_RE"\"" +#define pp_number_RE "\\.?[0-9]('?[a-zA-Z_0-9]|[eE][+-]|\\.)*" + +#define unichar_RE "\\\\u[0-9a-fA-F]{4}|\\\\U[0-9a-fA-F]{8}" + +//#define identifier_RE "[_a-zA-Z][_a-zA-Z0-9]*" +#define identifier_RE "([_a-zA-Z]|"unichar_RE")([_a-zA-Z0-9]|"unichar_RE")*" + +#define suffix_RE "([uU]ll?|[uU]LL?|ll?[uU]?|LL?[uU]?)?" +#define binary_RE "0[bB][01]('?[01])*"suffix_RE +#define octal_RE "0('?[0-7])*"suffix_RE +#define decimal_RE "[1-9]('?[0-9])*"suffix_RE +#define hexadecimal_RE "0[xX][0-9a-fA-F]('?[0-9a-fA-F])*"suffix_RE +#define integer_RE binary_RE"|"octal_RE"|"decimal_RE"|"hexadecimal_RE + +#define dec_part_RE "[0-9]('?[0-9])*" +#define exponent_RE "[eE][-+]?[0-9]('?[0-9])*" +#define floating_RE "(\\."dec_part_RE"("exponent_RE")?|"\ + dec_part_RE"\\.("dec_part_RE")?("exponent_RE")?|"\ + dec_part_RE exponent_RE")[fFlL]?" + +#define oct_char_RE "\\\\[0-7]{1,3}" +#define hex_char_RE "\\\\x[0-9a-fA-F]+" +#define escape_RE "\\\\['\"?abfnrtv\\]|"oct_char_RE"|"hex_char_RE +#define character_RE "[uUL]?'([^'\\\n]|"escape_RE"|"unichar_RE")'" +#define string_RE "[uUL]?\"([^\"\\\n]|"escape_RE"|"unichar_RE")*\"" // should really be: any basic source char except ) followed by delimiter -#define r_chars_RE "[^)]*" +#define r_chars_RE "[^)]*" // delimiter; first and second occurrence in rawstring must be the same // use back reference \3: -#define d_chars_RE "([^ ()\\\t\v\f\n]{0,16})" -#define rawstring_RE "[uUL]?R\""d_chars_RE"\\("r_chars_RE"\\)\\3\"" +#define d_chars_RE "([^ ()\\\t\v\f\n]{0,16})" +#define rawstring_RE "[uUL]?R\""d_chars_RE"\\("r_chars_RE"\\)\\3\"" -#define operator_RE "[][{}();?~,]|<=>|<<=|\\.\\.\\.|->\\*|>>=|"\ - "[*/!=^]=?|<[:%=<]?|:[:>]?|\\.[*]?|-[->=]?|\\+[=+]?|"\ - "%[>=]?|&[=&]?|>[>=]?|\\|[|=]?" +#define operator_RE "[][{}();?~,]|<=>|<<=|\\.\\.\\.|->\\*|>>=|"\ + "[*/!=^]=?|<[:%=<]?|:[:>]?|\\.[*]?|-[->=]?|\\+[=+]?|"\ + "%[>=]?|&[=&]?|>[>=]?|\\|[|=]?" #define preprocessor_RE "##?" -#define token_RE "^"ws_RE"(("rawstring_RE")|("identifier_RE")|("\ +#define token_RE "^"ws_RE"(("rawstring_RE")|("identifier_RE")|("\ integer_RE")|("floating_RE")|("string_RE")|("\ character_RE")|("operator_RE")|("preprocessor_RE"))" #define NMATCH 34 // Guarded against overflow but not full-proof! 
-#define MAX_LINE 4096 // maximum logical line length in chars (\0 exclusive) +#define MAX_LINE 4096 // maximum logical line length in chars (\0 exclusive) #define utf8_start(cc) (((cc)&0xC0)!=0x80) @@ -204,7 +204,7 @@ unsigned get_token(char const *text, unsigned start) if (regexec(re, text, nmatch, pmatch, REG_NOTEOL) == REG_NOMATCH) { // Warn about the failed match: fprintf(stderr, "(W) [%u:%u] not a valid token; skipped.\n", - linenrs[start],columns[start]); + linenrs[start],columns[start]); // Cannot recover; no more input. return 0; } @@ -239,7 +239,6 @@ unsigned get_token(char const *text, unsigned start) int normalize_newline(void) { int cc = getchar(); - if (cc == EOF || cc == '\n') return cc; if (cc == '\r') { // Maybe \r \n (CR NL) combination? @@ -248,10 +247,10 @@ int normalize_newline(void) char_count++; // counts the carriage return utf8_count++; // No use incrementing column. - return nc; // effectively skip the \r + return nc; // return \n; effectively skipping the \r } - // Mind nc not \n. - if (nc != EOF) ungetc(nc, stdin); + // Mind nc not \n. ungetc(EOF) is Okay. + ungetc(nc, stdin); // cc == '\r'; consider a newline as well, so turn into \n: cc = '\n'; } @@ -266,15 +265,15 @@ int get(void) int cc; restart: // Read a fresh char: - cc = normalize_newline(); + cc = normalize_newline(); // cc != '\r' if (cc == EOF) return EOF; char_count++; if (utf8_start(cc)) utf8_count++; - if (cc == '\n') { + if (cc == '\n') { // a normalized end-of-line (\r|\r?\n) linenr++; column = 0; - return cc; + return cc; // \n here signals a logical end-of-line } // Deal with \ line continuations! @@ -291,8 +290,8 @@ int get(void) // Could again start a line continuation! goto restart; } - // Mind nc not \n. - if (nc != EOF) ungetc(nc, stdin); + // Mind nc not \n. ungetc(EOF) is Okay. + ungetc(nc, stdin); // cc == '\\' a regular backslash } column++; @@ -397,22 +396,22 @@ int buffer_fill(void) if (cc == '"') { // Switch to unfiltered input till unescaped closing ": if ((cc = get()) == '"') { - buffer_add(cc); - // An empty string literal. - continue; + buffer_add(cc); + // An empty string literal. + continue; } if (cc == EOF || cc == '\n') - // unexpected EOF or newline in string - break; + // unexpected EOF or newline in string + break; buffer_add(cc); int pc; do { - pc = cc; - cc = get(); - if (cc == EOF || cc == '\n') - // unexpected EOF or newline in string - goto break_outer; - buffer_add(cc); + pc = cc; + cc = get(); + if (cc == EOF || cc == '\n') + // unexpected EOF or newline in string + goto break_outer; + buffer_add(cc); } while (pc == '\\' || cc != '"'); // pc != '\\' && cc == '"' } diff --git a/tools/tokenizer/pytokenize.c b/tools/tokenizer/pytokenize.c index fb787a9..664fdcb 100644 --- a/tools/tokenizer/pytokenize.c +++ b/tools/tokenizer/pytokenize.c @@ -1,7 +1,7 @@ /* Copyright (c) 2021 International Business Machines Corporation Prepared by: Geert Janssen - Tokenizer for Python 3. 
+ Tokenizer for Python 3.x

 Token classes:
 - identifier
@@ -33,13 +33,13 @@
 // Program globals:
 static unsigned brackets_opened = 0; // unpaired nested ( [ { seen
 static int prev_was_newline = 1; // no previous token or was newline
-static int first_time = 1;
+static int first_time = 1; // controls adding a , separator for JSON and JSONL

 // Program option settings:
 static int start_token = 0; // when 1 start filename pseudo-token
 static int continuous_files = 0; // when 1 do not reset after each file
 static enum { PLAIN, CSV, JSON, JSONL, XML, RAW } mode = PLAIN;
-static int output_layout = 0; // when 1 output layout pseudo tokens
+static int output_layout = 0; // when 1 output layout pseudo tokens

 static const char *keywords[] = {
 "False", "None", "True", "and", "as", "assert", "async",
@@ -51,7 +51,7 @@
 static const unsigned num_keywords = sizeof(keywords)/sizeof(keywords[0]);

-static void emit(const char *s, unsigned line, unsigned col)
+static void emit(const char *s, unsigned line, unsigned col)
 {
 if (output_layout) {
 switch (mode) {
@@ -67,18 +67,18 @@
 case JSON:
 case JSONL:
 if (first_time)
- first_time = 0;
+ first_time = 0;
 else {
- if (mode == JSON) fputc(',', stdout);
- fputc('\n', stdout);
+ if (mode == JSON) fputc(',', stdout);
+ fputc('\n', stdout);
 }
 fprintf(stdout, "{ \"line\": %u, \"column\": %u, "
- "\"class\": \"layout\", \"token\": \"%s\" }", line, col, s);
+ "\"class\": \"layout\", \"token\": \"%s\" }", line, col, s);
 break;
 case XML:
 fprintf(stdout,
- "<token line=\"%u\" column=\"%u\" class=\"layout\">%s</token>\n",
- line, col, s);
+ "<token line=\"%u\" column=\"%u\" class=\"layout\">%s</token>\n",
+ line, col, s);
 break;
 }
 }
@@ -88,19 +88,19 @@
 #define MAX_INDENTS 128
 static unsigned indents[MAX_INDENTS];
 static unsigned *sp = indents;
-#define indents_reset() do { sp = indents; } while(0)
-#define indents_empty() (sp == indents)
-#define indents_full() (sp == indents+MAX_INDENTS)
-#define indents_top() (indents_empty() ? 0 : *(sp-1))
-#define indents_push(i) do { assert(!indents_full()); *sp++ = (i); } while(0)
-#define indents_pop() do { assert(!indents_empty()); sp--; } while(0)
+#define indents_reset() do { sp = indents; } while(0)
+#define indents_empty() (sp == indents)
+#define indents_full() (sp == indents+MAX_INDENTS)
+#define indents_top() (indents_empty() ?
0 : *(sp-1)) +#define indents_push(i) do { assert(!indents_full()); *sp++ = (i); } while(0) +#define indents_pop() do { assert(!indents_empty()); sp--; } while(0) // emit NEWLINE and deal with indentation static void process_newline(unsigned indent) { emit("NEWLINE", linenr-1, saved_col); - unsigned last_indent = indents_top(); + unsigned last_indent = indents_top(); // maybe 0 if (indent > last_indent) { indents_push(indent); @@ -116,11 +116,12 @@ static void process_newline(unsigned indent) } while (indent < indents_top()); // Here: empty() || indent >= top() if (indent > indents_top() && !nowarn) - fprintf(stderr, "(W): incorrect indentation.\n"); + fprintf(stderr, "(W): Incorrect indentation.\n"); } // else: indent == last_indent: no action } +// cc in [ \t\f] static int process_ws(int cc) { // Collect white-space and compute possible indentation: @@ -171,7 +172,7 @@ static int utf8_codepoint(int cc, int *len, int bytes[4]) else { /* invalid utf-8 start byte */ if (!nowarn) fprintf(stderr, "(W): [%s:%u] Invalid UTF-8 start byte 0x%02x.\n", - filename, linenr, cc); + filename, linenr, cc); return cc; } /* collect all follow bytes: */ @@ -179,15 +180,15 @@ static int utf8_codepoint(int cc, int *len, int bytes[4]) cc = get(); if (cc == EOF) { /* unexpected EOF in utf-8 sequence */ if (!nowarn) - fprintf(stderr, "(W): [%s:%u] Unexpected EOF in UTF-8 sequence.\n", - filename, linenr); + fprintf(stderr, "(W): [%s:%u] Unexpected EOF in UTF-8 sequence.\n", + filename, linenr); return EOF; } bytes[(*len)++] = cc; if ((cc & 0xC0) != 0x80) { /* invalid utf-8 follow byte */ if (!nowarn) - fprintf(stderr, "(W): [%s:%u] Invalid UTF-8 follow byte 0x%02x.\n", - filename, linenr, cc); + fprintf(stderr, "(W): [%s:%u] Invalid UTF-8 follow byte 0x%02x.\n", + filename, linenr, cc); return cc; } cp <<= 6; @@ -199,7 +200,7 @@ static int utf8_codepoint(int cc, int *len, int bytes[4]) /* invalid Unicode code point. */ if (!nowarn) fprintf(stderr, "(W): [%s:%u] Invalid Unicode code point 0x%04x.\n", - filename, linenr, cp); + filename, linenr, cp); } return cp; } @@ -261,7 +262,7 @@ static int tokenize(char *token, const char **type, cc = get(); // Maybe EOF! if (!brackets_opened && !strchr(" \t\n#\r\f", cc)) - process_newline(0); + process_newline(0); goto restart; } @@ -274,8 +275,8 @@ static int tokenize(char *token, const char **type, if (cc == EOF) { // Undo any outstanding indents: while (!indents_empty()) { - emit("DEDENT", linenr, column); - indents_pop(); + emit("DEDENT", linenr, column); + indents_pop(); } return 0; } @@ -288,8 +289,11 @@ static int tokenize(char *token, const char **type, ; // cc == '\n' || cc == '\r' || cc == EOF if (cc == '\r') { - if (!nowarn) - fprintf(stderr, "(W): Comment may not be continued with \\.\n"); + // presumably a \ may occur in a comment as last char before \n + /* + if (!nowarn) + fprintf(stderr, "(W): Comment may not be continued with \\.\n"); + */ // Effectively ignore any \ and terminate logical line: cc == '\n'; } @@ -385,9 +389,9 @@ static int tokenize(char *token, const char **type, token_add(cc); // Assume \ is not escaped itself. Happens though! 
- if (pc == '\\') // escape next char; no check - cc = '\0'; - else + if (pc == '\\') // escape next char; no check + cc = '\0'; + else if (cc == qc) { // a first unescaped quote int q2 = get(); token_add(q2); @@ -419,8 +423,8 @@ static int tokenize(char *token, const char **type, do { token_add(cc); if (pc == '\\') // escape next char; no check - cc = '\0'; - else + cc = '\0'; + else if (cc == qc) { // unescaped quote *type = "string"; break; @@ -459,29 +463,29 @@ static int tokenize(char *token, const char **type, if (is_id_start(cp, utf8_len)) { int i; for (i = 0; i < utf8_len; i++) - token_add(utf8_bytes[i]); + token_add(utf8_bytes[i]); ident_token: cc = get(); cp = utf8_codepoint(cc, &utf8_len, utf8_bytes); if (cp == EOF) // bad code point; already reported. - break; + break; all_ascii &= utf8_len == 1; while (is_id_follow(cp, utf8_len)) { - int i; - for (i = 0; i < utf8_len; i++) - token_add(utf8_bytes[i]); - cc = get(); - cp = utf8_codepoint(cc, &utf8_len, utf8_bytes); - if (cp == EOF) // bad code point; already reported. - break; - all_ascii &= utf8_len == 1; + int i; + for (i = 0; i < utf8_len; i++) + token_add(utf8_bytes[i]); + cc = get(); + cp = utf8_codepoint(cc, &utf8_len, utf8_bytes); + if (cp == EOF) // bad code point; already reported. + break; + all_ascii &= utf8_len == 1; } // Undo look ahead: while (utf8_len) - unget(utf8_bytes[--utf8_len]); + unget(utf8_bytes[--utf8_len]); token[len] = '\0'; *type = all_ascii && is_keyword(token, keywords, num_keywords) - ? "keyword" : "identifier"; + ? "keyword" : "identifier"; break; } @@ -827,7 +831,7 @@ int main(int argc, char *argv[]) fputs( "A tokenizer for Python (3) source code with output in 6 formats.\n" "Recognizes the following token classes: keyword, identifier, integer,\n" -"floating, imaginary, string, and operator.\n\n", stdout); +"floating, imaginary, string, and operator.\n\n", stderr); fprintf(stderr, usage_str, basename(argv[0])); fputs( "\nCommand line options are:\n" @@ -885,7 +889,7 @@ fputs( case '?': default: - fputs("(F): unknown option. Stop.\n", stderr); + fputs("(F): Unknown option. 
Stop.\n", stderr); fprintf(stderr, usage_str, argv[0]); return 1; } @@ -893,7 +897,7 @@ fputs( if (outfile && outfile[0]) { if (!freopen(outfile, "w", stdout)) { - fprintf(stderr, "(F): cannot open %s for writing.\n", outfile); + fprintf(stderr, "(F): Cannot open %s for writing.\n", outfile); exit(3); } } @@ -905,7 +909,7 @@ fputs( filename = argv[optind]; if (!freopen(filename, "r", stdin)) { if (!nowarn) - fprintf(stderr, "(W): Cannot read file %s.\n", filename); + fprintf(stderr, "(W): Cannot read file %s; skipped.\n", filename); continue; } @@ -963,11 +967,11 @@ fputs( while (tokenize(token, &type, &line, &col)) { switch (mode) { case RAW: - // Watch out for multi-line strings + // Watch out for multi-line strings if (!strcmp(type, "string")) RAW_escape(stdout, token); - else - fputs(token, stdout); + else + fputs(token, stdout); fputc('\n', stdout); break; case PLAIN: diff --git a/tools/tokenizer/schemas/schema.json b/tools/tokenizer/schemas/schema.json index 61909db..0723d70 100644 --- a/tools/tokenizer/schemas/schema.json +++ b/tools/tokenizer/schemas/schema.json @@ -1,7 +1,7 @@ { "$schema": "http://json-schema.org/draft-04/schema#", "title": "JSON Schema for Tokenizer JSON Output", - "description": "Prepared by Geert Janssen \nCopyright IBM Corporation 2020.", + "description": "Prepared by Geert Janssen \nCopyright IBM Corporation 2020, 2021.", "type": "array", "items": { @@ -12,8 +12,10 @@ "class": { "enum": [ "identifier", "keyword", "integer", "floating", "string", "character", "operator", "preprocessor", - "filename" + "filename", "line_comment", "block_comment", "newline", + "continuation", "whitespace" ] }, + "length": { "$ref": "#/definitions/unsignedInt" }, "token": { "type": "string" } }, "required": [ "line", "column", "class", "token" ], diff --git a/tools/tokenizer/schemas/schema.rnc b/tools/tokenizer/schemas/schema.rnc index 3adaad8..1eb43d7 100644 --- a/tools/tokenizer/schemas/schema.rnc +++ b/tools/tokenizer/schemas/schema.rnc @@ -1,5 +1,5 @@ # Compact RELAX NG (RNC) Schema for Tokenizer XML Output -# Copyright IBM Corporation 2020 +# Copyright IBM Corporation 2020, 2021 # Prepared by Geert Janssen datatypes xsd = 'http://www.w3.org/2001/XMLSchema-datatypes' @@ -16,9 +16,11 @@ doc = attribute line { xsd:unsignedInt }, attribute column { xsd:unsignedInt }, attribute class { token-classes }, + attribute length { xsd:unsignedInt }, text } token-classes = "identifier" | "keyword" | "integer" | "floating" | "string" | - "character" | "operator" | "preprocessor" | "filename" + "character" | "operator" | "preprocessor" | "filename" | + "line_comment" | "block_comment" | "newline" | "continuation" | "whitespace" diff --git a/tools/tokenizer/schemas/tokml-schema.rnc b/tools/tokenizer/schemas/tokml-schema.rnc new file mode 100644 index 0000000..55e1165 --- /dev/null +++ b/tools/tokenizer/schemas/tokml-schema.rnc @@ -0,0 +1,73 @@ +# XML RNC schema for tokML 1.0 +# Copyright IBM Corporation 2021 +# Prepared by Geert Janssen + +datatypes xsd = 'http://www.w3.org/2001/XMLSchema-datatypes' + +#default namespace = "https://www.ibm.com/tokml" + +start = source + +# Children are token elements interspersed with white-space. +source = element source { + attribute language { "C" | "C++" | "Java" }, + attribute filename { xsd:string }?, + ( line_comment | + block_comment | + keyword | + identifier | + integer | + floating | + \string | + character | + operator | + preprocessor | + text )* +} + +# Attributes common to all token elements. 
+common-attrs = + ( attribute line { xsd:unsignedInt }, + attribute col { xsd:unsignedInt }, + attribute len { xsd:unsignedInt } ) + +line_comment = element line_comment { + common-attrs, + text +} +block_comment = element block_comment { + common-attrs, + text +} +keyword = element keyword { + common-attrs, + text +} +identifier = element identifier { + common-attrs, + text +} +integer = element integer { + common-attrs, + text +} +floating = element floating { + common-attrs, + text +} +\string = element string { + common-attrs, + text +} +character = element character { + common-attrs, + text +} +operator = element operator { + common-attrs, + text +} +preprocessor = element preprocessor { + common-attrs, + text +} diff --git a/tools/tokenizer/token_common.c b/tools/tokenizer/token_common.c index 6eefa7e..ba57ba1 100644 --- a/tools/tokenizer/token_common.c +++ b/tools/tokenizer/token_common.c @@ -25,7 +25,7 @@ unsigned num_files = 0; // number of files read int debug = 0; // when 1 debug output to stderr int verbose = 0; // when 1 info output to stderr int nowarn = 0; // when 1 warnings are suppressed -Language source = C; // language mode +Language source = C; // language mode /* Conversion table from filename extension to language code. To find language code, consider all entries and check each ext @@ -52,7 +52,7 @@ static const struct { const char *ext; Language lang; const char *name; } Returns word found (i.e., pointer value in table) or 0. */ const char *is_keyword(const char *word, - const char *table[], unsigned size) + const char *table[], unsigned size) { int i = 0, j = size; while (i < j) { @@ -90,7 +90,7 @@ void remove_BOM(void) if (c2 == 0xBB) { int c3 = getchar(); if (c3 == 0xBF) { - return; + return; } if (c3 != EOF) buffer[buffered++] = c3; } @@ -217,7 +217,7 @@ Language detect_lang(void) int i; for (i = 0; i < sizeof(langs)/sizeof(langs[0]); i++) if (!strcmp(p, langs[i].ext)) - return langs[i].lang; + return langs[i].lang; } return C; } diff --git a/tools/tokenizer/token_common.h b/tools/tokenizer/token_common.h index 1ea4706..2cfe81e 100644 --- a/tools/tokenizer/token_common.h +++ b/tools/tokenizer/token_common.h @@ -47,7 +47,7 @@ extern int nowarn/*= 0*/; // when 1 warnings are suppressed extern Language source/*= C*/; // language mode extern const char *is_keyword(const char *word, - const char *table[], unsigned size); + const char *table[], unsigned size); extern int get(void); extern void unget(int cc); diff --git a/tools/tokenizer/tokenize.c b/tools/tokenizer/tokenize.c index b1dc55f..559ed80 100644 --- a/tools/tokenizer/tokenize.c +++ b/tools/tokenizer/tokenize.c @@ -1,11 +1,11 @@ -/* Copyright (c) 2020, 2021 International Business Machines Corporation +/* Copyright (c) 2021, 2022 International Business Machines Corporation Prepared by: Geert Janssen - Simple C/C++ (and Java) Tokenizer. + Simple C/C++ and Java Tokenizer. For the most part assumes that the input source text is grammatically - correct C or C++ code. - (Since Java at the lexical level is very close, could in principle - also be used as Java tokenizer, albeit that not all of its keywords + correct C, C++, or Java code. + (Since Java at the lexical level is very close to C, we here sort of misuse + it as a Java tokenizer, albeit that not all of its keywords and some literal pecularities are not recognized.) Recognizes the following lexeme classes: @@ -30,10 +30,11 @@ its starting character. Line and column reflect positions in the physical line structure, not the logical one. 
 All token literals are output exactly as they appear in the source text,
- without any interpretation of escaped characters etc.
+ without any interpretation of escaped characters etc. However, the particular
+ output format will enforce certain escaping as needed.

- Moreover, skips white-space, control characters and comments and
- flags anything left over as illegal characters.
+ Moreover, white-space, control characters and comments are normally skipped
+ and anything left over is flagged as illegal characters.

 See these refs for details on the lexical definitions:
 C++14 Final Working Draft: n4140.pdf
@@ -44,7 +45,6 @@
 (A TAB is counted as a single character position.
 A CR causes a transition to a new line.)
 No trigraph sequences (??x) are recognized.
- No alternative tokens except keyword ones for certain operators.
 No universal characters (\u and \U) in an identifier.
 Raw strings with R prefix are not supported.
 No preprocessing is attempted: phrases like #include are
@@ -69,6 +69,7 @@
 1: illegal character(s) or premature EOF detected
 2: look-ahead buffer overflow
 3: output file cannot be opened
+ 4: could not (re-)allocate token buffer

 C++ Token categories as Regular Expressions:
 (\b = [01], \o = [0-7], \d = [0-9], \x = [a-fA-F0-9],
@@ -78,11 +79,11 @@
 - identifier: [_a-zA-Z][_a-zA-Z0-9]*
 - integer : 0[bB]\b('?\b])*\s?
 | 0('?\o)*\s?
- | 0[xX]\x('?\x)*\s?
- | [1-9]('?\d)*\s?
+ | 0[xX]\x('?\x)*\s?
+ | [1-9]('?\d)*\s?
 - floating : .\d('?\d)*([eE][-+]?\d('?\d)*)?[fFlL]?
 | \d('?\d)*.(\d('?\d)*)?([eE][-+]?\d('?\d)*)?[fFlL]?
- | \d('?\d)*[eE][-+]?\d('?\d)*[fFlL]?
+ | \d('?\d)*[eE][-+]?\d('?\d)*[fFlL]?
 - string : [uUL]?"([^"\\\n]|\\.|\\\n)*"
 - character : [uUL]?'([^']|\\.)'
 - operator : one of these operator and punctuation symbols:
@@ -92,851 +93,10 @@
 - preprocessor : # | ##
 */

-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include <assert.h>
-#include <unistd.h> /* getopt() */
-#include <libgen.h> /* basename() */
+#include <unistd.h> /* getopt() */
+#include <libgen.h> /* basename() */

-/* Let's introduce more parameters so that it becomes easier to
- configure the state-machines for the various tokens.
- Use a NUL character to disable the parameter, i.e., a NUL value
- means "this char is not in effect; a test for it fails".
-
- FIXME: not yet used!
-*/
-// Character that may be used to group digits in a number:
-#define CFG_DIGITS_SEP '\''
-// Extra character that may start an identifier:
-#define CFG_ID_START_EXTRA '_'
-// Extra character that may continue an identifier:
-// Maybe allows a set of characters, like also $?
-#define CFG_ID_CONT_EXTRA '_'
-// May a floating-point number start with a decimal point:
-//#define CFG_FLOAT_DOT
-
-// FIXME: make token size dynamic.
-#define MAX_TOKEN 65535 // maximum token length in chars (\0 exclusive)
-#define MAX_BUF 8 // maximum buffer size in chars
-
-// Program globals:
-static char *filename = "stdin";// current file being parsed
-static unsigned linenr = 1; // line number counted from 1
-static unsigned column = 0; // char position in line, counted from 0
-static unsigned char_count = 0; // total char/byte count
-static unsigned utf8_count = 0; // total utf-8 char count
-static char buffer[MAX_BUF]; // use buffer as multi-char lookahead.
-static unsigned buffered = 0; // number of buffered chars -static unsigned saved_col = 0; // one-place buf for last column on prev line -static unsigned illegals = 0; // count number of illegal characters -static unsigned unexpect_eof = 0; // encountered unexpected EOF -static unsigned num_files = 0; // number of files read -// keyword lookup function: -static const char *(*is_keyword)(const char *); - -// Program option settings: -static int debug = 0; // when 1 debug output to stderr -static int verbose = 0; // when 1 info output to stderr -static int nowarn = 0; // when 1 warnings are suppressed -static int hash_as_comment = 0; // when 1 treat # as line comment -static int start_token = 0; // when 1 start filename pseudo-token -static int newline_token = 0; // when 1 output newline pseudo-token -static int continuous_files = 0;// when 1 do not reset after each file -static enum { C, CPP, JAVA } source = CPP; - -/* No longer using perfect hash function but simple binary search. */ - -/* C11 n1570.pdf 6.4.1 (44) - C17 n2176.pdf 6.4.1 (A.1.2) (44) -*/ -static const char *C_keywords[] = { - "_Alignas", "_Alignof", "_Atomic", "_Bool", "_Complex", - "_Generic", "_Imaginary", "_Noreturn", "_Static_assert", - "_Thread_local", - - "auto", "break", "case", "char", "const", - "continue", "default", "do", "double", "else", - "enum", "extern", "float", "for", "goto", - "if", "inline", "int", "long", "register", - "restrict", "return", "short", "signed", "sizeof", - "static", "struct", "switch", "typedef", "union", - "unsigned", "void", "volatile", "while" -}; - -#if 0 -/* C++ 2014 n4296.pdf 2.11 (84) */ -static const char *CPP_keywords[] = { - "alignas", "alignof", "and", "and_eq", "asm", - "auto", "bitand", "bitor", "bool", "break", - "case", "catch", "char", "char16_t", "char32_t", - "class", "compl", "const", "const_cast", "constexpr", - "continue", "decltype", "default", "delete", "do", - "double", "dynamic_cast", "else", "enum", "explicit", - "export", "extern", "false", "float", "for", - "friend", "goto", "if", "inline", "int", - "long", "mutable", "namespace", "new", "noexcept", - "not", "not_eq", "nullptr", "operator", "or", - "or_eq" "private", "protected", "public", "register", - "reinterpret_cast", "return", "short", "signed", "sizeof", - "static", "static_assert", "static_cast", "struct", "switch", - "template", "this", "thread_local", "throw", "true", - "try", "typedef", "typeid", "typename", "union", - "unsigned", "using", "virtual", "void", "volatile", - "wchar_t", "while", "xor", "xor_eq" -}; -#endif - -/* C++23 n4885.pdf 5.11 (92) */ -static const char *CPP_keywords[] = { - "alignas", "alignof", "and", "and_eq", "asm", - "auto", "bitand", "bitor", "bool", "break", - "case", "catch", "char", "char16_t", "char32_t", - "char8_t", "class", "co_await", "co_return", "co_yield", - "compl", "concept", "const", "const_cast", "consteval", - "constexpr", "constinit", "continue", "decltype", "default", - "delete", "do", "double", "dynamic_cast", "else", - "enum", "explicit", "export", "extern", "false", - "float", "for", "friend", "goto", "if", - "inline", "int", "long", "mutable", "namespace", - "new", "noexcept", "not", "not_eq", "nullptr", - "operator", "or", "or_eq" "private", "protected", - "public", "register", "reinterpret_cast", "requires","return", - "short", "signed", "sizeof", "static", "static_assert", - "static_cast", "struct", "switch", "template", "this", - "thread_local", "throw", "true", "try", "typedef", - "typeid", "typename", "union", "unsigned", "using", - "virtual", 
"void", "volatile", "wchar_t", "while", - "xor", "xor_eq" -}; - -/* Java SE 8 (50) (false, true, null are literals) */ -/* https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.9 */ -static const char *Java_keywords[] = { - "abstract", "assert", "boolean", "break", "byte", "case", - "catch", "char", "class", "const", "continue", "default", - "do", "double", "else", "enum", "extends", "final", - "finally", "float", "for", "goto", "if", "implements", - "import", "instanceof", "int", "interface", "long", "native", - "new", "package", "private", "protected", "public", "return", - "short", "static", "strictfp","super", "switch", "synchronized", - "this", "throw", "throws", "transient", "try", "void", - "volatile", "while" -}; - -#define num_keywords(lang) sizeof(lang##_keywords)/sizeof(lang##_keywords[0]); - -/* Generic binary search lookup in some keyword table. - `word' to be searched must be NUL-terminated C string. - `table' is array of const char * of `size' sorted alphabetically. - Returns word found (i.e., pointer value in table) or 0. -*/ -#define lang_is_keyword(lang) \ - static const char *lang##_is_keyword(const char *word) \ - { \ - int i = 0, j = num_keywords(lang); \ - while (i < j) { \ - int k = (i + j) >> 1 /* / 2 */; \ - const char *kw = lang##_keywords[k]; \ - int cmp = strcmp(word, kw); \ - if (!cmp) \ - return kw; \ - if (cmp < 0) j = k; else i = k + 1; \ - } \ - return 0; \ - } - -/* Define individual is_keyword functions per language: */ -/* C_is_keyword */ -lang_is_keyword(C) -/* CPP_is_keyword */ -lang_is_keyword(CPP) -/* Java_is_keyword */ -lang_is_keyword(Java) - -// Append char cc to token; discard when no more room: -#define token_add(cc) \ - do { if (len < MAX_TOKEN) token[len++] = (cc); } while(0) - -#define utf8_start(cc) (((cc)&0xC0)!=0x80) -#define utf8_follow(cc) (((cc)&0xC0)==0x80) - -#define utf8_len(cc) \ - (((cc)&0xF8)==0xF0 ? 4 : ((cc)&0xF0)==0xE0 ? 3 : ((cc)&0xE0)==0xC0 ? 2 : 1) - -/* Let's assume UTF-8 encoding. - https://www.cprogramming.com/tutorial/unicode.html - https://opensource.apple.com/source/tidy/tidy-2.2/tidy/src/utf8.c.auto.html -*/ - -void unget(int cc) -{ - if (cc == EOF) return; - if (buffered < MAX_BUF) { - if (cc == '\n') { - linenr--; - // column was 0 right after getting the \n - // hopefully there are no multiple ungets of \n - column = saved_col; - } - else - column--; - buffer[buffered++] = cc; - } - else { - fprintf(stderr, "(F): Lookahead buffer overflow (MAX=%u).\n", MAX_BUF); - exit(2); - } -} - -// Act like getchar(). -// Mind linenr,column apply to physical lines not logical ones. -int get(void) -{ - int cc; - - restart: - // Get the next character: - if (buffered) // chars available in lookahead buffer - cc = buffer[--buffered]; // never EOF - // cc might be \ and followed by fresh \n - // Note: never can have buffered line continuation, i.e., \ \n. - else { // must read fresh char - cc = getchar(); - if (cc == EOF) return EOF; - // Count all chars, even the \ of a line continuation: - char_count++; - if (utf8_start(cc)) utf8_count++; - } - - // Treat Mac line endings ('\r') as regular newlines: - if (cc == '\n' || cc == '\r') { - linenr++; - saved_col = column; - column = 0; - return '\n'; - } - - // Deal with \ line continuations! Must look ahead. - if (cc == '\\') { - // Must look ahead; mind next char might be buffered! 
- if (buffered) - // Never can have \n for next char: - assert(buffer[buffered-1] != '\n'); - else { - // Must get fresh character: - int nc = getchar(); // do not count yet; maybe must unget - - // Maybe \r \n combination? - if (nc == '\r') { - // Look ahead for \n: - int c2 = getchar(); // do not count yet; maybe must unget - if (c2 == '\n') { - // Skip \r but count it: - char_count++; - utf8_count++; - nc = '\n'; - } - else { - unget(c2); - // nc == '\r' - } - } - - if (nc == '\n') { // 1 logical line: discard \\n combo: - char_count++; // counts the newline - linenr++; // on next physical line - // never unget a continuation - //saved_col = column; - column = 0; - - // Still need to get a character. - // Could again start a line continuation! - goto restart; - } - // Mind nc not \n but maybe \ or \r, then goes to buffer. - unget(nc); - } - // cc == '\\' a regular backslash - } - column++; - return cc; -} - -/* Tokenization of C++ programming language source text. - Recognizes: - - identifier - - reserved word/keyword - - binary, octal, decimal, hexadecimal and floating-point numbers - - double-quoted string literal - - single-quoted character literal - - all single, double, and triple operator and punctuation symbols - - the preprocessor tokens # and ## - Skips white-space, control characters and comments and flags anything - left over as illegal characters. - - (In the order of 20 tests per single character worst-case.) - - Returns 0 upon EOF or error. -*/ -int tokenize(char *token, const char **type, unsigned *line, unsigned *col) -{ - unsigned len; - int cc; - *type = ""; - - do { // infinite loop; after token recognized breaks out. - len = 0; - cc = get(); - - restart: - // cc already read. - - /*** WHITE-SPACE ***/ - - // Skip (abutted) space and control chars and comments: - // [ \t\f\v\n] - // while (cc <= ' ' && cc != EOF) - while (isspace(cc) && cc != EOF && cc != '\n') - cc = get(); - if (cc == EOF) - return 0; - if (cc == '\n') { - if (newline_token) { - // token is empty. - *line = linenr-1; - *col = saved_col; - *type = "newline"; - break; - } - cc = get(); - goto restart; - } - // !isspace(cc) && cc != EOF - - /*** OPTIONAL # LINE COMMENT (to ignore preprocessor statements) ***/ - // Java: no preprocessor directives. - - if (cc == '#' && hash_as_comment) { - // Skip till end-of-line (\n exclusive): - while ((cc = get()) != '\n' && cc != EOF) - ; - // cc == '\n' || cc == EOF - goto restart; - } - - /*** LINE COMMENT AND BLOCK COMMENT (C/C++/Java) ***/ - - if (cc == '/') { - cc = get(); - if (cc == '/') { - // Skip till end-of-line (\n exclusive): - while ((cc = get()) != '\n' && cc != EOF) - ; - // cc == '\n' || cc == EOF - goto restart; - } - - if (cc == '*') { - // Remember start position: - unsigned lin = linenr; - - // Skip till */ inclusive: - int nc = get(); // if EOF next get will be EOF too - do { - cc = nc; - nc = get(); - if (nc == EOF) { // Error! - fprintf(stderr, - "(E): [%s:%u] Unexpected end-of-file in /* comment.\n", - filename, lin); - unexpect_eof++; - return 0; - } - } while (cc != '*' || nc != '/'); - // cc == '*' && nc == '/' - cc = get(); - goto restart; - } - // seen / but not // or /* - unget(cc); // char after / - cc = '/'; // restore / - } - - // Start collecting a token. - // Token should finish with cc being last char of token! 
- *line = linenr; - *col = column-1; // 1 char lookahead - - /*** CHAR and STRING PREFIX (C/C++) ***/ - - // Allow u,U,L prefix for string and char - // FIXME: allow u8 as prefix for string - if (cc == 'L' || cc == 'u' || cc == 'U') { - token[len++] = cc; - cc = get(); - if (cc == '"') - goto string_token; - if (cc == '\'') - goto char_token; - // u,U,L will be interpreted as (start of) identifier. - unget(cc); // char after u,U,L - cc = token[--len]; // restore original and remove from token - } - - /*** IDENTIFIER (C/C++/Java) and KEYWORD (C/C++) ***/ - // Java: false, true, null are literals - // FIXME: Flag to allow .letter as part of identifier? - // (compound identifier) - - // Simplistic solution to allowing Unicode: allow any char >= 128 without - // actual checking for UTF-8. - if (isalpha(cc) || cc == '_' || cc == '$' || cc & 0x80) { - // First char always fits. - token[len++] = cc; - while (isalnum(cc = get()) || cc == '_' || cc == '$' || - cc != EOF && (cc & 0x80)) - token_add(cc); - unget(cc); - token[len] = '\0'; - *type = is_keyword(token) ? "keyword" : "identifier"; - break; - } - - /*** INTEGER and FLOATING ***/ - // Java: uses _ in numbers as insignificant separator - // Java: decimal suffix: [lL], float suffix: [fFdD] - // Java: allows hex float - -#if 0 - // Examples: - int bin_num = 0B010101u; - int oct_num = 01234567L; - int hex_num = 0x123ABCLL; - int dec_num = 12345678; - - float flt_num1 = 077.; - float flt_num2 = 077.987; - float flt_num3 = 77.; - float flt_num4 = .77; -#endif - - // . digits ... floating - if (cc == '.') { - // Look ahead for a digit: - int nc; - if (isdigit(nc = get())) { - unget(nc); - goto start_fraction; - } - unget(nc); - // Could go immediately to operator: goto seen_period - } - - if (isdigit(cc)) { // binary, octal, decimal, or hexadecimal literal - // Types of integer literals: - enum { - BIN, OCT, DEC, HEX - } int_lit = cc == '0' ? OCT : DEC; - - // Lookahead: - int nc = get(); - if (int_lit == OCT && (nc == 'x' || nc == 'X')) { - int_lit = HEX; - token_add(cc); // the 0 - cc = nc; // the x or X - } - else - if (int_lit == OCT && (nc == 'b' || nc == 'B')) { - int_lit = BIN; - token_add(cc); // the 0 - cc = nc; // the b or B - } - else - unget(nc); // isdigit(cc) - - do { - token_add(cc); - cc = get(); - - // Allow for ' between `digits': - if (cc == '\'') { - // Keep the ' in the token for now: - token_add(cc); - int nc = get(); - if (isdigit(nc) || int_lit == HEX && isxdigit(nc)) - cc = nc; - else { // Error! - fprintf(stderr, - "(E): [%s:%u] C++14 only allows ' between digits.\n", - filename, linenr); - // what to do? - } - } - } while (isdigit(cc) || int_lit == HEX && isxdigit(cc)); - // !is[x]digit(cc) - - // FIXME: allow hex floats in C - if (int_lit == OCT || int_lit == DEC) { - int floating = 0; - // Seen digits-sequence. Maybe followed by . or e or E? - if (cc == '.') { // fractional part - start_fraction: - floating = 1; - token_add(cc); - // digits? FIXME: again allow ' between digits - while (isdigit(cc = get())) - token_add(cc); - // !isdigit(cc) - } - // cc != '.' 
|| !isdigit(cc) - if (cc == 'e' || cc == 'E') { // exponent - floating = 1; - token_add(cc); - if ((cc = get()) == '-' || cc == '+') { - token_add(cc); - cc = get(); - } - // FIXME: no check for at least 1 digit - // FIXME: again allow ' between digits - while (isdigit(cc)) { - token_add(cc); - cc = get(); - } - // !isdigit(cc) - } - if (floating) { - if (cc == 'f' || cc == 'F' || cc == 'l' || cc == 'L') - token_add(cc); - else - unget(cc); - *type = "floating"; - break; - } - } - - // optional integer suffix: l, ll, lu, llu, u, ul, ull, any case - if (cc == 'l' || cc == 'L') { - token_add(cc); - // maybe another l - cc = get(); - if (cc == 'l' || cc == 'L') { - token_add(cc); - // Here: token is digits[lL][lL] - cc = get(); - } - // maybe a u - if (cc == 'u' || cc == 'U') - // Here: token is digits[lL][lL]?[u|U] - token_add(cc); - else - unget(cc); - } - else if (cc == 'u' || cc == 'U') { - token_add(cc); - // maybe an l - cc = get(); - if (cc == 'l' || cc == 'L') { - token_add(cc); - // Here: token is digits[uU][lL] - cc = get(); - } - // maybe another l - if (cc == 'l' || cc == 'L') - // Here: token is digits[uU][lL]?[lL] - token_add(cc); - else - unget(cc); - } - else - unget(cc); - *type = "integer"; - break; - } - - /*** STRING (C/C++/Java) ***/ - - if (cc == '"') { - string_token: - // First char always fits. - token[len++] = cc; - // Remember start position: - unsigned lin = linenr; - // Watch out for escaped " inside string. - cc = get(); - while (cc != '"') { - if (cc == EOF) { // Error! - fprintf(stderr, - "(E): [%s:%u] Unexpected end-of-file in string literal.\n", - filename, lin); - unexpect_eof++; - return 0; - } - token_add(cc); - int nc = get(); - - if (cc == '\\') { - // FIXME: No check on valid escape char! - // ' " ? \ a b f n r t v - token_add(nc); - cc = get(); - } - else - cc = nc; - } - // cc == '"' - token_add(cc); - *type = "string"; - break; - } - - /*** CHARACTER (C/C++/Java) ***/ - - if (cc == '\'') { - char_token: - // First char always fits. - token[len++] = cc; - // Watch out for escaped ' inside char. - cc = get(); - // FIXME: Cannot have empty char! - while (cc != '\'') { - if (cc == EOF) { // Error! - fprintf(stderr, - "(E): [%s:%u] Unexpected end-of-file in char literal.\n", - filename, linenr); - unexpect_eof++; - return 0; - } - token_add(cc); - int nc = get(); - if (cc == '\\') { - token_add(nc); - cc = get(); - // FIXME: No check on valid escape char! - // ' " ? \ a b f n r t v 0[d[d]] xh* - } - else - cc = nc; - } - // cc == '\'' - token_add(cc); - *type = "character"; - break; - } - - /*** OPERATOR (and PUNCTUATION) (C/C++/Java) ***/ - - // Operator and punctuation symbols. Longest match. - - /* Operator or punctuator Alternative representation - { <% - } %> - [ <: - ] :> - # %: (not supported here) - ## %:%: (not supported here) - */ - - // Single char operator or punctuator (C/C++/Java) - // { } [ ] ( ) ; : ? . ~ ! + - * / % ^ = & | < > , - // Double char operator or punctuator (C/C++) - // <: :> <% %> - // Double char operator or punctuator (C/C++/Java) - // += -= *= /= %= ^= &= |= == != <= >= && || << >> ++ -- -> - // Double char operator or punctuator (C++/Java) - // :: - // Double char operator or punctuator (C++) - // .* - // Triple char operator or punctuator (C/C++/Java) - // ... <<= >>= - // Triple char operator or punctuator (C++) - // ->* <=> - // Java: @ >>> >>>= - - //seen_period: - - // First char always fits. 
- token[len++] = cc;
- token[len] = '\0';
- //token=[cc,0];len=1
-
- if (strstr("{}[]();?~,@", token)) { // allow @ for Java
- // Single char operator/punctuator.
- *type = "operator";
- break;
- }
-
- if (strstr("<:.-+*/%^&|=!>", token)) { // single or start of double/triple
- // Check second char:
- int c2 = get();
- if (c2 != EOF) {
- token[len++] = c2;
- //token=[cc,c2];len=2
-
- // Check third char:
- int c3 = get();
- if (c3 != EOF) {
- token[len++] = c3;
- token[len] = '\0';
- //token=[cc,c2,c3,0];len=3
- if (!strcmp(">>>", token)) { // allow >>> for Java
- //token=[>,>,>,0];len=3
- // Look-ahead for =:
- int c4 = get();
- if (c4 == '=') // >>>= for Java
- token[len++] = c4;
- //token=[>,>,>,=];len=4
- else
- unget(c4);
- //token=[>,>,>,0];len=3
- *type = "operator";
- break;
- }
- //token=[cc,c2,c3,0];len=3
-
- if (!strcmp("...", token) ||
- !strcmp("<=>", token) ||
- !strcmp("->*", token) ||
- !strcmp("<<=", token)) {
- // Triple char operator/punctuator.
- *type = "operator";
- break;
- }
-
- // Maybe double char. Undo the c3 token extension:
- token[--len] = '\0';
- //token=[cc,c2,0];len=2
- }
- else
- token[len] = '\0';
- //token=[cc,c2,0];len=2
- unget(c3);
-
- // Maybe double char.
- static const char * const ops2[] = {
- "<:", "<%", "<=", "<<", ":>",
- "::", ".*", "->", "-=", "--",
- "+=", "++", "*=", "/=", "%>",
- "%=", "^=", "&=", "&&", "|=",
- "||", "==", "!=", ">=", ">>"
- };
- unsigned size = sizeof(ops2) / sizeof(ops2[0]);
- unsigned i;
- for (i = 0; i < size; i++)
- if (!strcmp(ops2[i], token))
- break;
- if (i < size) {
- *type = "operator";
- break;
- }
- //token=[cc,c2,0];len=2
-
- // Must be single char. Undo the c2 token extension:
- token[--len] = '\0';
- //token=[cc,0];len=1
- }
- //else token=[cc,0];len=1
-
- // Must be single char.
- unget(c2);
- *type = "operator";
- break;
- }
- //token=[cc,0];len=1
-
- /*** PREPROCESSOR (C/C++) ***/
-
- if (cc == '#') {
- int nc = get();
- if (nc != '#')
- unget(nc);
- else
- token[len++] = nc;
- *type = "preprocessor";
- break;
- }
-
- // What is left here? Illegal chars!
- if (!nowarn)
- // Mind non-printing chars!
- fprintf(stderr,
- "(W): [%s:%u] Illegal character `%s%c` (0x%02x) skipped.\n",
- filename, linenr, cc<32?"CTRL-":"", cc<32?cc+64:cc, cc);
- // Count them:
- illegals++;
-
- } while (1);
- // len <= MAX_TOKEN
- token[len] = '\0';
- return 1;
-}
-
-// Escape token for output as CSV string.
-void CSV_escape(FILE *out, const char *token)
-{
- const char *p;
- // start CSV string:
- fputc('"', out);
- for (p = token; *p; p++) {
- if (*p == '"')
- fputc('"', out);
- fputc(*p, out);
- }
- // end CSV string:
- fputc('"', out);
-}
-
-// Escape token for output as JSON string.
-void JSON_escape(FILE *out, const char *token)
-{
- // C/C++ has escapes: \' \" \? \a \b \f \n \r \t \v \x \0.
- // To preserve, simply escape the escape and all ":
- const char *p;
- for (p = token; *p; p++) {
- if (*p == '\\' || *p == '"')
- fputc('\\', out);
- fputc(*p, out);
- }
-}
-
-// Escape token for output as XML text.
-void XML_escape(FILE *out, const char *token)
-{
-#if 1
- // Alternative: escape every <, >, and &:
- const char *p;
- for (p = token; *p; p++) {
- if (*p == '<')
- fputs("&lt;", out);
- else
- if (*p == '>')
- fputs("&gt;", out);
- else
- if (*p == '&')
- fputs("&amp;", out);
- else
- fputc(*p, out);
- }
-#else
- // Use CDATA construct for escaping.
- // Impossible to escape ]]> occurring in token!
- // Must chop up the substring ]]> in ]] and >.
- const char *p;
- const char *q = token;
- // "abc]]>hello" => <![CDATA[abc]]]]><![CDATA[>hello]]>
- // "]]>]]>" => <![CDATA[]]]]><![CDATA[>]]]]><![CDATA[>]]>
- while ((p = strstr(q, "]]>"))) {
- int len = p - q; // always > 0
- fprintf(out, "<![CDATA[%.*s]]>", len+2, q);
- q = p+2; // q start at >...
- }
- if (q < token+strlen(token))
- fprintf(out, "<![CDATA[%s]]>", q);
-#endif
-}
+#include "libtoken.h"

 int main(int argc, char *argv[])
 {
@@ -944,19 +104,26 @@
 extern int opterr;
 extern int optind;
 int option;
- char const *opt_str = "1acdhjl:m:no:rsvw";
+ char const *opt_str = "1acdhjkl:m:nNo:rsvwW";
 char usage_str[80];
- char token[MAX_TOKEN+1]; /* leave room for a terminating NUL */
- const char *type;
+ const char *token;
+ enum TokenClass type;
 unsigned line;
 unsigned col;
+ unsigned pos;
+ unsigned token_len;
+ unsigned num_files = 0; // number of files read
+ int start_token = 0; // when 1 start filename pseudo-token
+ int continuous_files = 0; // when 1 do not reset after each file
 char *outfile = 0;
 enum { PLAIN, CSV, JSON, JSONL, XML, RAW } mode = PLAIN;
 int first_time = 1;
+ Language source;
 int explicit_source = 0;
 int append = 0;
+ int suppress_newline = 0;

 sprintf(usage_str, "usage: %%s [ -%s ] [ FILES ]\n", opt_str);
@@ -984,7 +151,7 @@
 fputs(
"A tokenizer for C/C++ (and Java) source code with output in 6 formats.\n"
"Recognizes the following token classes: keyword, identifier, integer,\n"
-"floating, string, character, operator, and preprocessor.\n\n", stdout);
+"floating, string, character, operator, and preprocessor.\n\n", stderr);
 fprintf(stderr, usage_str, basename(argv[0]));
 fputs(
"\nCommand line options are:\n"
@@ -993,34 +160,33 @@
"-d : print debug info to stderr; implies -v.\n"
"-h : print just this text to stderr and stop.\n"
"-j : assume input is Java (deprecated: use -l Java or .java).\n"
+"-k : output line and block comments as tokens.\n"
"-l : specify language explicitly (C, C++, Java).\n"
"-m : output mode either plain (default), csv, json, jsonl, xml, or raw.\n"
"-n : output newlines as a special pseudo token.\n"
+"-N : output line continuations as a special pseudo token.\n"
"-o : write output to this file (instead of stdout).\n"
+"-r : suppress newline after each token in raw mode.\n"
"-s : enable a special start token specifying the filename.\n"
"-1 : treat all filename arguments as a continuous single input.\n"
"-v : print action summary to stderr.\n"
-"-w : suppress all warning messages.\n",
+"-w : suppress all warning messages.\n"
+"-W : output adjacent white-space as a token.\n",
 stderr);
 return 0;

 case 'j':
- source = JAVA;
+ source = set_or_detect_lang("Java");
 explicit_source = 1;
 break;

+ case 'k':
+ comment_token = 1;
+ break;
+
 case 'l':
- if (!strcmp(optarg, "C"))
- source = C;
- else if (!strcmp(optarg, "C++"))
- source = CPP;
- else if (!strcmp(optarg, "Java"))
- source = JAVA;
- else {
- if (!nowarn)
- fprintf(stderr, "(W): Unknown source %s (assuming C++).\n", optarg);
- }
- explicit_source = 1;
+ source = set_or_detect_lang(optarg);
+ explicit_source = 1;
 break;

 case 'm':
@@ -1037,7 +203,7 @@
 else if (!strcmp(optarg, "raw"))
 mode = RAW;
 else {
- if (!nowarn)
+ if (!nowarn)
 fprintf(stderr, "(W): Invalid mode %s (using plain).\n", optarg);
 mode = PLAIN;
 }
@@ -1047,10 +213,18 @@
 newline_token = 1;
 break;

+ case 'N':
+ continuation_token = 1;
+ break;
+
 case 'o':
 outfile = optarg;
 break;

+ case 'r':
+ suppress_newline = 1;
+ break;
+
 case 's':
 start_token = 1;
 break;
@@ -1063,6 +237,10 @@
 nowarn = 1;
 break;

+ case 'W':
+ whitespace_token = 1;
+ break;
break; + case '?': default: fputs("(F): unknown option. Stop.\n", stderr); @@ -1088,34 +266,14 @@ fputs( fprintf(stderr, "(W): Cannot read file %s.\n", filename); continue; } - if (!explicit_source) { - // Determine language from extension: - int len = strlen(filename); - if (len > 2 && !strcmp(filename+len-2, ".c")) - source = C; - else if (len > 4 && !strcmp(filename+len-4, ".cpp")) - source = CPP; - else if (len > 5 && !strcmp(filename+len-5, ".java")) - source = JAVA; - } + + if (!explicit_source) + source = set_or_detect_lang(0); doit: if (verbose) fprintf(stderr, "(I): Processing file %s...\n", filename); num_files++; - // Determine which keyword lookup function to use: - switch (source) { - case C: - is_keyword = C_is_keyword; - break; - case CPP: - is_keyword = CPP_is_keyword; - break; - case JAVA: - is_keyword = Java_is_keyword; - break; - } - // Header: switch (mode) { case RAW: @@ -1127,59 +285,65 @@ fputs( break; case CSV: if (!continuous_files || num_files == 1) - fputs("line,column,class,token\n", stdout); + fputs("line,column,class,token\n", stdout); if (start_token) fprintf(stdout, "0,0,filename,\"%s\"\n", filename); break; case JSON: case JSONL: if (!continuous_files || num_files == 1) { - if (mode == JSON) fputs("[\n", stdout); + if (mode == JSON) fputs("[\n", stdout); } else { - if (mode == JSON) fputc(',', stdout); - fputc('\n', stdout); - first_time = 1; + if (mode == JSON) fputc(',', stdout); + fputc('\n', stdout); + first_time = 1; } if (start_token) { fprintf(stdout, "{ \"line\": 0, \"column\": 0, " - "\"class\": \"filename\", \"token\": \"%s\" }", - filename); - first_time = 0; + "\"class\": \"filename\", \"length\": %d, \"token\": \"%s\" }", + strlen(filename), filename); + first_time = 0; } break; case XML: if (!continuous_files || num_files == 1) { - fputs("\n", stdout); - // standalone="yes" - fputs("\n", stdout); + fputs("\n", stdout); + // standalone='yes' + fputs("\n", stdout); } if (start_token) { - fprintf(stdout, ""); - XML_escape(stdout, filename); + fprintf(stdout, + "", + strlen(filename)); + XML_escape(stdout, filename); fputs("\n", stdout); } break; } - while (tokenize(token, &type, &line, &col)) { + while ((token_len = C_tokenize_int(&token, &type, &line, &col, &pos))) { switch (mode) { case RAW: fputs(token, stdout); - fputc('\n', stdout); - break; + if (!suppress_newline) fputc('\n', stdout); + break; case PLAIN: - fprintf(stdout, "(%4u,%3u) %s: %s\n", line, col, type, token); + fprintf(stdout, "(%4u,%3u;%6u:%3u) %s: %s\n", + line, col, pos, token_len, token_class[type], token); break; case CSV: // Escape , " in token // csvkit treats . as null fields even as ".". - fprintf(stdout, "%u,%u,%s,", line, col, type); - if (!strcmp(type, "string") || + fprintf(stdout, "%u,%u,%s,", line, col, token_class[type]); + if (type == STRING || // Do we need this too? Yes! 
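+           // CSV_escape doubles any embedded double quote and wraps the
+           // whole field in double quotes, the usual CSV quoting convention.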
-	    !strcmp(type, "character") && strchr(token, '"') ||
-	    !strcmp(type, "character") && strchr(token, ','))
+            type == CHARACTER && (strchr(token, '"') || strchr(token, ',')) ||
+            type == WHITESPACE && strchr(token, '\n') ||
+            type == NEWLINE ||
+            type == CONTINUATION ||
+            comment_token && (type == LINE_COMMENT || type == BLOCK_COMMENT))
          CSV_escape(stdout, token);
        else if (!strcmp(token, ","))
          fputs("\",\"", stdout);
@@ -1194,24 +358,28 @@ fputs(
        else {
          if (mode == JSON) fputc(',', stdout);
          fputc('\n', stdout);
-	}
+        }
        fprintf(stdout, "{ \"line\": %u, \"column\": %u, "
-	       "\"class\": \"%s\", \"token\": \"",
-	       line, col, type);
+                "\"class\": \"%s\", \"length\": %u, \"token\": \"",
+                line, col, token_class[type], token_len);
        // token value is always a JSON string.
-	if (!strcmp(type, "string") || !strcmp(type, "character"))
+        if (type == STRING || type == CHARACTER ||
+            type == NEWLINE || type == WHITESPACE ||
+            type == CONTINUATION)
          JSON_escape(stdout, token);
        else
          fputs(token, stdout);
        fputs("\" }", stdout);
        break;
      case XML:
-	fprintf(stdout, "<token line=\"%u\" column=\"%u\" class=\"%s\">",
-		line, col, type);
-	if (!strcmp(type, "string")
-	    || !strcmp(type, "character")
-	    || !strcmp(type, "operator"))
+        fprintf(stdout, "<token line='%u' column='%u' class='%s' length='%u'>",
+                line, col, token_class[type], token_len);
+        if (type == STRING ||
+            type == CHARACTER ||
+            type == OPERATOR ||
+            comment_token && (type == LINE_COMMENT ||
+                              type == BLOCK_COMMENT))
          XML_escape(stdout, token);
        else
          fputs(token, stdout);
@@ -1224,25 +392,25 @@ fputs(
     // Trailer:
     switch (mode) {
     case RAW:
-      break;
+      break;
     case PLAIN:
-      break;
+      break;
     case CSV:
-      break;
+      break;
     case JSON:
-      fputs("\n]", stdout);
-      /*FALL THROUGH*/
+      fputs("\n]", stdout);
+      /*FALL THROUGH*/
     case JSONL:
-      fputc('\n', stdout);
-      break;
+      fputc('\n', stdout);
+      break;
     case XML:
-      fputs("</tokens>\n", stdout);
-      break;
+      fputs("</tokens>\n", stdout);
+      break;
     }
 
     if (verbose)
-      fprintf (stderr, "(I): %u bytes, %u UTF-8 encoded chars.\n",
-	       char_count, utf8_count);
+      fprintf (stderr, "(I): %u bytes, %u UTF-8 encoded chars.\n",
               char_count, utf8_count);
 
     // Reset globals:
     char_count = 0;
@@ -1277,7 +445,7 @@ fputs(
     if (verbose)
       fprintf(stderr,
              "(I): %u bytes, %u (UTF-8 encoded) unicode characters.\n",
-	      char_count, utf8_count);
+              char_count, utf8_count);
   }
 
   if (num_files > 1 && verbose)
diff --git a/tools/tokenizer/tokenize.py b/tools/tokenizer/tokenize.py
new file mode 100755
index 0000000..f5a162c
--- /dev/null
+++ b/tools/tokenizer/tokenize.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+
+# Copyright IBM Corporation 2021, 2022
+# Written by Geert Janssen
+
+# Simple ctypes-based Python wrapper of libtoken.so
+# See ctypes documentation: https://docs.python.org/3/library/ctypes.html
+# This Python script works with Python versions 2.6, 2.7, and 3.5.
+
+import sys
+from ctypes import *
+
+# Load the shared object (expects it in current directory):
+libtoken = CDLL('./libtoken.so')
+
+# Define the exported function signatures:
+libtoken.C_tokenize.argtypes = (POINTER(c_char_p),
+                                POINTER(c_char_p),
+                                POINTER(c_uint),
+                                POINTER(c_uint),
+                                POINTER(c_uint))
+libtoken.open_as_stdin.argtypes = (c_char_p,)
+
+# 'Declare' variables to receive the C function's output arguments:
+_token = c_char_p()
+_kind = c_char_p()
+_linenr = c_uint()
+_column = c_uint()
+_pos = c_uint()
+
+# Token generator:
+def token():
+    global _token, _kind, _linenr, _column, _pos
+
+    # C_tokenize returns 0 upon end-of-file.
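+    # Each successful call yields one (line, column, class, text) tuple;
+    # e.g. for the input "int x;" the first yielded tuple would be
+    # (1, 0, 'keyword', 'int').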
+    while int(libtoken.C_tokenize(byref(_token), byref(_kind), byref(_linenr),
+                                  byref(_column), byref(_pos))):
+        # Turn ctypes into real Python values:
+        lin = _linenr.value
+        col = _column.value
+        pos = _pos.value # not used for now
+        clas = _kind.value.decode()
+        text = _token.value.decode()
+        yield (lin,col,clas,text)
+
+if len(sys.argv) == 1:
+    for tok in token():
+        print('[%u:%u] %s, %s' % tok)
+else:
+    for file in sys.argv[1:]:
+        # Set C filename global and reopen as stdin:
+        b_str = file.encode('utf-8') # need handle b_str to retain as C pointer
+        libtoken.open_as_stdin(b_str)
+
+        # Access C globals:
+        filename = c_char_p.in_dll(libtoken, 'filename')
+        print('[0:0] filename, %s' % filename.value.decode())
+
+        for tok in token():
+            print('[%u:%u] %s, %s' % tok)
+
+        # Reset globals:
+        c_uint.in_dll(libtoken, 'linenr').value = 1
+        c_uint.in_dll(libtoken, 'column').value = 0
+        c_uint.in_dll(libtoken, 'char_count').value = 0
+        c_uint.in_dll(libtoken, 'utf8_count').value = 0
diff --git a/tools/tokenizer/tokml-test.sh b/tools/tokenizer/tokml-test.sh
new file mode 100755
index 0000000..445647f
--- /dev/null
+++ b/tools/tokenizer/tokml-test.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+# Showcasing the use of tokml and xidel.
+# Works for any Java, C, and C++ source file.
+# Extracts certain tokens and statistics of interest.
+
+# Show command and execute it.
+run() {
+  echo "$ $1"
+  eval "$1"
+  [ $? == 0 ] || die "got non-0 program exit code"
+}
+
+die() {
+  echo "(E) ${@}" 1>&2
+  exit 1
+}
+
+# We need an input file:
+[ -z "$1" ] && die "expect a C, C++, or Java file as argument"
+
+# Quick check for availability of tokml and xidel:
+command -v tokml &>/dev/null
+[ $? == 0 ] || die "tokml not available; please install"
+command -v xidel &>/dev/null
+[ $? == 0 ] || die "xidel not available; please install"
+
+# Create temp file:
+XML="$(mktemp /tmp/${1%.*}-XXX.xml)"
+# Ensure clean up when done:
+trap "/bin/rm -f $XML" EXIT
+echo \# Run tokml to obtain the .xml output file:
+run "tokml $1 > $XML"
+
+echo
+echo \# Count the number of tokens in the arg source file:
+run "xidel -s -e 'count(//source/*)' $XML"
+
+echo
+echo \# Show all unique identifiers \(sorted\):
+run "xidel -s -e '//identifier' $XML | sort | uniq"
+
+echo
+echo \# Show the identifier occurrences of length greater than 10:
+run "xidel -s -e '//identifier[@len>10]' $XML"
+
+echo
+echo \# How many block_comment occurrences are there?
+run "xidel -s -e 'count(//block_comment)' $XML"
+
+echo
+echo \# Which tokens immediately follow the keyword static?
+run "xidel -s -e '//keyword[text()=\"static\"]/following-sibling::*[1]' $XML | sort | uniq"
+
+echo
+echo \# What is the value of the first integer number?
+run "xidel -s -e '//integer[1]' $XML"
+
+echo
+echo \# Convert the XML back to the original source and show 20 lines:
+run "xidel -s -e 'source' $XML | head -n20"
diff --git a/tools/tokenizer/tokml.c b/tools/tokenizer/tokml.c
new file mode 100644
index 0000000..3fdc6aa
--- /dev/null
+++ b/tools/tokenizer/tokml.c
@@ -0,0 +1,223 @@
+/* Copyright (c) 2021, 2022 International Business Machines Corporation
+   Prepared by: Geert Janssen
+
+   Tokenizer for C, C++ and Java with output as annotated XML,
+   much like srcML annotates a parse tree. Any white-space (including
+   newlines) is output as is, without any special XML element.
+   All other tokens (even comments) are output as a stream of XML
+   elements with tag names indicating the type/kind/class of the
+   token, whose text is provided as the enclosed text node:
+
+     <@kind@ line='' col='' len=''>...</@kind@>
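+
+   For example, a line such as "int x;" would be annotated roughly as
+   (illustration only, not verbatim program output):
+
+     <keyword line='1' col='0' len='3'>int</keyword> <identifier line='1' col='4' len='1'>x</identifier><operator line='1' col='5' len='1'>;</operator>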
+
+
+   Note that end-of-line characters (\r, \n) and sequences (\r \n) are
+   normalized and will always be output as a LINEFEED (LF, 0x0A).
+
+   The characters <, >, and & will be replaced by the special XML entities
+   &lt;, &gt; and &amp; respectively.
+
+   To undo the XML annotation in <file>.xml use either:
+   (this will also correctly revert the XML entities)
+   xmlstarlet sel -T -t -v 'source' <file>.xml, or
+   xidel -s -e 'source' <file>.xml
+
+   Useful xpath queries:
+   (the results show all occurrences and these are not necessarily unique)
+   - all identifiers: //identifier
+   - the length of the last identifier: //identifier[last()]/@len
+   - the value of the first integer: //integer[1]
+   - all comments starting at the beginning of a line:
+     //line_comment[@col=0]|//block_comment[@col=0]
+   - all while keywords: //keyword[text()="while"]
+   - identifiers of length greater than 10: //identifier[@len>10]
+   - tokens immediately following a long identifier:
+     //identifier[@len>15]/following-sibling::*[1]
+   - tokens immediately following the keyword static:
+     //keyword[text()="static"]/following-sibling::*[1]
+*/
+
+#include <unistd.h>		/* getopt() */
+#include <libgen.h>		/* basename() */
+
+#include "libtoken.h"
+
+int main(int argc, char *argv[])
+{
+  extern char *optarg;
+  extern int opterr;
+  extern int optind;
+  int option;
+  char const *opt_str = "1acdhl:o:vw";
+  char usage_str[80];
+
+  const char *token;
+  enum TokenClass type;
+  unsigned line;
+  unsigned col;
+  unsigned pos;
+  unsigned token_len;
+  unsigned num_files = 0;    // number of files read
+  int continuous_files = 0;  // when 1 do not reset after each file
+
+  char *outfile = 0;
+  Language source;
+  int explicit_source = 0;
+  int append = 0;
+
+  comment_token = 1;
+  whitespace_token = 1;
+
+  sprintf(usage_str, "usage: %%s [ -%s ] [ FILES ]\n", opt_str);
+
+  /* Process arguments: */
+  while ((option = getopt(argc, argv, opt_str)) != EOF) {
+    switch (option) {
+
+    case '1':
+      continuous_files = 1;
+      break;
+
+    case 'a':
+      append = 1;
+      break;
+
+    case 'c':
+      hash_as_comment = 1;
+      break;
+
+    case 'd':
+      debug = verbose = 1;
+      break;
+
+    case 'h':
+fputs(
+"A tokenizer for C/C++ (and Java) source code with output in XML.\n"
+"Recognizes the following token classes: keyword, identifier, integer,\n"
+"floating, string, character, operator, preprocessor, line_comment,\n"
+"and block_comment.\n\n", stderr);
+fprintf(stderr, usage_str, basename(argv[0]));
+fputs(
+"\nCommand line options are:\n"
+"-a : append to output file instead of create or overwrite.\n"
+"-c : treat a # character as the start of a line comment.\n"
+"-d : print debug info to stderr; implies -v.\n"
+"-h : print just this text to stderr and stop.\n"
+"-l : specify language explicitly (C, C++, Java).\n"
+"-o : write output to this file (instead of stdout).\n"
+"-1 : treat all filename arguments as a continuous single input.\n"
+"-v : print action summary to stderr.\n"
+"-w : suppress all warning messages.\n",
+      stderr);
+      return 0;
+
+    case 'l':
+      source = set_or_detect_lang(optarg);
+      explicit_source = 1;
+      break;
+
+    case 'o':
+      outfile = optarg;
+      break;
+
+    case 'v':
+      verbose = 1;
+      break;
+
+    case 'w':
+      nowarn = 1;
+      break;
+
+    case '?':
+    default:
+      fputs("(F): unknown option. Stop.\n", stderr);
+      fprintf(stderr, usage_str, argv[0]);
+      return 1;
+    }
+  }
+
+  if (outfile && outfile[0]) {
+    if (!freopen(outfile, append ? "a" : "w", stdout)) {
+      fprintf(stderr, "(F): cannot open %s for writing.\n", outfile);
+      exit(3);
+    }
+  }
+
+  if (optind == argc)
+    goto doit;
+
+  do {
+    filename = argv[optind];
+    if (!freopen(filename, "r", stdin)) {
+      if (!nowarn)
+        fprintf(stderr, "(W): Cannot read file %s.\n", filename);
+      continue;
+    }
+
+    if (!explicit_source)
+      source = set_or_detect_lang(0);
+
+  doit:
+    if (verbose) fprintf(stderr, "(I): Processing file %s...\n", filename);
+    num_files++;
+
+    // Header:
+    if (!continuous_files || num_files == 1) {
+      fputs("<?xml version='1.0' encoding='UTF-8'?>\n", stdout);
+      // standalone="yes"
+      fprintf(stdout, "<source language='%s' filename='%s'>",
+              lang_name(source), filename);
+    }
+
+    while ((token_len = C_tokenize_int(&token, &type, &line, &col, &pos))) {
+      if (type == WHITESPACE) {
+        fputs(token, stdout);
+        continue;
+      }
+      fprintf(stdout, "<%s line='%u' col='%u' len='%u'>",
+              token_class[type], line, col, token_len);
+      if (type == STRING ||
+          type == CHARACTER ||
+          type == OPERATOR ||
+          type == LINE_COMMENT ||
+          type == BLOCK_COMMENT)
+        XML_escape(stdout, token);
+      else
+        fputs(token, stdout);
+      fprintf(stdout, "</%s>", token_class[type]);
+    }
+
+    if (!continuous_files) {
+      // Trailer:
+      fputs("</source>\n", stdout);
+
+      if (verbose)
+        fprintf (stderr, "(I): %u bytes, %u UTF-8 encoded chars.\n",
+                 char_count, utf8_count);
+
+      // Reset globals:
+      char_count = 0;
+      utf8_count = 0;
+      linenr = 1;
+      column = 0;
+      buffered = 0;
+      saved_col = 0;
+    }
+  } while (++optind < argc);
+
+  if (continuous_files) {
+    // Trailer:
+    fputs("</source>\n", stdout);
+
+    if (verbose)
+      fprintf(stderr, "(I): %u bytes, %u (UTF-8 encoded) unicode characters.\n",
+              char_count, utf8_count);
+  }
+
+  if (num_files > 1 && verbose)
+    fprintf(stderr, "(I): Total number of files processed: %u\n", num_files);
+
+  return (illegals || unexpect_eof) ? 1 : 0;
+}