diff --git a/doc/srcml.md b/doc/srcml.md
index cf462ec..3002501 100644
--- a/doc/srcml.md
+++ b/doc/srcml.md
@@ -179,23 +179,25 @@ TRANSFORMATIONS:
For instance, the XML markup can be enhanced with line and column coordinates.
Notice also that srcML has built-in capabilities to query and manipulate
the XML. Queries can be done with XPath expressions. General transformations can
-be executed with XSLT.
+be executed with XSLT. Note that there is no need to first convert the
+source to XML; the built-in query capabilities also work directly on the
+source code.

Here are a few examples of useful operations:

- Get all function and method definition names:

```console
-$ srcml --xpath="//src:function/src:name" program.xml
+$ srcml --xpath="//src:function/src:name" program.java
```

- Count the number of conditions:

```console
-$ srcml --xpath='count(//src:condition)' program.xml
+$ srcml --xpath='count(//src:condition)' program.c
```

- Output all line comments:

```console
-$ srcml --xpath='//src:comment[@type="line"]' program.xml
+$ srcml --xpath='//src:comment[@type="line"]' program.cpp
```

Much more versatile and powerful tools to process any XML are
diff --git a/tools/json-graph/src/jsonml2jgf.c b/tools/json-graph/src/jsonml2jgf.c
index 73a0bf8..c1395ac 100644
--- a/tools/json-graph/src/jsonml2jgf.c
+++ b/tools/json-graph/src/jsonml2jgf.c
@@ -94,7 +94,7 @@ static void det_coords_adjust_labels(Graph g)
    for (p = start; p < end; ++p)
      if (isspace(*p))
        n->label->start++;
-     else if (*p == '\\' && (*p == 'n' || *p == 't')) {
+     else if (*p == '\\' && (*(p+1) == 'n' || *(p+1) == 't')) {
        n->label->start += 2;
        ++p;
      }
diff --git a/tools/tokenizer/Makefile b/tools/tokenizer/Makefile
index b6ca860..9851522 100644
--- a/tools/tokenizer/Makefile
+++ b/tools/tokenizer/Makefile
@@ -3,16 +3,31 @@
INCLUDES =
CPPFLAGS = $(INCLUDES)
-CFLAGS = -O2
+CFLAGS = -g -O2 -fPIC
LDFLAGS =

-PROGS = tokenize antlr4tojson pytokenize jstokenize
+PROGS = tokenize antlr4tojson pytokenize jstokenize tokml libtoken.so

.PHONY: all
all: $(PROGS)

-tokenize: tokenize.o
-tokenize.o: tokenize.c
+tokenize: tokenize.o libtoken.a
+tokenize.o: tokenize.c libtoken.h
+
+tokml: tokml.o libtoken.a
+tokml.o: tokml.c libtoken.h
+
+libtoken.o: libtoken.c libtoken.h
+
+.PHONY: lib
+lib: libtoken.a libtoken.so
+
+libtoken.a: libtoken.o
+	ar r $@ $^
+	ranlib $@
+
+libtoken.so: libtoken.o
+	$(CC) -shared -Wl,-soname,$@.1 -o $@ $^

antlr4tojson: antlr4tojson.o
antlr4tojson.o: antlr4tojson.c
@@ -27,5 +42,5 @@ token_common.o: token_common.c token_common.h
.PHONY: clean
clean:
-	@-rm -f *.o
+	@-rm -f *.o *.a
	@-rm -f $(PROGS)
diff --git a/tools/tokenizer/README.md b/tools/tokenizer/README.md
index 6008eec..e82940f 100644
--- a/tools/tokenizer/README.md
+++ b/tools/tokenizer/README.md
@@ -8,29 +8,35 @@ This same repository also offers separate programs for a Python tokenizer
of the command-line options and have the same output formats.

Here we focus on the C/C++/Java tokenizer (`tokenize`), but most of this
-documentation equally applies to the other tokenizer program. The `Makefile`
-builds them all.
+documentation equally applies to the other tokenizer programs.
+The `Makefile` builds them all.
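
Since the `Makefile` now also builds the tokenizer core as a library
(`libtoken.a` and `libtoken.so`), other programs can link against it directly.
The following is a minimal sketch of such a client, assuming that `libtoken.h`
declares `open_as_stdin` and `C_tokenize` as they are defined in `libtoken.c`
later in this change; it mimics the plain output mode of `tokenize`:

```c
/* Minimal libtoken client (sketch); link with libtoken.a.
   Assumes libtoken.h declares the API defined in libtoken.c. */
#include <stdio.h>
#include "libtoken.h"

int main(int argc, char *argv[])
{
  const char *token, *type;
  unsigned line, col, pos;

  /* open_as_stdin() reopens the file as stdin and detects the language;
     without an argument the tokenizer reads stdin (default language C). */
  if (argc > 1 && open_as_stdin(argv[1]) == -1)
    return 1;

  /* C_tokenize() returns the token length in bytes; 0 signals end-of-file. */
  while (C_tokenize(&token, &type, &line, &col, &pos))
    printf("(%4u,%3u) %s: %s\n", line, col, type, token);
  return 0;
}
```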
The following lexeme classes are recognized:

- identifier
-- reserved word/keyword
+- reserved word/keyword of the language of the input source
- binary, octal, decimal, hexadecimal and floating-point numbers
- double-quoted string literal
- single-quoted character literal
- all single, double, and triple operator and punctuation symbols
- the preprocessor tokens # and ##
+- a number of pseudo tokens depending on selected options

For each correctly recognized token, the program determines its class/type
and the exact coordinates (line number and column) in the input text of its
starting character. All token literals are output exactly as they appear in
-the source text, without any interpretation of escaped characters.
+the source text, without any interpretation of possibly escaped characters.

-A newline is defined as a single linefeed character `\n` or the combination
-carriage return `\r` followed by linefeed `\n`.
-Line continuations (a backslash immediately followed by a newline) are handled
+A newline is defined as a single linefeed character `\n`, a carriage return
+`\r`, or the combination carriage return `\r` followed by linefeed `\n`.
+Line continuations, i.e., a backslash immediately followed by a newline, are handled
at the character input level, so the token recognizers will only see logical
-lines. Line and column reflect positions in the physical line structure, not the logical one.
+lines. Line and column coordinates, however, reflect positions in the physical
+line structure, not the logical one. When so requested, logical line endings are
+output as `newline` pseudo tokens and will be represented by a linefeed
+character. Similarly, when requested, continuations are output as
+`continuation` pseudo tokens and will be represented by a backslash-escaped
+linefeed `\\n`.

For instance, the appearance of a line continuation inside a string literal:

@@ -45,8 +51,8 @@ upon output as a token becomes:

```
"A long string literal that is broken here to stretch over two lines."
```

-Moreover, white-space, control characters and comments are skipped and
-anything left over is flagged as illegal characters.
+White-space (SPACE and TAB characters), certain control characters, and comments are
+normally skipped and anything left over is flagged as illegal characters.

Since Java at the lexical level is very close to C and C++, this tokenizer
can also be used for Java, albeit that some literal peculiarities are not
@@ -54,9 +60,7 @@ recognized.
The program looks at the file name extension to determine the language. This
can be overridden (and must be specified when using standard input) by
the `-l` option.
Depending on the language setting, the proper set of keywords will be
-recognized. For C and C++ their
-combined set of (95) keywords is recognized, assuming that a C program will not
-inadvertently use C++ keywords as regular identifiers.
+recognized.

## Program options

@@ -68,22 +72,24 @@ A tokenizer for C/C++ (and Java) source code with output in 6 formats.
Recognizes the following token classes: keyword, identifier, integer,
floating, string, character, operator, and preprocessor.

-usage: tokenize [ -1acdhjl:m:no:rsvw ] [ FILES ]
+usage: tokenize [ -1acdhjkl:m:nNo:rsvwW ] [ FILES ]

Command line options are:
-a : append to output file instead of create or overwrite.
-c : treat a # character as the start of a line comment.
-d : print debug info to stderr; implies -v.
-h : print just this text to stderr and stop.
--j : assume input is Java (deprecated: use -l Java or .java).
+-k : output line and block comments as tokens.
-l : specify language explicitly (C, C++, Java).
-m : output mode either plain (default), csv, json, jsonl, xml, or raw.
-n : output newlines as a special pseudo token.
+-N : output line continuations as a special pseudo token.
-o : write output to this file (instead of stdout).
-s : enable a special start token specifying the filename.
-1 : treat all filename arguments as a continuous single input.
-v : print action summary to stderr.
-w : suppress all warning messages.
+-W : output adjacent white-space as a token.
```

The program reads multiple files. Depending on the `-1` option, the files
@@ -95,7 +101,7 @@ the mode setting.

## Multiple output modes

The tokenizer has multiple output modes. They are plain text, CSV, JSON, JSONL,
-and XML. A sample of plain text output looks like this:
+XML, and raw. A sample of plain text output looks like this:

```text
( 62, 0) preprocessor: #
@@ -139,26 +145,52 @@ and XML. A sample of plain text output looks like this:
Line numbers are 1 based, columns start at 0 (Emacs-style).
The token classes are:

-| Class: | Description:
-|--------------|------------
-| identifier | any identifier
-| keyword | a reserved word
-| integer | integer number irrespective of notation
-| floating | a floating-point number
-| string | a double-quoted string (maybe empty)
-| character | a single-quoted character
-| operator | any operator or punctuator symbol
-| preprocessor | either # or ##
-| filename | pseudo token: start of a new file
-| newline | pseudo token: end of logical line
+| Class:        | Description:
+|---------------|------------
+| identifier    | any identifier
+| keyword       | a reserved word
+| integer       | integer number irrespective of notation
+| floating      | a floating-point number
+| string        | a double-quoted string (maybe empty)
+| character     | a single-quoted character
+| operator      | any operator or punctuator symbol
+| preprocessor  | either `#` or `##`
+
+The following classes are only recognized when the appropriate switch has been set:
+
+| Class:        | Description:                           | Switch:
+|---------------|----------------------------------------|---------
+| line_comment  | treat `#` till end of line as comment  | -c -k
+| line_comment  | a comment that starts with `//`        | -k
+| block_comment | a comment enclosed in `/*` and `*/`    | -k
+| filename      | pseudo token: start of a new file      | -s
+| newline       | pseudo token `\n`: end of logical line | -n
+| continuation  | pseudo token `\\n`: line continuation  | -N
+| whitespace    | adjacent white-space                   | -W

The `filename` token is optional. It will be included when the `-s` option is
provided. It is a pseudo token that provides the filename of the input as the
first token. Similarly, the `newline` is a pseudo token and appears only with
the `-n` option. It signals the end of a logical line. Mind that multiple
-newlines occurring in sequence are not suppressed. The `newline` token has no
-textual representation, e.g. in XML mode output it will appear as an empty
-text element.
+newlines occurring in sequence are not suppressed nor aggregated but appear as
+separate newline tokens (the same holds for continuations).
+The `newline` token will
+be represented by a linefeed character (LF). Depending on the output mode this
+will be escaped appropriately. The `-W` option would normally also collect any
+newlines (except when `-n` is also set) and continuations (except when `-N` is
+set), in which case they are treated as separate tokens.
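
These switches map directly onto globals in the shared tokenizer code
(`libtoken.c`, added later in this change): `newline_token`,
`continuation_token`, and `whitespace_token`. As a minimal sketch, assuming
`libtoken.h` declares these globals and `C_tokenize` as defined in
`libtoken.c`, the equivalent of `tokenize -W -n -N` as a library call is:

```c
/* Sketch: request whitespace, newline, and continuation pseudo-tokens
   programmatically, like `tokenize -W -n -N` (declarations assumed to
   be in libtoken.h). */
#include <stdio.h>
#include "libtoken.h"

int main(void)
{
  const char *token, *type;
  unsigned line, col, pos;

  whitespace_token   = 1; /* -W: whitespace tokens */
  newline_token      = 1; /* -n: newline pseudo-tokens */
  continuation_token = 1; /* -N: continuation pseudo-tokens */

  while (C_tokenize(&token, &type, &line, &col, &pos))
    printf("(%u,%u) %s\n", line, col, type);
  return 0;
}
```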
To summarize, the valid
+combinations of these options and their effect are:
+
+| Switches: | Effect on output:
+|-----------|------------------
+|           | all white-space, including line endings, discarded
+| -n        | newline tokens for logical lines
+| -N        | continuation tokens
+| -W        | whitespace tokens including all physical line endings
+| -n -N     | newline and continuation tokens
+| -W -n     | whitespace tokens and newline tokens separately
+| -W -N     | whitespace tokens and continuation tokens separately
+| -W -n -N  | whitespace, newline, and continuation all separately

### CSV output

@@ -177,7 +209,10 @@ line,column,class,token
```

The operator token `,` is escaped with double quotes, like so `","`.
-String tokens are escaped as well and any original double quote is doubled.
+String tokens are always escaped and any original double quote is doubled.
+A newline on its own or as part of whitespace will appear escaped as `\n`.
+The text of a whitespace token will appear inside double quotes. A continuation token
+will appear as `"\\n"`.

### JSON output

@@ -222,6 +257,18 @@ tokens.
(An alternative would be to use the CDATA construct.)
```

+## tokML
+
+Recently a new program has been added: `tokml`. As the name suggests, the
+output is in XML format, but unlike the `-mxml` option to `tokenize`, `tokml`
+outputs the original source code annotated with XML elements that supply the
+token information. This approach is identical to what `srcML` does for a
+parse tree. The precise XML syntax used is defined by the RelaxNG schema in
+the file `tokml-schema.rnc`.
+
+The XML annotation makes it very convenient to apply XPath and XQuery queries
+to the token stream, e.g. by using tools like `xidel` and `xmlstarlet`.
+
## References

> [1]
diff --git a/tools/tokenizer/antlr4tojson.c b/tools/tokenizer/antlr4tojson.c
index f2d2b9c..7cdb254 100644
--- a/tools/tokenizer/antlr4tojson.c
+++ b/tools/tokenizer/antlr4tojson.c
@@ -45,9 +45,9 @@
#include
#include
#include
-#include <unistd.h> /* getopt() */
-#include <libgen.h> /* basename() */
-#include <ctype.h> /* tolower() */
+#include <unistd.h> /* getopt() */
+#include <libgen.h> /* basename() */
+#include <ctype.h>  /* tolower() */

// POSIX Extended Regular Expressions for all parts of token output.
@@ -87,15 +87,15 @@
class_RE "),(channel=(" posint_RE "),)?(" line_RE "):(" column_RE ")\\]$"

// Program option settings:
-static int debug = 0; // when 1 debug output to stderr
+static int debug = 0;           // when 1 debug output to stderr
static int verbose = 0;         // when 1 info output to stderr
-static int nowarn = 0; // when 1 warnings are suppressed
-static int start_token = 0; // when 1 start filename pseudo-token
+static int nowarn = 0;          // when 1 warnings are suppressed
+static int start_token = 0;     // when 1 start filename pseudo-token
static int continuous_files = 0;// when 1 do not reset after each file

// Program globals:
static char *filename = "stdin";// current file being parsed
-static unsigned num_files = 0; // number of files read
+static unsigned num_files = 0;  // number of files read
static unsigned linenr = 1;     // line number counted from 1

static enum { CSV, JSON, JSONL, RAW } mode = JSON;
@@ -179,10 +179,10 @@ static void JSON_escape(FILE *out, const char *p, unsigned len)
      const char peek = len ? *(p+1) : anything_but_valid_escape; // look ahead
      fputc('\\', out);
      if (strchr("\\\"bfnrt", peek)) {
-       // An valid JSON escape. Output it and skip peek:
-       c = peek;
-       p++;
-       len--;
+       // A valid JSON escape. Output it and skip peek:
+       c = peek;
+       p++;
+       len--;
      }
      //else Not a correct JSON escape, a standalone backslash; double it.
} @@ -214,7 +214,7 @@ static unsigned get(char const *text) if (regexec(re, text, nmatch, pmatch, REG_NOTEOL) == REG_NOMATCH) { // Warn about the failed match: fprintf(stderr, "(W) [%s:%u] not a valid token; skipped.\n", - filename, linenr); + filename, linenr); // Cannot recover; no more input. return 0; } @@ -256,22 +256,22 @@ static unsigned get(char const *text) case CLASS_IDENT: // CSV output does not need the quoting. if (mode == JSON || mode == JSONL) - fputc('"', stdout); + fputc('"', stdout); // Undo the capitalization? fputc(tolower(*p), stdout); fwrite(p+1, 1, len-1, stdout); if (mode == JSON || mode == JSONL) - fputc('"', stdout); + fputc('"', stdout); break; case TEXT: // CSV output benefits from quoting; must escape the " fputc('"', stdout); // Strip off the enclosing single quotes. if (mode == JSON || mode == JSONL) - JSON_escape(stdout, p+1, len-2); + JSON_escape(stdout, p+1, len-2); else if (mode == CSV) - CSV_escape(stdout, p+1, len-2); + CSV_escape(stdout, p+1, len-2); fputc('"', stdout); break; case CLASS_STRING: @@ -281,10 +281,10 @@ static unsigned get(char const *text) // Keep the enclosing single quotes! fputc('"', stdout); if (mode == JSON || mode == JSONL) - JSON_escape(stdout, p, len); + JSON_escape(stdout, p, len); else if (mode == CSV) - CSV_escape(stdout, p, len); + CSV_escape(stdout, p, len); fputc('"', stdout); break; case CHANNEL: @@ -333,7 +333,7 @@ main(int argc, char *argv[]) case 'h': fputs( -"A converter for the ANTLR4 token output format.\n\n", stdout); +"A converter for the ANTLR4 token output format.\n\n", stderr); fprintf(stderr, usage_str, basename(argv[0])); fputs( "\nCommand line options are:\n" @@ -358,7 +358,7 @@ fputs( else if (!strcmp(optarg, "raw")) mode = RAW; else { - if (!nowarn) + if (!nowarn) fprintf(stderr, "(W): Invalid mode %s (using csv).\n", optarg); mode = CSV; } @@ -419,10 +419,10 @@ fputs( break; case CSV: if (!continuous_files || num_files == 1) - fputs("seqnr,start,stop,text,class,channel,line,column\n", stdout); + fputs("seqnr,start,stop,text,class,channel,line,column\n", stdout); else { - fputc('\n', stdout); - first_time = 1; + fputc('\n', stdout); + first_time = 1; } if (start_token) { fprintf(stdout, "0,0,0,%s,File,0,1,0\n", filename); @@ -431,20 +431,20 @@ fputs( case JSON: case JSONL: if (!continuous_files || num_files == 1) { - if (mode == JSON) fputs("[\n", stdout); + if (mode == JSON) fputs("[\n", stdout); } else { - if (mode == JSON) fputc(',', stdout); - fputc('\n', stdout); - first_time = 1; + if (mode == JSON) fputc(',', stdout); + fputc('\n', stdout); + first_time = 1; } if (start_token) { - // Must quote filename: + // Must quote filename: fprintf(stdout, - "{\"seqnr\":0, \"start\":0, \"stop\":0, \"text\":\"%s\"," - " \"class\":\"File\", \"line\":1, \"column\":0}", - filename); - first_time = 0; + "{\"seqnr\":0, \"start\":0, \"stop\":0, \"text\":\"%s\"," + " \"class\":\"File\", \"line\":1, \"column\":0}", + filename); + first_time = 0; } break; } @@ -452,19 +452,19 @@ fputs( while (getline(&line, &len, stdin) != -1) { // If already did some output must close that previous line: if (first_time) - first_time = 0; + first_time = 0; else { - switch (mode) { - case RAW: - break; - case JSON: - fputc(',', stdout); - /*FALL THROUGH*/ - case CSV: - case JSONL: - fputc('\n', stdout); - break; - } + switch (mode) { + case RAW: + break; + case JSON: + fputc(',', stdout); + /*FALL THROUGH*/ + case CSV: + case JSONL: + fputc('\n', stdout); + break; + } } get(line); // no , and/or \n output yet linenr++; @@ -476,15 +476,15 
+476,15 @@ fputs(
  // Trailer:
  switch (mode) {
  case RAW:
-   break;
+   break;
  case JSON:
-   // no last comma!
-   fputs("\n]", stdout);
-   /*FALL THROUGH*/
+   // no last comma!
+   fputs("\n]", stdout);
+   /*FALL THROUGH*/
  case CSV:
  case JSONL:
-   fputc('\n', stdout);
-   break;
+   fputc('\n', stdout);
+   break;
  }
  first_time = 1;
}
diff --git a/tools/tokenizer/filter6.awk b/tools/tokenizer/filter6.awk
new file mode 100755
index 0000000..3105a63
--- /dev/null
+++ b/tools/tokenizer/filter6.awk
@@ -0,0 +1,263 @@
+#!/usr/bin/awk -f
+
+# Copyright (c) 2021 International Business Machines Corporation
+# Prepared by: Geert Janssen
+
+# Expects a C/C++ tokenizer generated CSV file as input with explicit
+# whitespace and separate newline (and continuation) tokens.
+# (tokenize -W -n [-N] -mcsv)
+# Outputs one possibly modified token (class or literal) per line.
+# Tries to use some context to better discriminate the meaning of some
+# otherwise ambiguous tokens.
+
+# Should use yacc/bison or lemon?
+
+# Ambiguous tokens in C/C++:
+# < > delimiters of filename in preprocessor include directive
+#     Resolved by using preceding #include context
+# < > delimiters of template parameters
+# < less than operator
+#   Resolve: preceding context keyword template, template <
+# > greater than operator
+#   Resolve: preceding context keyword template <
+# " " delimiters of filename in preprocessor include directive
+# " " delimiters of string literal
+#     Resolved by using preceding #include context
+# ( ) expression grouping
+# ( ) argument list
+# { } block
+# { } initializer
+# [ ] indexing
+# [ ] lambda capture
+# ~ destructor
+# ~ unary operator
+# - unary operator
+# - binary operator
+#   Resolve: no white-space after - then unary?
+# * unary operator (dereference pointer)
+# * binary operator (multiplication)
+# * pointer declarator
+# & bitwise and operator
+# & address of operator
+# Can of worms: overloaded operator symbols
+
+# Simplistic CPP line syntax:
+# "#" directive-name (token)* newline
+
+# #include <...>
+# #include "local"
+# #define identifier-macro-def
+# #define identifier-macro-const val
+# #define identifier-macro-func( ... )
+
+# Using a stack to remember CSV token lines whose output is temporarily
+# suppressed. That way we can have unbounded lookahead.
+# Use function to empty and print stack from bottom to top.
+
+function push(record) {
+  stack[sp++]=record
+}
+
+function empty_out() {
+  for (i=0; i < sp; i++)
+    print stack[i]
+  sp=0
+}
+
+BEGIN {
+  FS=","
+  next_state=-1
+  split("include define undef if ifdef ifndef elif else endif line error pragma", d, " ")
+  for (i in d) directive[d[i]]=1
+}
+
+# A # preprocessor token at state 0 starts a directive.
+(state == 0 && $3 == "preprocessor" && $4 == "#") {
+  push($0)
+  next_state=1
+}
+
+# Seen template keyword => template <...> disambiguation.
+(state == 0 && $4 == "template") {
+  print $0
+  next_state=0 # switched off for now
+}
+
+# # seen; expect directive or identifier.
+(state == 1 && $3 == "identifier") {
+  push($0)
+  if ($4 in directive) {
+    if ($4 == "include")
+      next_state=2
+    else
+    if ($4 == "define")
+      next_state=7
+    else {
+      empty_out()
+      next_state=0
+    }
+  }
+  else { # #ident => stringize to "ident"
+    empty_out()
+    next_state=0
+  }
+}
+
+# Handle #include <...
+(state == 2 && $4 == "<") {
+  # Note: suppressing this token.
+  next_state=3
+}
+
+# Handle #include "...".
+(state == 2 && $3 == "string") {
+  # $4 has enclosing " doubled!
+  filename=substr($4,3,length($4)-4)
+  empty_out()
+  print $1 "," $2 ",string-local-filename," filename
+  next_state=0
+}
+
+# Collect all tokens after the < till >.
+# Treat first specially to get its coordinates.
+(state == 3 && ($3 == "identifier" || $3 == "keyword")) {
+  id_lin=$1
+  id_col=$2
+  filename=$4
+  # Note: modifying this token.
+  next_state=4
+}
+
+# Keep collecting tokens till > or newline.
+(state == 4 && $3 != "newline" && $4 != ">") { # eats up anything
+  filename=filename $4
+  # Note: suppressing this token.
+  next_state=4
+}
+
+# Handling #include <...>, or #include <...newline.
+(state == 4 && ($3 == "newline" || $4 == ">")) {
+  # When newline it's an error, but act as if > was present:
+  empty_out()
+  print id_lin "," id_col ",string-sys-filename,\"" filename "\""
+  if ($3 == "newline")
+    print $0
+  # else suppressing the > token.
+  next_state=0
+}
+
+# Handle template <.
+(state == 5 && $4 == "<") {
+  $3="start-template-paramlist"
+  print $0
+  next_state=6
+}
+
+# Handle template < >, explicit specialization.
+(state == 6 && $4 == ">") {
+  $3="end-template-paramlist"
+  print $0
+  next_state=0
+}
+
+# Handle #define name.
+(state == 7 && ($3 == "identifier" || $3 == "keyword")) {
+  id_lin=$1
+  id_col=$2
+  macro_name=$4
+  # Note: modifying this token later.
+  next_state=8
+}
+
+# Handle #define name(.
+(state == 8 && $4 == "(") {
+  empty_out()
+  print id_lin "," id_col ",identifier-macro-func," macro_name
+  print $0
+  next_state=0
+}
+
+# Handle #define name whitespace
+(state == 8 && $3 == "whitespace") {
+  # Note: suppressing this token.
+  next_state=9
+}
+
+# Handle #define name whitespace? newline
+((state == 8 || state == 9) && $3 == "newline") {
+  empty_out()
+  print id_lin "," id_col ",identifier-macro-def," macro_name
+  print $0
+  next_state=0
+}
+
+# Handle #define name whitespace !newline.
+(state == 9 && $3 != "newline") {
+  empty_out()
+  print id_lin "," id_col ",identifier-macro-const," macro_name
+  print $0
+  next_state=0
+}
+
+# Default rule; always executed:
+# 1. no prior rule matched:
+#    - stay in same state only for whitespace, newline, and continuation;
+#      this allows for their presence without explicit mention in rules
+#    - output any previously suppressed tokens (to not lose them)
+#    - print current token except for whitespace
+#    - back to state 0 to quickly recover for any errors in input
+# 2. some rule matched:
+#    - simply move on to next state as stated in that rule
+#    - reset next_state to -1
+{
+  if (next_state == -1) {
+    # Echo the current token as is (ignore whitespace though):
+    if ($3 != "whitespace") {
+      if ($3 != "newline" && $3 != "continuation") {
+        empty_out()
+        state=0
+      }
+      print $0
+    }
+    # otherwise: Do not change state!
+  }
+  else {
+    state=next_state
+    next_state=-1
+  }
+}
+
+END {}
diff --git a/tools/tokenizer/jstokenize.c b/tools/tokenizer/jstokenize.c
index c935837..e9604a8 100644
--- a/tools/tokenizer/jstokenize.c
+++ b/tools/tokenizer/jstokenize.c
@@ -108,20 +108,20 @@ static int tokenize(char *token, const char **type,
      // Skip till end-of-line (\n exclusive):
      while ((cc = get()) != EOF && cc != '\n' && cc != '\r')
        ;
-     // cc == '\n' || cc == '\r' || cc == EOF
-     if (cc == '\r') {
-       if (!nowarn)
-         fprintf(stderr,
-                 "(W): Unexpected continuation in line comment.\n");
-       // Effectively ignore any \ and terminate logical line:
-       cc == '\n';
-     }
+     // cc == '\n' || cc == '\r' || cc == EOF
+     if (cc == '\r') {
+       if (!nowarn)
+         fprintf(stderr,
+                 "(W): Unexpected continuation in line comment.\n");
+       // Effectively ignore any \ and terminate logical line:
+       cc = '\n';
+     }
      goto restart;
    }

    if (cc == '*') {
-     // Remember start position:
-     unsigned lin = linenr;
+     // Remember start position:
+     unsigned lin = linenr;
      // Skip till */ inclusive:
      int nc = get(); // if EOF next get will be EOF too
@@ -130,9 +130,9 @@
        nc = get();
        if (nc == EOF) { // Error!
fprintf(stderr,
-               "(E): [%s:%u] Unexpected end-of-file in /* comment.\n",
-               filename, lin);
-         unexpect_eof++;
+               "(E): [%s:%u] Unexpected end-of-file in /* comment.\n",
+               filename, lin);
+         unexpect_eof++;
          return 0;
        }
      } while (cc != '*' || nc != '/');
@@ -153,13 +153,13 @@ static int tokenize(char *token, const char **type,
      // Skip till end-of-line (\n exclusive):
      while ((cc = get()) != EOF && cc != '\n' && cc != '\r')
        ;
-     if (cc == '\r') {
-       if (!nowarn)
-         fprintf(stderr,
-                 "(W): Unexpected continuation in hashbang comment.\n");
-       // Effectively ignore any \ and terminate logical line:
-       cc == '\n';
-     }
+     if (cc == '\r') {
+       if (!nowarn)
+         fprintf(stderr,
+                 "(W): Unexpected continuation in hashbang comment.\n");
+       // Effectively ignore any \ and terminate logical line:
+       cc = '\n';
+     }
      goto restart;
    }
    // seen # but not #!
@@ -201,37 +201,37 @@ static int tokenize(char *token, const char **type,
      int pc;
      do {
        token_add(cc);
-       pc = cc;
-       cc = get();
-       if (cc == '\r') {
-         if (!nowarn)
-           fprintf(stderr,
-                   "(W): Unexpected continuation in regex literal.\n");
-         // Effectively ignore:
-         cc = get();
-       }
-
-       if (cc == '\n') {
+       pc = cc;
+       cc = get();
+       if (cc == '\r') {
+         if (!nowarn)
+           fprintf(stderr,
+                   "(W): Unexpected continuation in regex literal.\n");
+         // Effectively ignore:
+         cc = get();
+       }
+
+       if (cc == '\n') {
          if (!nowarn)
-           fprintf(stderr,
-                   "(W): Unexpected newline in regular expression literal.\n");
-         // discard:
-         cc = get();
-       }
+           fprintf(stderr,
+                   "(W): Unexpected newline in regular expression literal.\n");
+         // discard:
+         cc = get();
+       }

-       if (cc == EOF) {
+       if (cc == EOF) {
          if (!nowarn)
-           fprintf(stderr,
-                   "(W): Unexpected EOF in regular expression literal.\n");
+           fprintf(stderr,
+                   "(W): Unexpected EOF in regular expression literal.\n");
          unexpect_eof++;
-         break;
-       }
+         break;
+       }
      } while (cc != '/' || pc == '\\');
      token_add(cc); // the /
      cc = get();
      while (strchr("gimsuy", cc)) {
        token_add(cc);
-       cc = get();
+       cc = get();
      }
      unget(cc);
      *type = "regex";
@@ -259,15 +259,15 @@ static int tokenize(char *token, const char **type,
      int nesting = 0; // keep track of ${} nesting
      do {
        token_add(cc);
-       // For template can have nesting inside placeholder ${...}
-       // FIXME: no check for nested paired ``; same for {}
-       if (qc == '`') {
-         if (pc == '$' && cc == '{')
-           nesting++;
-         else
-         if (cc == '}')
-           nesting--;
-       }
+       // For template can have nesting inside placeholder ${...}
+       // FIXME: no check for nested paired ``; same for {}
+       if (qc == '`') {
+         if (pc == '$' && cc == '{')
+           nesting++;
+         else
+         if (cc == '}')
+           nesting--;
+       }
        // Assume \ is not escaped itself.
if (pc != '\\' && cc == qc && !nesting) { // unescaped quote @@ -283,16 +283,16 @@ static int tokenize(char *token, const char **type, if (cc == '\n' && qc != '`') { // Ok in template if (!nowarn) - fprintf(stderr, - "(W): Unexpected unescaped newline in string.\n"); + fprintf(stderr, + "(W): Unexpected unescaped newline in string.\n"); // discard cc = get(); } if (cc == EOF) { if (!nowarn) - fprintf(stderr, - "(W): Unexpected EOF in string/template.\n"); + fprintf(stderr, + "(W): Unexpected EOF in string/template.\n"); unexpect_eof++; break; } @@ -312,11 +312,11 @@ static int tokenize(char *token, const char **type, unget(cc); token[len] = '\0'; if (is_keyword(token, keywords, num_keywords)) { - *type = "keyword"; - regex_ok = !!is_keyword(token, regex_preceders, num_preceders); + *type = "keyword"; + regex_ok = !!is_keyword(token, regex_preceders, num_preceders); } else - *type = "identifier"; + *type = "identifier"; break; } @@ -340,16 +340,16 @@ static int tokenize(char *token, const char **type, } int_lit = DEC; // assume decimal number /* BIN: 0[bB][01](_?[01])* - LEGACY_OCT: 0[0-7]+ - OCT: 0[oO][0-7](_?[0-7])* - DEC: 0|[1-9](_?[0-9])* - HEX: 0[xX][0-9a-fA-F](_?[0-9a-fA-F])* + LEGACY_OCT: 0[0-7]+ + OCT: 0[oO][0-7](_?[0-7])* + DEC: 0|[1-9](_?[0-9])* + HEX: 0[xX][0-9a-fA-F](_?[0-9a-fA-F])* - EXP: [eE][+-]?[0-9](_?[0-9])* + EXP: [eE][+-]?[0-9](_?[0-9])* - FLOATING: .[0-9][_0-9]*EXP? - | DEC.([0-9][_0-9]*)?EXP? - | DEC EXP + FLOATING: .[0-9][_0-9]*EXP? + | DEC.([0-9][_0-9]*)?EXP? + | DEC EXP */ if (cc == '0') { @@ -368,14 +368,14 @@ static int tokenize(char *token, const char **type, int_lit = HEX; break; default: - if ('0' <= nc && nc <= '7') { - token_add(cc); // the 0 - int_lit = LEGACY_OCT; - } - else { - unget(nc); - nc = cc; - } + if ('0' <= nc && nc <= '7') { + token_add(cc); // the 0 + int_lit = LEGACY_OCT; + } + else { + unget(nc); + nc = cc; + } break; } cc = nc; @@ -454,9 +454,9 @@ static int tokenize(char *token, const char **type, } if (cc == 'n') // BigInt - token_add(cc); + token_add(cc); else - unget(cc); + unget(cc); *type = "integer"; break; @@ -492,28 +492,28 @@ static int tokenize(char *token, const char **type, if (strchr("*+-<>&|?.=", cc) && c2 == cc) { // double or triple // ** ++ -- << >> && || ?? .. == - // special case ++ and -- - if (c2 == '+' || c2 == '-') { + // special case ++ and -- + if (c2 == '+' || c2 == '-') { token_add(c2); *type = "operator"; break; - } + } // ** << >> && || ?? .. == int c3 = get(); - // special case . and ... + // special case . and ... if (c2 == '.') { if (c3 == '.') { // ... token_add(c2); token_add(c3); } - else { - // ..x - unget(c3); - unget(c2); - } + else { + // ..x + unget(c3); + unget(c2); + } // . *type = "operator"; break; @@ -530,18 +530,18 @@ static int tokenize(char *token, const char **type, // ** << >> && || ?? == - if (c2 == '>' && c3 == c2) { - // >>> - int c4 = get(); + if (c2 == '>' && c3 == c2) { + // >>> + int c4 = get(); token_add(c3); - if (c4 == '=') - // >>>= - token_add(c4); - else - unget(c4); - } - else - unget(c3); + if (c4 == '=') + // >>>= + token_add(c4); + else + unget(c4); + } + else + unget(c3); // ** << >> && || ?? == *type = "operator"; @@ -552,7 +552,7 @@ static int tokenize(char *token, const char **type, // also missing => ?. !== <= >= == != += -= *= %= &= |= ^= /= if (cc == '?' && c2 == '.' || - cc == '=' && c2 == '>') { + cc == '=' && c2 == '>') { // ?. 
=> token_add(c2);
      *type = "operator";
      break;
    }
@@ -562,20 +562,20 @@ static int tokenize(char *token, const char **type,
    // still missing !== <= >= == != += -= *= %= &= |= ^= /=
    if (c2 == '=') {
-     // <= >= == != += -= *= %= &= |= ^= /=
-     token_add(c2);
-     if (cc == '!') {
-       // !=
-       int c3 = get();
-       if (c3 == '=')
-         // !==
-         token_add(c3);
-       else
-         unget(c3);
-     }
+     // <= >= == != += -= *= %= &= |= ^= /=
+     token_add(c2);
+     if (cc == '!') {
+       // !=
+       int c3 = get();
+       if (c3 == '=')
+         // !==
+         token_add(c3);
+       else
+         unget(c3);
+     }
    }
    else
-     unget(c2);
+     unget(c2);
    *type = "operator";
    break;
  }
@@ -711,7 +711,7 @@ int main(int argc, char *argv[])
fputs(
"A tokenizer for JavaScript source code with output in 6 formats.\n"
"Recognizes the following token classes: keyword, identifier, integer,\n"
-"floating, string, regex, and operator.\n\n", stdout);
+"floating, string, regex, and operator.\n\n", stderr);
fprintf(stderr, usage_str, basename(argv[0]));
fputs(
"\nCommand line options are:\n"
diff --git a/tools/tokenizer/libtoken.c b/tools/tokenizer/libtoken.c
new file mode 100644
index 0000000..2fa4563
--- /dev/null
+++ b/tools/tokenizer/libtoken.c
@@ -0,0 +1,1282 @@
+/* Copyright (c) 2021, 2022 International Business Machines Corporation
+   Prepared by: Geert Janssen
+
+   Code functionality shared by all tokenizers.
+   This obviously avoids code duplication and associated maintenance problems.
+*/
+
+#include "libtoken.h"
+
+// Program globals:
+const char *filename = "stdin"; // current file being parsed
+unsigned linenr = 1;            // physical line number counted from 1
+unsigned column = 0;            // byte position in physical line, from 0
+unsigned char_count = 0;        // total byte count
+unsigned utf8_count = 0;        // total utf-8 encoded unicode codepoints
+
+int buffer[MAX_BUF];            // use buffer as multi-char lookahead.
+unsigned buffered = 0;          // number of buffered bytes
+unsigned saved_col = 0;         // one-place buf for last column on prev line
+
+// Program option settings:
+int debug = 0;                  // when 1 debug output to stderr
+int verbose = 0;                // when 1 info output to stderr
+int nowarn = 0;                 // when 1 warnings are suppressed
+
+unsigned illegals = 0;          // count number of illegal characters
+unsigned unexpect_eof = 0;      // encountered unexpected EOF
+int hash_as_comment = 0;        // when 1 treat # as line comment
+int newline_token = 0;          // when 1 output newline pseudo-token
+int comment_token = 0;          // when 1 output comments as tokens
+int whitespace_token = 0;       // when 1 output adjacent white-space as a token
+int continuation_token = 0;     // when 1 output line continuation pseudo-token
+
+static int logical_lines = 0;   // when 1 ignore line continuations in get()
+
+// Must be synced with enum TokenClass!
+const char *token_class[] = {
+  /* 0*/ "identifier",
+  /* 1*/ "keyword",
+  /* 2*/ "string",
+  /* 3*/ "character",
+  /* 4*/ "integer",
+  /* 5*/ "floating",
+  /* 6*/ "operator",
+  /* 7*/ "preprocessor",
+  /* 8*/ "line_comment",
+  /* 9*/ "block_comment",
+  /*10*/ "whitespace",
+  /*11*/ "newline",
+  /*12*/ "continuation",
+  /*13*/ "filename",
+  /*14*/ "endoffile"
+};
+
+/* No longer using perfect hash function but simple binary search.
*/ + +/* C11 n1570.pdf 6.4.1 (44) + C17 n2176.pdf 6.4.1 (A.1.2) (44) +*/ +static const char *C_keywords[] = { + "_Alignas", "_Alignof", "_Atomic", "_Bool", "_Complex", + "_Generic", "_Imaginary", "_Noreturn", "_Static_assert", + "_Thread_local", + + "auto", "break", "case", "char", "const", + "continue", "default", "do", "double", "else", + "enum", "extern", "float", "for", "goto", + "if", "inline", "int", "long", "register", + "restrict", "return", "short", "signed", "sizeof", + "static", "struct", "switch", "typedef", "union", + "unsigned", "void", "volatile", "while" +}; + +#if 0 +/* C++ 2014 n4296.pdf 2.11 (84) */ +static const char *CPP_keywords[] = { + "alignas", "alignof", "and", "and_eq", "asm", + "auto", "bitand", "bitor", "bool", "break", + "case", "catch", "char", "char16_t", "char32_t", + "class", "compl", "const", "const_cast", "constexpr", + "continue", "decltype", "default", "delete", "do", + "double", "dynamic_cast", "else", "enum", "explicit", + "export", "extern", "false", "float", "for", + "friend", "goto", "if", "inline", "int", + "long", "mutable", "namespace", "new", "noexcept", + "not", "not_eq", "nullptr", "operator", "or", + "or_eq", "private", "protected", "public", "register", + "reinterpret_cast", "return", "short", "signed", "sizeof", + "static", "static_assert", "static_cast", "struct", "switch", + "template", "this", "thread_local", "throw", "true", + "try", "typedef", "typeid", "typename", "union", + "unsigned", "using", "virtual", "void", "volatile", + "wchar_t", "while", "xor", "xor_eq" +}; +#endif + +/* C++23 n4885.pdf 5.11 (92) */ +static const char *CPP_keywords[] = { + "alignas", "alignof", "and", "and_eq", "asm", + "auto", "bitand", "bitor", "bool", "break", + "case", "catch", "char", "char16_t", "char32_t", + "char8_t", "class", "co_await", "co_return", "co_yield", + "compl", "concept", "const", "const_cast", "consteval", + "constexpr", "constinit", "continue", "decltype", "default", + "delete", "do", "double", "dynamic_cast", "else", + "enum", "explicit", "export", "extern", "false", + "float", "for", "friend", "goto", "if", + "inline", "int", "long", "mutable", "namespace", + "new", "noexcept", "not", "not_eq", "nullptr", + "operator", "or", "or_eq", "private", "protected", + "public", "register", "reinterpret_cast", "requires","return", + "short", "signed", "sizeof", "static", "static_assert", + "static_cast", "struct", "switch", "template", "this", + "thread_local", "throw", "true", "try", "typedef", + "typeid", "typename", "union", "unsigned", "using", + "virtual", "void", "volatile", "wchar_t", "while", + "xor", "xor_eq" +}; + +/* Java SE 8 (50) (false, true, null are literals) */ +/* https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.9 */ +static const char *Java_keywords[] = { + "abstract", "assert", "boolean", "break", "byte", "case", + "catch", "char", "class", "const", "continue", "default", + "do", "double", "else", "enum", "extends", "final", + "finally", "float", "for", "goto", "if", "implements", + "import", "instanceof", "int", "interface", "long", "native", + "new", "package", "private", "protected", "public", "return", + "short", "static", "strictfp","super", "switch", "synchronized", + "this", "throw", "throws", "transient", "try", "void", + "volatile", "while" +}; + +static const char *Python_keywords[] = { + "False", "None", "True", "and", "as", "assert", "async", + "await", "break", "class", "continue", "def", "del", "elif", + "else", "except", "finally", "for", "from", "global", "if", + "import", "in", "is", 
"lambda", "nonlocal", "not", "or", + "pass", "raise", "return", "try", "while", "with", "yield" +}; + +/* Includes future reserved keywords, strict mode reserved words and module + code reserved words, as well as all the older standards future reserved + words, and the literals null, false, and true. +*/ +static const char *JavaScript_keywords[] = { + "abstract", "await", "boolean", "break", "byte", + "case", "catch", "char", "class", "const", + "continue", "debugger", "default", "delete", "do", + "double", "else", "enum", "export", "extends", + "false", "final", "finally", "float", "for", + "function", "goto", "if", "implements", "import", + "in", "instanceof", "int", "interface", "let", + "long", "native", "new", "null", "package", + "private", "protected", "public", "return", "short", + "static", "super", "switch", "synchronized", "this", + "throw", "throws", "transient", "true", "try", + "typeof", "var", "void", "volatile", "while", + "with", "yield" +}; + +#define num_keywords(lang) sizeof(lang##_keywords)/sizeof(lang##_keywords[0]); + +/* Generic binary search lookup in some keyword table. + `word' to be searched must be NUL-terminated C string. + `table' is array of const char * of `size' sorted alphabetically. + Returns word found (i.e., pointer value in table) or 0. +*/ +#define lang_is_keyword(lang) \ + static const char *lang##_is_keyword(const char *word) \ + { \ + int i = 0, j = num_keywords(lang); \ + while (i < j) { \ + int k = (i + j) >> 1 /* / 2 */; \ + const char *kw = lang##_keywords[k]; \ + int cmp = strcmp(word, kw); \ + if (!cmp) \ + return kw; \ + if (cmp < 0) j = k; else i = k + 1; \ + } \ + return 0; \ + } + +/* Define individual is_keyword functions per language: */ +/* C_is_keyword */ +lang_is_keyword(C) +/* CPP_is_keyword */ +lang_is_keyword(CPP) +/* Java_is_keyword */ +lang_is_keyword(Java) +/* Python_is_keyword */ +lang_is_keyword(Python) +/* JavaScript_is_keyword */ +lang_is_keyword(JavaScript) + +const char *(*is_keyword)(const char *) = C_is_keyword; + +/* Conversion table from filename extension to language code. + To find language code, consider all entries and check each ext + against filename; matched language is langs[i].lang. + Invariant: langs[X].lang == X for every Language value. + String representation of language code is langs[X].name. + + Have certain config settings depend on the language. + Use 2 step: + 1. determine language from name/extension + 2. look up language config +*/ +static const struct { + const char *ext; + Language lang; + const char *name; +} + langs[] = { + { ".c", C, "C" }, + { ".cpp", CPP, "C++" }, + { ".java", JAVA, "Java" }, + { ".js", JAVASCRIPT, "JavaScript" }, + { ".py", PYTHON, "Python" }, + + // Alternatives: + { ".h", C, "" }, + { ".C", CPP, "" }, + { ".cc", CPP, "" }, + { ".hh", CPP, "" }, +}; + +const char *lang_name(Language lang) +{ + return langs[lang].name; +} + +static const struct { + //Language lang; implicit + const char *(*is_keyword)(const char *); +} + lang_configs[] = { + { C_is_keyword, }, + { CPP_is_keyword, }, + { Java_is_keyword, }, + { JavaScript_is_keyword, }, + { Python_is_keyword, }, +}; + +/* Must be called right after a file is opened as stdin. + Will attempt to remove any UTF-8 unicode signature (byte-order mark, BOM) + at the beginning of the file. + Unicode: U+FEFF + UTF-8: EF BB BF + + First bytes Encoding Must remove? 
+ 00 00 FE FF UTF-32 big endian Yes + FF FE 00 00 UTF-32 little endian Yes + FE FF UTF-16 big endian Yes + FF FE UTF-16 little endian Yes + 00 00 00 xx UTF-32 big endian No + xx 00 00 00 UTF-32 little endian No + 00 xx UTF-16 big endian No + xx 00 UTF-16 little endian No + otherwise UTF-8 No +*/ +static void remove_BOM(void) +{ + int c1 = getchar(); + if (c1 == 0xEF) { + int c2 = getchar(); + if (c2 == 0xBB) { + int c3 = getchar(); + if (c3 == 0xBF) { + return; + } + if (c3 != EOF) buffer[buffered++] = c3; + } + if (c2 != EOF) buffer[buffered++] = c2; + } + if (c1 != EOF) buffer[buffered++] = c1; +} + +int open_as_stdin(const char *file) +{ + filename = file; + if (!freopen(filename, "r", stdin)) { + if (!nowarn) + fprintf(stderr, "(W): Cannot read file %s.\n", filename); + return -1; + } + return set_or_detect_lang(0); +} + +/* Deal with DOS (\r \n) and classic Mac OS (\r) (physical) line endings. + In case of CR LF skip (but count) the CR and return LF. + In case of CR not followed by LF turns the CR into LF and returns that. + All other chars are returned as is. + Note: never returns a CR (\r). Line/column counts are not affected here. +*/ +static int normalize_newline(void) +{ + /* No need to recognize Unicode code points here. */ + int cc = getchar(); + + if (cc == '\r') { + // Maybe \r \n (CR NL) combination? + int nc = getchar(); + if (nc == '\n') { + char_count++; // counts the carriage return + utf8_count++; + // No use incrementing column. + return nc; // return \n; effectively skipping the \r + } + // Mind nc not \n. ungetc(EOF) is Okay. + ungetc(nc, stdin); + // cc == '\r'; consider a newline as well, so turn into \n: + cc = '\n'; + } + return cc; +} + +/* Detects escaped newlines (line continuations) and signals them with the + special '\r' character (that otherwise is not used). + Keeps track of physical coordinates and absolute location for each character. +*/ +int get(void) +{ + int cc; + + restart: + // Get the next character: + if (buffered) { // chars available in lookahead buffer + cc = buffer[--buffered]; // never EOF + char_count++; + // cc maybe '\r' (line continuation); act like '\n': + if (cc == '\n' || cc == '\r') { + linenr++; + saved_col = column; + column = 0; + return cc; + } + column++; + return cc; + } + + // Read a fresh char: + cc = normalize_newline(); // cc != '\r' + if (cc == EOF) return EOF; + char_count++; + if (utf8_start(cc)) utf8_count++; + + if (cc == '\n') { // a normalized end-of-line (\r|\r?\n) + linenr++; + saved_col = column; + column = 0; + return cc; // \n here signals a logical end-of-line + } + + // Deal with explicit \ line continuations! + if (cc == '\\') { + // Must look ahead (never maintained across get calls!): + int nc = normalize_newline(); // cc != '\r' + if (nc == '\n') { + char_count++; // counts the newline + utf8_count++; + linenr++; // on next physical line + saved_col = column+1; // +1 for backslash + column = 0; + + if (logical_lines) + // Still need to get a character. + // Could again start a line continuation! + goto restart; + + // Signal that this was an escaped newline (= line continuation): + return '\r'; + } + // Mind nc not \n. ungetc(EOF) is Okay. + ungetc(nc, stdin); + // cc == '\\' a regular backslash + } + column++; + return cc; +} + +/* Undo action of a get() lookahead call. + An attempt at undoing an EOF read has no effect. + Since get() encodes logical line endings with \n and continuation + line endings with \r, both could be subject to an unget(). 
+*/ +void unget(int cc) +{ + if (cc == EOF) return; + if (buffered < MAX_BUF) { + if (cc == '\n' || cc == '\r') { + linenr--; + // column was 0 right after getting the \n + // hopefully there are no multiple ungets of \n + column = saved_col; + } + else + column--; + char_count--; + buffer[buffered++] = cc; + } + else { + fprintf(stderr, "(F): Lookahead buffer overflow (MAX=%u).\n", MAX_BUF); + exit(2); + } +} + +/* Either set this file's input language explicitly via a string or + use the filename extension to determine the language. + If neither works out, use the default language C. + Uses global filename (maybe stdin). + Once the language is known, configs for that language are applied, + e.g. the correct keyword table to use. +*/ +Language set_or_detect_lang(const char *source) +{ + int i; + Language lang = C; // default language + + if (source) { + /* Check if explicit language is known: */ + for (i = 0; i < sizeof(langs)/sizeof(langs[0]); i++) + if (!strcmp(source, langs[i].name)) { + lang = langs[i].lang; + goto done; + } + fprintf(stderr, "(E): No support for language `%s'.\n", source); + } + + char *p; + if (p = strrchr(filename, '.')) { + for (i = 0; i < sizeof(langs)/sizeof(langs[0]); i++) + if (!strcmp(p, langs[i].ext)) { + lang = langs[i].lang; + goto done; + } + fprintf(stderr, "(E): Unknown filename extension `%s'.\n", p); + } + if (!nowarn) + fprintf(stderr, "(W): Assuming default language C.\n"); + + done: + is_keyword = lang_configs[lang].is_keyword; + return lang; +} + +// Dynamically sized token buffer: +static char *token_buf = 0; +static unsigned token_alloc = 0; +static unsigned token_len = 0; + +// Makes sure there is room in the token buffer. +static void token_buf_room(void) +{ + if (token_len == token_alloc) { // all space used up + if (!token_alloc) { // first time allocation + token_alloc = 65536; + if (!(token_buf = malloc(token_alloc))) { + fprintf(stderr, "(F): Allocation of token buffer failed.\n"); + exit(4); + } + token_buf[0] = '\0'; // for safety + return; + } + + token_alloc <<= 1; + if (!(token_buf = realloc(token_buf, token_alloc))) { + fprintf(stderr, "(F): Reallocation of token buffer failed.\n"); + exit(4); + } + //fprintf(stderr, "Realloc-ed token buf.\n"); + } +} + +// Appends a character to the token buffer, always making sure there is room. +static void token_buf_push(int cc) +{ + token_buf_room(); + // There is room: token_len < token_alloc + token_buf[token_len++] = cc; +} + +// Undoes the push action but only if there is some content. +static int token_buf_pop(void) +{ + return token_len ? token_buf[--token_len] : 0; +} + +// Adds a terminating NUL character which does not change the token length. +static void token_buf_close(void) +{ + token_buf_room(); + token_buf[token_len] = '\0'; // Note: no advance +} + +// Resets the token buffer cursor. +static void token_buf_reset(void) +{ + token_len = 0; +} + +/* Tokenization of C++ programming language source text. + Recognizes: + - identifier + - reserved word/keyword + - binary, octal, decimal, hexadecimal and floating-point numbers + - double-quoted string literal + - single-quoted character literal + - all single, double, and triple operator and punctuation symbols + - the preprocessor tokens # and ## + Optionally: + - filename start_token + - line_comment comment_token + - block_comment comment_token + - newline newline_token + - continuation continuation_token + - whitespace whitespace_token + + Normally skips white-space and comments and flags anything + left over as illegal characters. 
+
+   (Approximately 20 tests per single character worst-case.)
+
+   Returns 0 upon EOF else the token length in bytes.
+   (There are no 0-length tokens!)
+   EOF may be interpreted as a token. The function then returns:
+   token = "", type = endoffile, line and col correctly defined.
+
+   An unexpected EOF in the middle of a token will cause an error message
+   and the partial token to be output first before a next call returns 0
+   (to indicate the EOF condition).
+*/
+
+unsigned C_tokenize_int(const char **token, enum TokenClass *type,
+                        unsigned *line, unsigned *col, unsigned *pos)
+{
+  int cc;
+  *type = ENDOFFILE;
+
+  do { // infinite loop; after token recognized breaks out.
+    // Start collecting a token.
+    token_buf_reset();
+    *line = linenr;
+    *col = column;
+    *pos = char_count;
+    // white-space tokens see continuation lines:
+    logical_lines = 0;
+    cc = get();
+
+  restart:
+    // cc already read; coordinates for it are correct.
+
+    /*** WHITE-SPACE ***/
+
+    /* In principle all consecutive white-space including \n and \r (and some
+       other control chars) are collected and form a single whitespace token.
+       However, when newlines are requested to be reported as separate tokens,
+       they break this pattern. Note that we cannot issue multiple tokens
+       in a single call to this function.
+
+       Token buf will only hold some white-space chars when implicitly
+       requested via whitespace_token; otherwise it stays empty.
+       Same for the \n and \r requests.
+    */
+
+    if (cc == '\n' && newline_token) { // end of a logical line
+      // Here we assume the buf is empty.
+      token_buf_push(cc);
+      *type = NEWLINE;
+      break;
+    }
+
+    if (cc == '\r' && continuation_token) { // end of a physical line
+      // Here we assume the buf is empty.
+      token_buf_push('\\');
+      token_buf_push('\n');
+      *type = CONTINUATION;
+      break;
+    }
+
+    // Aggregate as much white-space as possible.
+    // FIXME: officially a NUL should be considered white-space.
+    while (isspace(cc)) { // i.e., cc in [ \f\n\r\t\v]
+      // Here: !newline_token (!continuation_token)
+      if (whitespace_token)
+        if (cc == '\r') { // line continuation
+          // Convert back to original char sequence:
+          token_buf_push('\\');
+          token_buf_push('\n');
+        }
+        else
+          token_buf_push(cc); // perhaps \n
+      //else: white-space is discarded
+
+      // Here: whitespace_token implies token_len > 0
+
+      cc = get();
+      if (cc == '\n' && newline_token ||
+          cc == '\r' && continuation_token) {
+        // Must issue whitespace token if so requested.
+        if (whitespace_token) {
+          // Undo lookahead (unget(EOF) has no effect!):
+          unget(cc); // next token will be newline/continuation
+          *type = WHITESPACE;
+          token_buf_close();
+          *token = token_buf;
+          return token_len;
+        }
+        // Issue newline/continuation token right away:
+        goto restart;
+      }
+    }
+    // Here: !isspace: must break or start real token.
+
+    if (whitespace_token && token_len) {
+      // Undo lookahead (unget(EOF) has no effect!):
+      unget(cc);
+      *type = WHITESPACE;
+      break;
+    }
+
+    if (cc == EOF) {
+      token_buf_reset();
+      break;
+    }
+
+    // Rest of tokens treat line continuations as non-existent:
+    logical_lines = 1;
+
+    // If white-space skipped must reset coordinates:
+    *line = linenr;
+    *col = column-1;
+    *pos = char_count-1;
+
+    /*** OPTIONAL # LINE COMMENT (to ignore preprocessor statements) ***/
+    // Java: no preprocessor directives.
+
+    // NULs (like many other chars) in comments are silently ignored!
+ + if (cc == '#' && hash_as_comment) { + if (comment_token) + token_buf_push(cc); + // Skip till end-of-line (\n exclusive): + while ((cc = get()) != '\n' && cc != EOF) + if (comment_token) + token_buf_push(cc); + // cc == '\n' || cc == EOF + // Don't consider \n part of comment. + if (comment_token) { + // Undo lookahead (unget(EOF) has no effect!): + unget(cc); + *type = LINE_COMMENT; + break; + } + *line = linenr-1; + *col = saved_col; + *pos = char_count; + goto restart; + } + + /*** LINE COMMENT AND BLOCK COMMENT (C/C++/Java) ***/ + + if (cc == '/') { + cc = get(); + if (cc == '/') { + if (comment_token) { + token_buf_push(cc); + token_buf_push(cc); + } + // Skip till end-of-line (\n exclusive): + while ((cc = get()) != '\n' && cc != EOF) + if (comment_token) + token_buf_push(cc); + // cc == '\n' || cc == EOF + // Don't consider \n part of comment. + if (comment_token) { + // Undo lookahead (unget(EOF) has no effect!): + unget(cc); + *type = LINE_COMMENT; + break; + } + *line = linenr-1; + *col = saved_col; + *pos = char_count; + goto restart; + } + + if (cc == '*') { + if (comment_token) { + token_buf_push('/'); + token_buf_push(cc); + } + // Skip till */ inclusive: + int nc = get(); // if EOF next get will be EOF too + if (comment_token && nc != EOF) + token_buf_push(nc); + do { + cc = nc; + nc = get(); + if (nc == EOF) { // Error! + fprintf(stderr, + "(E): [%s:%u] Unexpected end-of-file in /* comment.\n", + filename, *line); + unexpect_eof++; + if (comment_token) + // Better return partial comment as token and postpone EOF: + *type = BLOCK_COMMENT; + else + token_buf_reset(); + token_buf_close(); + *token = token_buf; + return token_len; + } + if (comment_token) + token_buf_push(nc); + } while (cc != '*' || nc != '/'); + // cc == '*' && nc == '/' + // Don't consider char right after */ as part of comment. + if (comment_token) { + *type = BLOCK_COMMENT; + break; + } + *line = linenr; + *col = column; + *pos = char_count; + cc = get(); + goto restart; + } + // seen / but not // or /* + unget(cc); // char after / + cc = '/'; // restore / + } + + // If white-space and/or comments skipped must reset coordinates: + *line = linenr; + *col = column-1; + *pos = char_count-1; + + /*** CHAR and STRING PREFIX (C/C++) ***/ + + // Allow u,U,L prefix for string and char + // FIXME: allow u8 as prefix for string + if (cc == 'L' || cc == 'u' || cc == 'U') { + token_buf_push(cc); + cc = get(); + if (cc == '"') + goto string_token; + if (cc == '\'') + goto char_token; + // u,U,L will be interpreted as (start of) identifier. + unget(cc); // char after u,U,L + cc = token_buf_pop(); // restore original and remove from token + } + + /*** IDENTIFIER (C/C++/Java) and KEYWORD (C/C++) ***/ + // Java: false, true, null are literals + // FIXME: Flag to allow .letter as part of identifier? + // (compound identifier) + + // Simplistic solution to allowing Unicode: allow any char >= 128 without + // actual checking for UTF-8. + if (isalpha(cc) || cc == '_' || cc == '$' || (cc & 0x80)) { + token_buf_push(cc); + while (isalnum(cc = get()) || cc == '_' || cc == '$' || + cc != EOF && (cc & 0x80)) + token_buf_push(cc); + unget(cc); + token_buf_close(); + *type = is_keyword(token_buf) ? 
KEYWORD : IDENTIFIER; + break; + } + + /*** INTEGER and FLOATING ***/ + // Java: uses _ in numbers as insignificant separator + // Java: decimal suffix: [lL], float suffix: [fFdD] + // Java: allows hex float + +#if 0 + // Examples: + int bin_num = 0B010101u; + int oct_num = 01234567L; + int hex_num = 0x123ABCLL; + int dec_num = 12345678; + + float flt_num1 = 077.; + float flt_num2 = 077.987; + float flt_num3 = 77.; + float flt_num4 = .77; +#endif + + // . digits ... floating + if (cc == '.') { + // Look ahead for a digit: + int nc; + if (isdigit(nc = get())) { + unget(nc); + goto start_fraction; + } + unget(nc); + // Could go immediately to operator: goto seen_period + } + + if (isdigit(cc)) { // binary, octal, decimal, or hexadecimal literal + // Types of integer literals: + enum { + BIN, OCT, DEC, HEX + } int_lit = cc == '0' ? OCT : DEC; + + // Lookahead: + int nc = get(); + if (int_lit == OCT && (nc == 'x' || nc == 'X')) { + int_lit = HEX; + token_buf_push(cc); // the 0 + cc = nc; // the x or X + } + else + if (int_lit == OCT && (nc == 'b' || nc == 'B')) { + int_lit = BIN; + token_buf_push(cc); // the 0 + cc = nc; // the b or B + } + else + unget(nc); // isdigit(cc) + + do { + token_buf_push(cc); + cc = get(); + + // Allow for ' between `digits': + if (cc == '\'') { + // Keep the ' in the token for now: + token_buf_push(cc); + int nc = get(); + if (isdigit(nc) || int_lit == HEX && isxdigit(nc)) + cc = nc; + else { // Error! + fprintf(stderr, + "(E): [%s:%u] C++14 only allows ' between digits.\n", + filename, linenr); + // what to do? + } + } + } while (isdigit(cc) || int_lit == HEX && isxdigit(cc)); + // !is[x]digit(cc) + + // FIXME: allow hex floats in C + if (int_lit == OCT || int_lit == DEC) { + int floating = 0; + // Seen digits-sequence. Maybe followed by . or e or E? + if (cc == '.') { // fractional part + start_fraction: + floating = 1; + token_buf_push(cc); + // digits? FIXME: again allow ' between digits + while (isdigit(cc = get())) + token_buf_push(cc); + // !isdigit(cc) + } + // cc != '.' || !isdigit(cc) + if (cc == 'e' || cc == 'E') { // exponent + floating = 1; + token_buf_push(cc); + if ((cc = get()) == '-' || cc == '+') { + token_buf_push(cc); + cc = get(); + } + // FIXME: no check for at least 1 digit + // FIXME: again allow ' between digits + while (isdigit(cc)) { + token_buf_push(cc); + cc = get(); + } + // !isdigit(cc) + } + if (floating) { + if (cc == 'f' || cc == 'F' || cc == 'l' || cc == 'L') + token_buf_push(cc); + else + unget(cc); + *type = FLOATING; + break; + } + } + + // optional integer suffix: l, ll, lu, llu, u, ul, ull, any case + if (cc == 'l' || cc == 'L') { + token_buf_push(cc); + // maybe another l + cc = get(); + if (cc == 'l' || cc == 'L') { + token_buf_push(cc); + // Here: token is digits[lL][lL] + cc = get(); + } + // maybe a u + if (cc == 'u' || cc == 'U') + // Here: token is digits[lL][lL]?[u|U] + token_buf_push(cc); + else + unget(cc); + } + else if (cc == 'u' || cc == 'U') { + token_buf_push(cc); + // maybe an l + cc = get(); + if (cc == 'l' || cc == 'L') { + token_buf_push(cc); + // Here: token is digits[uU][lL] + cc = get(); + } + // maybe another l + if (cc == 'l' || cc == 'L') + // Here: token is digits[uU][lL]?[lL] + token_buf_push(cc); + else + unget(cc); + } + else + unget(cc); + *type = INTEGER; + break; + } + + /*** STRING (C/C++/Java) ***/ + + if (cc == '"') { + string_token: + token_buf_push(cc); + // Watch out for escaped " inside string. + cc = get(); + while (cc != '"') { + if (cc == EOF) { // Error! 
+ fprintf(stderr, + "(E): [%s:%u] Unexpected end-of-file in string literal.\n", + filename, *line); + unexpect_eof++; + // Better return partial string as token and postpone EOF: + *type = STRING; + token_buf_close(); + *token = token_buf; + return token_len; + } + token_buf_push(cc); + int nc = get(); + + if (cc == '\\') { + // FIXME: No check on valid escape char! + // ' " ? \ a b f n r t v + token_buf_push(nc); + cc = get(); + } + else + cc = nc; + } + // cc == '"' + token_buf_push(cc); + *type = STRING; + break; + } + + /*** CHARACTER (C/C++/Java) ***/ + + if (cc == '\'') { + char_token: + token_buf_push(cc); + // Watch out for escaped ' inside char. + cc = get(); + // Cannot have empty char! + if (cc == '\'') { + fprintf(stderr, + "(E): [%s:%u] Cannot have an empty character literal.\n", + filename, linenr); + // Output as token anyway, but count as illegal: + token_buf_push(cc); + *type = CHARACTER; + illegals++; + break; + } + + // FIXME: Avoid including too many chars. + while (cc != '\'') { + if (cc == EOF) { // Error! + fprintf(stderr, + "(E): [%s:%u] Unexpected end-of-file in character literal.\n", + filename, linenr); + unexpect_eof++; + // Better return partial character as token and postpone EOF: + *type = CHARACTER; + token_buf_close(); + *token = token_buf; + return token_len; + } + if (cc == '\n') { // Error! + fprintf(stderr, + "(E): [%s:%u] Cannot have end-of-line in character literal.\n", + filename, linenr); + illegals++; + // Immediately terminate character literal as if ' present. + // cc = '\''; make into valid literal??? No! + break; + } + token_buf_push(cc); + int nc = get(); + if (cc == '\\') { + token_buf_push(nc); + cc = get(); + // FIXME: No check on valid escape char! + // ' " ? \ a b f n r t v 0[d[d]] xh* + } + else { + cc = nc; + // If first char then expect no more. + if (token_len == 2) { + if (nc != '\'') { + fprintf(stderr, + "(E): [%s:%u] Cannot have multi-character literal.\n", + filename, linenr); + illegals++; + // Immediately terminate character literal as if ' present. + // cc = '\''; make into valid literal??? + break; + } + } + } + } + if (cc == '\'') + token_buf_push(cc); + else + unget(cc); + *type = CHARACTER; + break; + } + + /*** OPERATOR (and PUNCTUATION) (C/C++/Java) ***/ + + // Operator and punctuation symbols. Longest match. + + /* Operator or punctuator Alternative representation + { <% + } %> + [ <: + ] :> + # %: (not supported here) + ## %:%: (not supported here) + */ + + // Single char operator or punctuator (C/C++/Java) + // { } [ ] ( ) ; : ? . ~ ! + - * / % ^ = & | < > , + // Double char operator or punctuator (C/C++) + // <: :> <% %> + // Double char operator or punctuator (C/C++/Java) + // += -= *= /= %= ^= &= |= == != <= >= && || << >> ++ -- -> + // Double char operator or punctuator (C++/Java) + // :: + // Double char operator or punctuator (C++) + // .* + // Triple char operator or punctuator (C/C++/Java) + // ... <<= >>= + // Triple char operator or punctuator (C++) + // ->* <=> + // Java: @ >>> >>>= + + //seen_period: + + token_buf_push(cc); + token_buf_close(); + //token=[cc,0];len=1 + + if (strstr("{}[]();?~,@", token_buf)) { // allow @ for Java + // Single char operator/punctuator. 
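+ // (token_buf holds exactly one char at this point, so strstr in effect
+ // performs a simple set-membership test.)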
+ *type = OPERATOR; + break; + } + + if (strstr("<:.-+*/%^&|=!>", token_buf)) { // single or start of double/triple + // Check second char: + int c2 = get(); + if (c2 != EOF) { + token_buf_push(c2); + //token=[cc,c2];len=2 + + // Check third char: + int c3 = get(); + if (c3 != EOF) { + token_buf_push(c3); + token_buf_close(); + //token=[cc,c2,c3,0];len=3 + if (!strcmp(">>>", token_buf)) { // allow >>> for Java + //token=[>,>,>,0];len=3 + // Look-ahead for =: + int c4 = get(); + if (c4 == '=') // >>>= for Java + token_buf_push(c4); + //token=[>,>,>,=];len=4 + else + unget(c4); + //token=[>,>,>,0];len=3 + *type = OPERATOR; + break; + } + //token=[cc,c2,c3,0];len=3 + + if (!strcmp("...", token_buf) || + !strcmp("<=>", token_buf) || + !strcmp("->*", token_buf) || + !strcmp("<<=", token_buf) || + !strcmp(">>=", token_buf)) { + // Triple char operator/punctuator. + *type = OPERATOR; + break; + } + + // Maybe double char. Undo the c3 token extension: + token_buf_pop(); + token_buf_close(); + //token=[cc,c2,0];len=2 + } + else + token_buf_close(); + //token=[cc,c2,0];len=2 + unget(c3); + + // Maybe double char. + static const char * const ops2[] = { + "<:", "<%", "<=", "<<", ":>", + "::", ".*", "->", "-=", "--", + "+=", "++", "*=", "/=", "%>", + "%=", "^=", "&=", "&&", "|=", + "||", "==", "!=", ">=", ">>" + }; + unsigned size = sizeof(ops2) / sizeof(ops2[0]); + unsigned i; + for (i = 0; i < size; i++) + if (!strcmp(ops2[i], token_buf)) + break; + if (i < size) { + *type = OPERATOR; + break; + } + //token=[cc,c2,0];len=2 + + // Must be single char. Undo the c2 token extension: + token_buf_pop(); + token_buf_close(); + //token=[cc,0];len=1 + } + //else token=[cc,0];len=1 + + // Must be single char. + unget(c2); + *type = OPERATOR; + break; + } + //token=[cc,0];len=1 + + /*** PREPROCESSOR (C/C++) ***/ + + if (cc == '#') { + int nc = get(); + if (nc != '#') + unget(nc); + else + token_buf_push(nc); + *type = PREPROCESSOR; + break; + } + + // What is left here? Illegal chars! + if (!nowarn) + // Mind non-printing chars! + fprintf(stderr, + "(W): [%s:%u] Illegal character `%s%c` (0x%02x) skipped.\n", + filename, linenr, cc<32?"CTRL-":"", cc<32?cc+64:cc, cc); + // Count them: + illegals++; + + } while (1); + token_buf_close(); + *token = token_buf; + return token_len; +} + +unsigned C_tokenize(const char **token, const char **type, + unsigned *line, unsigned *col, unsigned *pos) +{ + enum TokenClass typeid; + unsigned result = C_tokenize_int(token, &typeid, line, col, pos); + *type = token_class[typeid]; + return result; +} + +// Escape hard newlines in a string. +void RAW_escape(FILE *out, const char *token) +{ + const char *p; + for (p = token; *p; p++) { + if (*p == '\n') { + fputs("\\n", out); + continue; + } + fputc(*p, out); + } +} + +// Escape token for output as CSV string. +void CSV_escape(FILE *out, const char *token) +{ + const char *p; + // start CSV string: + fputc('"', out); + for (p = token; *p; p++) { + if (*p == '\n') { // escape embedded real new lines + fputs("\\n", out); + continue; + } + if (*p == '"') + fputc('"', out); + fputc(*p, out); + } + // end CSV string: + fputc('"', out); +} + +// Escape token for output as JSON string. +void JSON_escape(FILE *out, const char *token) +{ + // C/C++ has escapes: \' \" \? \a \b \f \n \r \t \v \x \0. 
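+ // (The goal: e.g. the 2-char source sequence \n must come out as \\n in
+ // JSON, so a reader recovers the literal backslash-n, not a newline.)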
+ // To preserve, simply escape the backslash and all ":
+ const char *p;
+ for (p = token; *p; p++) {
+ if (*p == '\n') { // escape embedded real new lines
+ fputs("\\n", out);
+ continue;
+ }
+ if (*p == '\t') { // escape embedded real TABs
+ fputs("\\t", out);
+ continue;
+ }
+ // FIXME: control characters from U+0000 through U+001F must be escaped
+ if (*p == '\\' || *p == '"')
+ fputc('\\', out);
+ fputc(*p, out);
+ }
+}
+
+// Escape token for output as XML text.
+void XML_escape(FILE *out, const char *token)
+{
+#if 1
+ // Alternative: escape every <, >, and &:
+ const char *p;
+ for (p = token; *p; p++) {
+ if (*p == '<')
+ fputs("&lt;", out);
+ else
+ if (*p == '>')
+ fputs("&gt;", out);
+ else
+ if (*p == '&')
+ fputs("&amp;", out);
+ else
+ fputc(*p, out);
+ }
+#else
+ // Use CDATA construct for escaping.
+ // Impossible to escape ]]> occurring in token!
+ // Must chop up the substring ]]> in ]] and >.
+ const char *p;
+ const char *q = token;
+ // "abc]]>hello" => <![CDATA[abc]]]]><![CDATA[>hello]]>
+ // "]]>]]>" => <![CDATA[]]]]><![CDATA[>]]]]><![CDATA[>]]>
+ while ((p = strstr(q, "]]>"))) {
+ int len = p - q; // always > 0
+ fprintf(out, "<![CDATA[%.*s]]>", len+2, q);
+ q = p+2; // q start at >...
+ }
+ if (q < token+strlen(token))
+ fprintf(out, "<![CDATA[%s]]>", q);
+#endif
+}
diff --git a/tools/tokenizer/libtoken.h b/tools/tokenizer/libtoken.h
new file mode 100644
index 0000000..0af2491
--- /dev/null
+++ b/tools/tokenizer/libtoken.h
@@ -0,0 +1,103 @@
+/* Copyright (c) 2021, 2022 International Business Machines Corporation
+ Prepared by: Geert Janssen
+
+ Code functionality shared by all tokenizers.
+*/
+
+#ifndef LIBTOKEN_H
+#define LIBTOKEN_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_BUF 8 // maximum lookahead in chars
+
+/* Let's assume UTF-8 encoding.
+ https://www.cprogramming.com/tutorial/unicode.html
+ https://opensource.apple.com/source/tidy/tidy-2.2/tidy/src/utf8.c.auto.html
+*/
+
+// Test for start of UTF-8 sequence.
+#define utf8_start(cc) (((cc)&0xC0)!=0x80)
+#define utf8_follow(cc) (((cc)&0xC0)==0x80)
+
+#define utf8_len(cc) \
+ (((cc)&0xF8)==0xF0 ? 4 : ((cc)&0xF0)==0xE0 ? 3 : ((cc)&0xE0)==0xC0 ? 2 : 1)
+
+typedef enum { C, CPP, JAVA, JAVASCRIPT, PYTHON } Language;
+
+// Program globals:
+extern const char *filename/*= "stdin"*/; // current file being parsed
+extern unsigned linenr/*= 1*/; // physical line number counted from 1
+extern unsigned column/*= 0*/; // char position in physical line, from 0
+extern unsigned saved_col/*= 0*/; // 1-place buf for last column on prev line
+extern unsigned char_count/*= 0*/; // total char/byte count
+extern unsigned utf8_count/*= 0*/; // total utf-8 char count
+extern unsigned buffered/*= 0*/; // number of buffered chars
+extern int buffer[MAX_BUF]; // use buffer as multi-char lookahead.
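+
+/* Minimal usage sketch (see tokenize.c for the full driver); assumes the
+   input has been set up, e.g., via open_as_stdin() and set_or_detect_lang():
+
+     const char *tok, *cls;
+     unsigned line, col, pos;
+     while (C_tokenize(&tok, &cls, &line, &col, &pos))
+       printf("(%u,%u) %s: %s\n", line, col, cls, tok);
+*/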
+ +// Program option settings: +extern int debug/*= 0*/; // when 1 debug output to stderr +extern int verbose/*= 0*/; // when 1 info output to stderr +extern int nowarn/*= 0*/; // when 1 warnings are suppressed + +extern unsigned illegals/*= 0*/; // count number of illegal characters +extern unsigned unexpect_eof/*= 0*/; // encountered unexpected EOF +extern int hash_as_comment/*= 0*/; // when 1 treat # as line comment +extern int newline_token/*= 0*/; // when 1 output newline pseudo-token +extern int comment_token/*= 0*/; // when 1 output comments as tokens +extern int whitespace_token/*= 0*/; // when 1 output adjacent white-space as a token +extern int continuation_token/*= 0*/; // when 1 output line continuation pseudo-token + +enum TokenClass { + /* 0*/ IDENTIFIER, + /* 1*/ KEYWORD, + /* 2*/ STRING, + /* 3*/ CHARACTER, + /* 4*/ INTEGER, + /* 5*/ FLOATING, + /* 6*/ OPERATOR, + /* 7*/ PREPROCESSOR, + /* 8*/ LINE_COMMENT, + /* 9*/ BLOCK_COMMENT, + /*10*/ WHITESPACE, + /*11*/ NEWLINE, + /*12*/ CONTINUATION, + /*13*/ FILENAME, + /*14*/ ENDOFFILE +}; + +extern const char *token_class[]; + +// keyword lookup function (pointer variable): +// (initialized by set_or_detect_lang()) +extern const char *(*is_keyword)(const char *); + +extern int get(void); +extern void unget(int cc); +extern Language set_or_detect_lang(const char *source); +extern const char *lang_name(Language lang); +extern int open_as_stdin(const char *file); + +extern unsigned C_tokenize_int(const char **token, enum TokenClass *type, + unsigned *line, unsigned *col, unsigned *pos); +extern unsigned C_tokenize(const char **token, const char **type, + unsigned *line, unsigned *col, unsigned *pos); + +extern void RAW_escape(FILE *out, const char *token); +extern void CSV_escape(FILE *out, const char *token); +extern void JSON_escape(FILE *out, const char *token); +extern void XML_escape(FILE *out, const char *token); + +#ifdef __cplusplus +} +#endif + +#endif /* LIBTOKEN_H */ diff --git a/tools/tokenizer/ntokenize.c b/tools/tokenizer/ntokenize.c index b5625fe..7adac51 100644 --- a/tools/tokenizer/ntokenize.c +++ b/tools/tokenizer/ntokenize.c @@ -56,62 +56,62 @@ #define ws_RE "[ \t\v\f\n]*" // 96 chars (omitted are e.g.: @ $ `) -// 3 5 67 8 9 9 -// 1234 5 6 7 3 9 9012345678901234567890123 4 56 -#define basic_char0_RE "[][ \t\v\f\na-zA-Z0-9_{}#()<>%:;.?*+/^&|~!=,\\\"'-]" +// 33 56 67 8 9 9 +// 1234 5 6 7 8 34 90 9012345678901234567890123 4 56 +#define basic_char0_RE "[][ \t\v\f\na-zA-Z0-9_{}#()<>%:;.?*+/^&|~!=,\\\"'-]" // all basic chars except \n and > -#define h_chars_RE "[][ \t\v\fa-zA-Z0-9_{}#()<%:;.?*+/^&|~!=,\\\"'-]+" +#define h_chars_RE "[][ \t\v\fa-zA-Z0-9_{}#()<%:;.?*+/^&|~!=,\\\"'-]+" // all basic chars except \n and \" -#define q_chars_RE "[][ \t\v\fa-zA-Z0-9_{}#()<>%:;.?*+/^&|~!=,\\'-]+" -#define header_RE "<"h_chars_RE">|\""q_chars_RE"\"" -#define pp_number_RE "\\.?[0-9]('?[a-zA-Z_0-9]|[eE][+-]|\\.)*" - -#define unichar_RE "\\\\u[0-9a-fA-F]{4}|\\\\U[0-9a-fA-F]{8}" - -//#define identifier_RE "[_a-zA-Z][_a-zA-Z0-9]*" -#define identifier_RE "([_a-zA-Z]|"unichar_RE")([_a-zA-Z0-9]|"unichar_RE")*" - -#define suffix_RE "([uU]ll?|[uU]LL?|ll?[uU]?|LL?[uU]?)?" 
-#define binary_RE "0[bB][01]('?[01])*"suffix_RE -#define octal_RE "0('?[0-7])*"suffix_RE -#define decimal_RE "[1-9]('?[0-9])*"suffix_RE -#define hexadecimal_RE "0[xX][0-9a-fA-F]('?[0-9a-fA-F])*"suffix_RE -#define integer_RE binary_RE"|"octal_RE"|"decimal_RE"|"hexadecimal_RE - -#define dec_part_RE "[0-9]('?[0-9])*" -#define exponent_RE "[eE][-+]?[0-9]('?[0-9])*" -#define floating_RE "(\\."dec_part_RE"("exponent_RE")?|"\ - dec_part_RE"\\.("dec_part_RE")?("exponent_RE")?|"\ - dec_part_RE exponent_RE")[fFlL]?" - -#define oct_char_RE "\\\\[0-7]{1,3}" -#define hex_char_RE "\\\\x[0-9a-fA-F]+" -#define escape_RE "\\\\['\"?abfnrtv\\]|"oct_char_RE"|"hex_char_RE -#define character_RE "[uUL]?'([^'\\\n]|"escape_RE"|"unichar_RE")'" -#define string_RE "[uUL]?\"([^\"\\\n]|"escape_RE"|"unichar_RE")*\"" +#define q_chars_RE "[][ \t\v\fa-zA-Z0-9_{}#()<>%:;.?*+/^&|~!=,\\'-]+" +#define header_RE "<"h_chars_RE">|\""q_chars_RE"\"" +#define pp_number_RE "\\.?[0-9]('?[a-zA-Z_0-9]|[eE][+-]|\\.)*" + +#define unichar_RE "\\\\u[0-9a-fA-F]{4}|\\\\U[0-9a-fA-F]{8}" + +//#define identifier_RE "[_a-zA-Z][_a-zA-Z0-9]*" +#define identifier_RE "([_a-zA-Z]|"unichar_RE")([_a-zA-Z0-9]|"unichar_RE")*" + +#define suffix_RE "([uU]ll?|[uU]LL?|ll?[uU]?|LL?[uU]?)?" +#define binary_RE "0[bB][01]('?[01])*"suffix_RE +#define octal_RE "0('?[0-7])*"suffix_RE +#define decimal_RE "[1-9]('?[0-9])*"suffix_RE +#define hexadecimal_RE "0[xX][0-9a-fA-F]('?[0-9a-fA-F])*"suffix_RE +#define integer_RE binary_RE"|"octal_RE"|"decimal_RE"|"hexadecimal_RE + +#define dec_part_RE "[0-9]('?[0-9])*" +#define exponent_RE "[eE][-+]?[0-9]('?[0-9])*" +#define floating_RE "(\\."dec_part_RE"("exponent_RE")?|"\ + dec_part_RE"\\.("dec_part_RE")?("exponent_RE")?|"\ + dec_part_RE exponent_RE")[fFlL]?" + +#define oct_char_RE "\\\\[0-7]{1,3}" +#define hex_char_RE "\\\\x[0-9a-fA-F]+" +#define escape_RE "\\\\['\"?abfnrtv\\]|"oct_char_RE"|"hex_char_RE +#define character_RE "[uUL]?'([^'\\\n]|"escape_RE"|"unichar_RE")'" +#define string_RE "[uUL]?\"([^\"\\\n]|"escape_RE"|"unichar_RE")*\"" // should really be: any basic source char except ) followed by delimiter -#define r_chars_RE "[^)]*" +#define r_chars_RE "[^)]*" // delimiter; first and second occurrence in rawstring must be the same // use back reference \3: -#define d_chars_RE "([^ ()\\\t\v\f\n]{0,16})" -#define rawstring_RE "[uUL]?R\""d_chars_RE"\\("r_chars_RE"\\)\\3\"" +#define d_chars_RE "([^ ()\\\t\v\f\n]{0,16})" +#define rawstring_RE "[uUL]?R\""d_chars_RE"\\("r_chars_RE"\\)\\3\"" -#define operator_RE "[][{}();?~,]|<=>|<<=|\\.\\.\\.|->\\*|>>=|"\ - "[*/!=^]=?|<[:%=<]?|:[:>]?|\\.[*]?|-[->=]?|\\+[=+]?|"\ - "%[>=]?|&[=&]?|>[>=]?|\\|[|=]?" +#define operator_RE "[][{}();?~,]|<=>|<<=|\\.\\.\\.|->\\*|>>=|"\ + "[*/!=^]=?|<[:%=<]?|:[:>]?|\\.[*]?|-[->=]?|\\+[=+]?|"\ + "%[>=]?|&[=&]?|>[>=]?|\\|[|=]?" #define preprocessor_RE "##?" -#define token_RE "^"ws_RE"(("rawstring_RE")|("identifier_RE")|("\ +#define token_RE "^"ws_RE"(("rawstring_RE")|("identifier_RE")|("\ integer_RE")|("floating_RE")|("string_RE")|("\ character_RE")|("operator_RE")|("preprocessor_RE"))" #define NMATCH 34 // Guarded against overflow but not full-proof! 
-#define MAX_LINE 4096 // maximum logical line length in chars (\0 exclusive) +#define MAX_LINE 4096 // maximum logical line length in chars (\0 exclusive) #define utf8_start(cc) (((cc)&0xC0)!=0x80) @@ -204,7 +204,7 @@ unsigned get_token(char const *text, unsigned start) if (regexec(re, text, nmatch, pmatch, REG_NOTEOL) == REG_NOMATCH) { // Warn about the failed match: fprintf(stderr, "(W) [%u:%u] not a valid token; skipped.\n", - linenrs[start],columns[start]); + linenrs[start],columns[start]); // Cannot recover; no more input. return 0; } @@ -239,7 +239,6 @@ unsigned get_token(char const *text, unsigned start) int normalize_newline(void) { int cc = getchar(); - if (cc == EOF || cc == '\n') return cc; if (cc == '\r') { // Maybe \r \n (CR NL) combination? @@ -248,10 +247,10 @@ int normalize_newline(void) char_count++; // counts the carriage return utf8_count++; // No use incrementing column. - return nc; // effectively skip the \r + return nc; // return \n; effectively skipping the \r } - // Mind nc not \n. - if (nc != EOF) ungetc(nc, stdin); + // Mind nc not \n. ungetc(EOF) is Okay. + ungetc(nc, stdin); // cc == '\r'; consider a newline as well, so turn into \n: cc = '\n'; } @@ -266,15 +265,15 @@ int get(void) int cc; restart: // Read a fresh char: - cc = normalize_newline(); + cc = normalize_newline(); // cc != '\r' if (cc == EOF) return EOF; char_count++; if (utf8_start(cc)) utf8_count++; - if (cc == '\n') { + if (cc == '\n') { // a normalized end-of-line (\r|\r?\n) linenr++; column = 0; - return cc; + return cc; // \n here signals a logical end-of-line } // Deal with \ line continuations! @@ -291,8 +290,8 @@ int get(void) // Could again start a line continuation! goto restart; } - // Mind nc not \n. - if (nc != EOF) ungetc(nc, stdin); + // Mind nc not \n. ungetc(EOF) is Okay. + ungetc(nc, stdin); // cc == '\\' a regular backslash } column++; @@ -397,22 +396,22 @@ int buffer_fill(void) if (cc == '"') { // Switch to unfiltered input till unescaped closing ": if ((cc = get()) == '"') { - buffer_add(cc); - // An empty string literal. - continue; + buffer_add(cc); + // An empty string literal. + continue; } if (cc == EOF || cc == '\n') - // unexpected EOF or newline in string - break; + // unexpected EOF or newline in string + break; buffer_add(cc); int pc; do { - pc = cc; - cc = get(); - if (cc == EOF || cc == '\n') - // unexpected EOF or newline in string - goto break_outer; - buffer_add(cc); + pc = cc; + cc = get(); + if (cc == EOF || cc == '\n') + // unexpected EOF or newline in string + goto break_outer; + buffer_add(cc); } while (pc == '\\' || cc != '"'); // pc != '\\' && cc == '"' } diff --git a/tools/tokenizer/pytokenize.c b/tools/tokenizer/pytokenize.c index fb787a9..664fdcb 100644 --- a/tools/tokenizer/pytokenize.c +++ b/tools/tokenizer/pytokenize.c @@ -1,7 +1,7 @@ /* Copyright (c) 2021 International Business Machines Corporation Prepared by: Geert Janssen - Tokenizer for Python 3. 
+ Tokenizer for Python 3.x

 Token classes:
 - identifier
@@ -33,13 +33,13 @@
 // Program globals:
 static unsigned brackets_opened = 0; // unpaired nested ( [ { seen
 static int prev_was_newline = 1; // no previous token or was newline
-static int first_time = 1;
+static int first_time = 1; // controls adding a , separator for JSON and JSONL

 // Program option settings:
 static int start_token = 0; // when 1 start filename pseudo-token
 static int continuous_files = 0; // when 1 do not reset after each file
 static enum { PLAIN, CSV, JSON, JSONL, XML, RAW } mode = PLAIN;
-static int output_layout = 0; // when 1 output layout pseudo tokens
+static int output_layout = 0; // when 1 output layout pseudo tokens

 static const char *keywords[] = {
 "False", "None", "True", "and", "as", "assert", "async",
@@ -51,7 +51,7 @@
 static const unsigned num_keywords = sizeof(keywords)/sizeof(keywords[0]);

-static void emit(const char *s, unsigned line, unsigned col)
+static void emit(const char *s, unsigned line, unsigned col)
 {
 if (output_layout) {
 switch (mode) {
@@ -67,18 +67,18 @@
 case JSON:
 case JSONL:
 if (first_time)
- first_time = 0;
+ first_time = 0;
 else {
- if (mode == JSON) fputc(',', stdout);
- fputc('\n', stdout);
+ if (mode == JSON) fputc(',', stdout);
+ fputc('\n', stdout);
 }
 fprintf(stdout, "{ \"line\": %u, \"column\": %u, "
- "\"class\": \"layout\", \"token\": \"%s\" }", line, col, s);
+ "\"class\": \"layout\", \"token\": \"%s\" }", line, col, s);
 break;
 case XML:
 fprintf(stdout,
- "<token line=\"%u\" column=\"%u\" class=\"layout\">%s</token>\n",
- line, col, s);
+ "<token line=\"%u\" column=\"%u\" class=\"layout\">%s</token>\n",
+ line, col, s);
 break;
 }
 }
@@ -88,19 +88,19 @@
 #define MAX_INDENTS 128
 static unsigned indents[MAX_INDENTS];
 static unsigned *sp = indents;
-#define indents_reset() do { sp = indents; } while(0)
-#define indents_empty() (sp == indents)
-#define indents_full() (sp == indents+MAX_INDENTS)
-#define indents_top() (indents_empty() ? 0 : *(sp-1))
-#define indents_push(i) do { assert(!indents_full()); *sp++ = (i); } while(0)
-#define indents_pop() do { assert(!indents_empty()); sp--; } while(0)
+#define indents_reset() do { sp = indents; } while(0)
+#define indents_empty() (sp == indents)
+#define indents_full() (sp == indents+MAX_INDENTS)
+#define indents_top() (indents_empty() ?
0 : *(sp-1)) +#define indents_push(i) do { assert(!indents_full()); *sp++ = (i); } while(0) +#define indents_pop() do { assert(!indents_empty()); sp--; } while(0) // emit NEWLINE and deal with indentation static void process_newline(unsigned indent) { emit("NEWLINE", linenr-1, saved_col); - unsigned last_indent = indents_top(); + unsigned last_indent = indents_top(); // maybe 0 if (indent > last_indent) { indents_push(indent); @@ -116,11 +116,12 @@ static void process_newline(unsigned indent) } while (indent < indents_top()); // Here: empty() || indent >= top() if (indent > indents_top() && !nowarn) - fprintf(stderr, "(W): incorrect indentation.\n"); + fprintf(stderr, "(W): Incorrect indentation.\n"); } // else: indent == last_indent: no action } +// cc in [ \t\f] static int process_ws(int cc) { // Collect white-space and compute possible indentation: @@ -171,7 +172,7 @@ static int utf8_codepoint(int cc, int *len, int bytes[4]) else { /* invalid utf-8 start byte */ if (!nowarn) fprintf(stderr, "(W): [%s:%u] Invalid UTF-8 start byte 0x%02x.\n", - filename, linenr, cc); + filename, linenr, cc); return cc; } /* collect all follow bytes: */ @@ -179,15 +180,15 @@ static int utf8_codepoint(int cc, int *len, int bytes[4]) cc = get(); if (cc == EOF) { /* unexpected EOF in utf-8 sequence */ if (!nowarn) - fprintf(stderr, "(W): [%s:%u] Unexpected EOF in UTF-8 sequence.\n", - filename, linenr); + fprintf(stderr, "(W): [%s:%u] Unexpected EOF in UTF-8 sequence.\n", + filename, linenr); return EOF; } bytes[(*len)++] = cc; if ((cc & 0xC0) != 0x80) { /* invalid utf-8 follow byte */ if (!nowarn) - fprintf(stderr, "(W): [%s:%u] Invalid UTF-8 follow byte 0x%02x.\n", - filename, linenr, cc); + fprintf(stderr, "(W): [%s:%u] Invalid UTF-8 follow byte 0x%02x.\n", + filename, linenr, cc); return cc; } cp <<= 6; @@ -199,7 +200,7 @@ static int utf8_codepoint(int cc, int *len, int bytes[4]) /* invalid Unicode code point. */ if (!nowarn) fprintf(stderr, "(W): [%s:%u] Invalid Unicode code point 0x%04x.\n", - filename, linenr, cp); + filename, linenr, cp); } return cp; } @@ -261,7 +262,7 @@ static int tokenize(char *token, const char **type, cc = get(); // Maybe EOF! if (!brackets_opened && !strchr(" \t\n#\r\f", cc)) - process_newline(0); + process_newline(0); goto restart; } @@ -274,8 +275,8 @@ static int tokenize(char *token, const char **type, if (cc == EOF) { // Undo any outstanding indents: while (!indents_empty()) { - emit("DEDENT", linenr, column); - indents_pop(); + emit("DEDENT", linenr, column); + indents_pop(); } return 0; } @@ -288,8 +289,11 @@ static int tokenize(char *token, const char **type, ; // cc == '\n' || cc == '\r' || cc == EOF if (cc == '\r') { - if (!nowarn) - fprintf(stderr, "(W): Comment may not be continued with \\.\n"); + // presumably a \ may occur in a comment as last char before \n + /* + if (!nowarn) + fprintf(stderr, "(W): Comment may not be continued with \\.\n"); + */ // Effectively ignore any \ and terminate logical line: cc == '\n'; } @@ -385,9 +389,9 @@ static int tokenize(char *token, const char **type, token_add(cc); // Assume \ is not escaped itself. Happens though! 
- if (pc == '\\') // escape next char; no check - cc = '\0'; - else + if (pc == '\\') // escape next char; no check + cc = '\0'; + else if (cc == qc) { // a first unescaped quote int q2 = get(); token_add(q2); @@ -419,8 +423,8 @@ static int tokenize(char *token, const char **type, do { token_add(cc); if (pc == '\\') // escape next char; no check - cc = '\0'; - else + cc = '\0'; + else if (cc == qc) { // unescaped quote *type = "string"; break; @@ -459,29 +463,29 @@ static int tokenize(char *token, const char **type, if (is_id_start(cp, utf8_len)) { int i; for (i = 0; i < utf8_len; i++) - token_add(utf8_bytes[i]); + token_add(utf8_bytes[i]); ident_token: cc = get(); cp = utf8_codepoint(cc, &utf8_len, utf8_bytes); if (cp == EOF) // bad code point; already reported. - break; + break; all_ascii &= utf8_len == 1; while (is_id_follow(cp, utf8_len)) { - int i; - for (i = 0; i < utf8_len; i++) - token_add(utf8_bytes[i]); - cc = get(); - cp = utf8_codepoint(cc, &utf8_len, utf8_bytes); - if (cp == EOF) // bad code point; already reported. - break; - all_ascii &= utf8_len == 1; + int i; + for (i = 0; i < utf8_len; i++) + token_add(utf8_bytes[i]); + cc = get(); + cp = utf8_codepoint(cc, &utf8_len, utf8_bytes); + if (cp == EOF) // bad code point; already reported. + break; + all_ascii &= utf8_len == 1; } // Undo look ahead: while (utf8_len) - unget(utf8_bytes[--utf8_len]); + unget(utf8_bytes[--utf8_len]); token[len] = '\0'; *type = all_ascii && is_keyword(token, keywords, num_keywords) - ? "keyword" : "identifier"; + ? "keyword" : "identifier"; break; } @@ -827,7 +831,7 @@ int main(int argc, char *argv[]) fputs( "A tokenizer for Python (3) source code with output in 6 formats.\n" "Recognizes the following token classes: keyword, identifier, integer,\n" -"floating, imaginary, string, and operator.\n\n", stdout); +"floating, imaginary, string, and operator.\n\n", stderr); fprintf(stderr, usage_str, basename(argv[0])); fputs( "\nCommand line options are:\n" @@ -885,7 +889,7 @@ fputs( case '?': default: - fputs("(F): unknown option. Stop.\n", stderr); + fputs("(F): Unknown option. 
Stop.\n", stderr); fprintf(stderr, usage_str, argv[0]); return 1; } @@ -893,7 +897,7 @@ fputs( if (outfile && outfile[0]) { if (!freopen(outfile, "w", stdout)) { - fprintf(stderr, "(F): cannot open %s for writing.\n", outfile); + fprintf(stderr, "(F): Cannot open %s for writing.\n", outfile); exit(3); } } @@ -905,7 +909,7 @@ fputs( filename = argv[optind]; if (!freopen(filename, "r", stdin)) { if (!nowarn) - fprintf(stderr, "(W): Cannot read file %s.\n", filename); + fprintf(stderr, "(W): Cannot read file %s; skipped.\n", filename); continue; } @@ -963,11 +967,11 @@ fputs( while (tokenize(token, &type, &line, &col)) { switch (mode) { case RAW: - // Watch out for multi-line strings + // Watch out for multi-line strings if (!strcmp(type, "string")) RAW_escape(stdout, token); - else - fputs(token, stdout); + else + fputs(token, stdout); fputc('\n', stdout); break; case PLAIN: diff --git a/tools/tokenizer/schemas/schema.json b/tools/tokenizer/schemas/schema.json index 61909db..0723d70 100644 --- a/tools/tokenizer/schemas/schema.json +++ b/tools/tokenizer/schemas/schema.json @@ -1,7 +1,7 @@ { "$schema": "http://json-schema.org/draft-04/schema#", "title": "JSON Schema for Tokenizer JSON Output", - "description": "Prepared by Geert Janssen \nCopyright IBM Corporation 2020.", + "description": "Prepared by Geert Janssen \nCopyright IBM Corporation 2020, 2021.", "type": "array", "items": { @@ -12,8 +12,10 @@ "class": { "enum": [ "identifier", "keyword", "integer", "floating", "string", "character", "operator", "preprocessor", - "filename" + "filename", "line_comment", "block_comment", "newline", + "continuation", "whitespace" ] }, + "length": { "$ref": "#/definitions/unsignedInt" }, "token": { "type": "string" } }, "required": [ "line", "column", "class", "token" ], diff --git a/tools/tokenizer/schemas/schema.rnc b/tools/tokenizer/schemas/schema.rnc index 3adaad8..1eb43d7 100644 --- a/tools/tokenizer/schemas/schema.rnc +++ b/tools/tokenizer/schemas/schema.rnc @@ -1,5 +1,5 @@ # Compact RELAX NG (RNC) Schema for Tokenizer XML Output -# Copyright IBM Corporation 2020 +# Copyright IBM Corporation 2020, 2021 # Prepared by Geert Janssen datatypes xsd = 'http://www.w3.org/2001/XMLSchema-datatypes' @@ -16,9 +16,11 @@ doc = attribute line { xsd:unsignedInt }, attribute column { xsd:unsignedInt }, attribute class { token-classes }, + attribute length { xsd:unsignedInt }, text } token-classes = "identifier" | "keyword" | "integer" | "floating" | "string" | - "character" | "operator" | "preprocessor" | "filename" + "character" | "operator" | "preprocessor" | "filename" | + "line_comment" | "block_comment" | "newline" | "continuation" | "whitespace" diff --git a/tools/tokenizer/schemas/tokml-schema.rnc b/tools/tokenizer/schemas/tokml-schema.rnc new file mode 100644 index 0000000..55e1165 --- /dev/null +++ b/tools/tokenizer/schemas/tokml-schema.rnc @@ -0,0 +1,73 @@ +# XML RNC schema for tokML 1.0 +# Copyright IBM Corporation 2021 +# Prepared by Geert Janssen + +datatypes xsd = 'http://www.w3.org/2001/XMLSchema-datatypes' + +#default namespace = "https://www.ibm.com/tokml" + +start = source + +# Children are token elements interspersed with white-space. +source = element source { + attribute language { "C" | "C++" | "Java" }, + attribute filename { xsd:string }?, + ( line_comment | + block_comment | + keyword | + identifier | + integer | + floating | + \string | + character | + operator | + preprocessor | + text )* +} + +# Attributes common to all token elements. 
+common-attrs = + ( attribute line { xsd:unsignedInt }, + attribute col { xsd:unsignedInt }, + attribute len { xsd:unsignedInt } ) + +line_comment = element line_comment { + common-attrs, + text +} +block_comment = element block_comment { + common-attrs, + text +} +keyword = element keyword { + common-attrs, + text +} +identifier = element identifier { + common-attrs, + text +} +integer = element integer { + common-attrs, + text +} +floating = element floating { + common-attrs, + text +} +\string = element string { + common-attrs, + text +} +character = element character { + common-attrs, + text +} +operator = element operator { + common-attrs, + text +} +preprocessor = element preprocessor { + common-attrs, + text +} diff --git a/tools/tokenizer/token_common.c b/tools/tokenizer/token_common.c index 6eefa7e..ba57ba1 100644 --- a/tools/tokenizer/token_common.c +++ b/tools/tokenizer/token_common.c @@ -25,7 +25,7 @@ unsigned num_files = 0; // number of files read int debug = 0; // when 1 debug output to stderr int verbose = 0; // when 1 info output to stderr int nowarn = 0; // when 1 warnings are suppressed -Language source = C; // language mode +Language source = C; // language mode /* Conversion table from filename extension to language code. To find language code, consider all entries and check each ext @@ -52,7 +52,7 @@ static const struct { const char *ext; Language lang; const char *name; } Returns word found (i.e., pointer value in table) or 0. */ const char *is_keyword(const char *word, - const char *table[], unsigned size) + const char *table[], unsigned size) { int i = 0, j = size; while (i < j) { @@ -90,7 +90,7 @@ void remove_BOM(void) if (c2 == 0xBB) { int c3 = getchar(); if (c3 == 0xBF) { - return; + return; } if (c3 != EOF) buffer[buffered++] = c3; } @@ -217,7 +217,7 @@ Language detect_lang(void) int i; for (i = 0; i < sizeof(langs)/sizeof(langs[0]); i++) if (!strcmp(p, langs[i].ext)) - return langs[i].lang; + return langs[i].lang; } return C; } diff --git a/tools/tokenizer/token_common.h b/tools/tokenizer/token_common.h index 1ea4706..2cfe81e 100644 --- a/tools/tokenizer/token_common.h +++ b/tools/tokenizer/token_common.h @@ -47,7 +47,7 @@ extern int nowarn/*= 0*/; // when 1 warnings are suppressed extern Language source/*= C*/; // language mode extern const char *is_keyword(const char *word, - const char *table[], unsigned size); + const char *table[], unsigned size); extern int get(void); extern void unget(int cc); diff --git a/tools/tokenizer/tokenize.c b/tools/tokenizer/tokenize.c index b1dc55f..559ed80 100644 --- a/tools/tokenizer/tokenize.c +++ b/tools/tokenizer/tokenize.c @@ -1,11 +1,11 @@ -/* Copyright (c) 2020, 2021 International Business Machines Corporation +/* Copyright (c) 2021, 2022 International Business Machines Corporation Prepared by: Geert Janssen - Simple C/C++ (and Java) Tokenizer. + Simple C/C++ and Java Tokenizer. For the most part assumes that the input source text is grammatically - correct C or C++ code. - (Since Java at the lexical level is very close, could in principle - also be used as Java tokenizer, albeit that not all of its keywords + correct C, C++, or Java code. + (Since Java at the lexical level is very close to C, we here sort of misuse + it as a Java tokenizer, albeit that not all of its keywords and some literal pecularities are not recognized.) Recognizes the following lexeme classes: @@ -30,10 +30,11 @@ its starting character. Line and column reflect positions in the physical line structure, not the logical one. 
 All token literals are output exactly as they appear in the source text,
- without any interpretation of escaped characters etc.
+ without any interpretation of escaped characters etc. However, the particular
+ output format will enforce certain escaping as needed.

- Moreover, skips white-space, control characters and comments and
- flags anything left over as illegal characters.
+ Moreover, white-space, control characters and comments are normally skipped
+ and anything left over is flagged as illegal characters.

 See these refs for details on the lexical definitions:
 C++14 Final Working Draft: n4140.pdf
@@ -44,7 +45,6 @@
 (A TAB is counted as a single character position.
 A CR causes a transition to a new line.)
 No trigraph sequences (??x) are recognized.
- No alternative tokens except keyword ones for certain operators.
 No universal characters (\u and \U) in an identifier.
 Raw strings with R prefix are not supported.
 No preprocessing is attempted: phrases like #include are
@@ -69,6 +69,7 @@
 1: illegal character(s) or premature EOF detected
 2: look-ahead buffer overflow
 3: output file cannot be opened
+ 4: could not (re-)allocate token buffer

 C++ Token categories as Regular Expressions:
 (\b = [01], \o = [0-7], \d = [0-9], \x = [a-fA-F0-9],
@@ -78,11 +79,11 @@
 - identifier: [_a-zA-Z][_a-zA-Z0-9]*
 - integer : 0[bB]\b('?\b])*\s?
 | 0('?\o)*\s?
- | 0[xX]\x('?\x)*\s?
- | [1-9]('?\d)*\s?
+ | 0[xX]\x('?\x)*\s?
+ | [1-9]('?\d)*\s?
 - floating : .\d('?\d)*([eE][-+]?\d('?\d)*)?[fFlL]?
 | \d('?\d)*.(\d('?\d)*)?([eE][-+]?\d('?\d)*)?[fFlL]?
- | \d('?\d)*[eE][-+]?\d('?\d)*[fFlL]?
+ | \d('?\d)*[eE][-+]?\d('?\d)*[fFlL]?
 - string : [uUL]?"([^"\\\n]|\\.|\\\n)*"
 - character : [uUL]?'([^']|\\.)'
 - operator : one of these operator and punctuation symbols:
@@ -92,851 +93,10 @@
 - preprocessor : # | ##
 */

-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include <assert.h>
-#include <unistd.h> /* getopt() */
-#include <libgen.h> /* basename() */
+#include <unistd.h> /* getopt() */
+#include <libgen.h> /* basename() */

-/* Let's introduce more parameters so that it becomes easier to
- configure the state-machines for the various tokens.
- Use a NUL character to disable the parameter, i.e., a NUL value
- means "this char is not in effect; a test for it fails".
-
- FIXME: not yet used!
-*/
-// Character that may be used to group digits in a number:
-#define CFG_DIGITS_SEP '\''
-// Extra character that may start an identifier:
-#define CFG_ID_START_EXTRA '_'
-// Extra character that may continue an identifier:
-// Maybe allows a set of characters, like also $?
-#define CFG_ID_CONT_EXTRA '_'
-// May a floating-point number start with a decimal point:
-//#define CFG_FLOAT_DOT
-
-// FIXME: make token size dynamic.
-#define MAX_TOKEN 65535 // maximum token length in chars (\0 exclusive)
-#define MAX_BUF 8 // maximum buffer size in chars
-
-// Program globals:
-static char *filename = "stdin";// current file being parsed
-static unsigned linenr = 1; // line number counted from 1
-static unsigned column = 0; // char position in line, counted from 0
-static unsigned char_count = 0; // total char/byte count
-static unsigned utf8_count = 0; // total utf-8 char count
-static char buffer[MAX_BUF]; // use buffer as multi-char lookahead.
-static unsigned buffered = 0; // number of buffered chars -static unsigned saved_col = 0; // one-place buf for last column on prev line -static unsigned illegals = 0; // count number of illegal characters -static unsigned unexpect_eof = 0; // encountered unexpected EOF -static unsigned num_files = 0; // number of files read -// keyword lookup function: -static const char *(*is_keyword)(const char *); - -// Program option settings: -static int debug = 0; // when 1 debug output to stderr -static int verbose = 0; // when 1 info output to stderr -static int nowarn = 0; // when 1 warnings are suppressed -static int hash_as_comment = 0; // when 1 treat # as line comment -static int start_token = 0; // when 1 start filename pseudo-token -static int newline_token = 0; // when 1 output newline pseudo-token -static int continuous_files = 0;// when 1 do not reset after each file -static enum { C, CPP, JAVA } source = CPP; - -/* No longer using perfect hash function but simple binary search. */ - -/* C11 n1570.pdf 6.4.1 (44) - C17 n2176.pdf 6.4.1 (A.1.2) (44) -*/ -static const char *C_keywords[] = { - "_Alignas", "_Alignof", "_Atomic", "_Bool", "_Complex", - "_Generic", "_Imaginary", "_Noreturn", "_Static_assert", - "_Thread_local", - - "auto", "break", "case", "char", "const", - "continue", "default", "do", "double", "else", - "enum", "extern", "float", "for", "goto", - "if", "inline", "int", "long", "register", - "restrict", "return", "short", "signed", "sizeof", - "static", "struct", "switch", "typedef", "union", - "unsigned", "void", "volatile", "while" -}; - -#if 0 -/* C++ 2014 n4296.pdf 2.11 (84) */ -static const char *CPP_keywords[] = { - "alignas", "alignof", "and", "and_eq", "asm", - "auto", "bitand", "bitor", "bool", "break", - "case", "catch", "char", "char16_t", "char32_t", - "class", "compl", "const", "const_cast", "constexpr", - "continue", "decltype", "default", "delete", "do", - "double", "dynamic_cast", "else", "enum", "explicit", - "export", "extern", "false", "float", "for", - "friend", "goto", "if", "inline", "int", - "long", "mutable", "namespace", "new", "noexcept", - "not", "not_eq", "nullptr", "operator", "or", - "or_eq" "private", "protected", "public", "register", - "reinterpret_cast", "return", "short", "signed", "sizeof", - "static", "static_assert", "static_cast", "struct", "switch", - "template", "this", "thread_local", "throw", "true", - "try", "typedef", "typeid", "typename", "union", - "unsigned", "using", "virtual", "void", "volatile", - "wchar_t", "while", "xor", "xor_eq" -}; -#endif - -/* C++23 n4885.pdf 5.11 (92) */ -static const char *CPP_keywords[] = { - "alignas", "alignof", "and", "and_eq", "asm", - "auto", "bitand", "bitor", "bool", "break", - "case", "catch", "char", "char16_t", "char32_t", - "char8_t", "class", "co_await", "co_return", "co_yield", - "compl", "concept", "const", "const_cast", "consteval", - "constexpr", "constinit", "continue", "decltype", "default", - "delete", "do", "double", "dynamic_cast", "else", - "enum", "explicit", "export", "extern", "false", - "float", "for", "friend", "goto", "if", - "inline", "int", "long", "mutable", "namespace", - "new", "noexcept", "not", "not_eq", "nullptr", - "operator", "or", "or_eq" "private", "protected", - "public", "register", "reinterpret_cast", "requires","return", - "short", "signed", "sizeof", "static", "static_assert", - "static_cast", "struct", "switch", "template", "this", - "thread_local", "throw", "true", "try", "typedef", - "typeid", "typename", "union", "unsigned", "using", - "virtual", 
"void", "volatile", "wchar_t", "while", - "xor", "xor_eq" -}; - -/* Java SE 8 (50) (false, true, null are literals) */ -/* https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.9 */ -static const char *Java_keywords[] = { - "abstract", "assert", "boolean", "break", "byte", "case", - "catch", "char", "class", "const", "continue", "default", - "do", "double", "else", "enum", "extends", "final", - "finally", "float", "for", "goto", "if", "implements", - "import", "instanceof", "int", "interface", "long", "native", - "new", "package", "private", "protected", "public", "return", - "short", "static", "strictfp","super", "switch", "synchronized", - "this", "throw", "throws", "transient", "try", "void", - "volatile", "while" -}; - -#define num_keywords(lang) sizeof(lang##_keywords)/sizeof(lang##_keywords[0]); - -/* Generic binary search lookup in some keyword table. - `word' to be searched must be NUL-terminated C string. - `table' is array of const char * of `size' sorted alphabetically. - Returns word found (i.e., pointer value in table) or 0. -*/ -#define lang_is_keyword(lang) \ - static const char *lang##_is_keyword(const char *word) \ - { \ - int i = 0, j = num_keywords(lang); \ - while (i < j) { \ - int k = (i + j) >> 1 /* / 2 */; \ - const char *kw = lang##_keywords[k]; \ - int cmp = strcmp(word, kw); \ - if (!cmp) \ - return kw; \ - if (cmp < 0) j = k; else i = k + 1; \ - } \ - return 0; \ - } - -/* Define individual is_keyword functions per language: */ -/* C_is_keyword */ -lang_is_keyword(C) -/* CPP_is_keyword */ -lang_is_keyword(CPP) -/* Java_is_keyword */ -lang_is_keyword(Java) - -// Append char cc to token; discard when no more room: -#define token_add(cc) \ - do { if (len < MAX_TOKEN) token[len++] = (cc); } while(0) - -#define utf8_start(cc) (((cc)&0xC0)!=0x80) -#define utf8_follow(cc) (((cc)&0xC0)==0x80) - -#define utf8_len(cc) \ - (((cc)&0xF8)==0xF0 ? 4 : ((cc)&0xF0)==0xE0 ? 3 : ((cc)&0xE0)==0xC0 ? 2 : 1) - -/* Let's assume UTF-8 encoding. - https://www.cprogramming.com/tutorial/unicode.html - https://opensource.apple.com/source/tidy/tidy-2.2/tidy/src/utf8.c.auto.html -*/ - -void unget(int cc) -{ - if (cc == EOF) return; - if (buffered < MAX_BUF) { - if (cc == '\n') { - linenr--; - // column was 0 right after getting the \n - // hopefully there are no multiple ungets of \n - column = saved_col; - } - else - column--; - buffer[buffered++] = cc; - } - else { - fprintf(stderr, "(F): Lookahead buffer overflow (MAX=%u).\n", MAX_BUF); - exit(2); - } -} - -// Act like getchar(). -// Mind linenr,column apply to physical lines not logical ones. -int get(void) -{ - int cc; - - restart: - // Get the next character: - if (buffered) // chars available in lookahead buffer - cc = buffer[--buffered]; // never EOF - // cc might be \ and followed by fresh \n - // Note: never can have buffered line continuation, i.e., \ \n. - else { // must read fresh char - cc = getchar(); - if (cc == EOF) return EOF; - // Count all chars, even the \ of a line continuation: - char_count++; - if (utf8_start(cc)) utf8_count++; - } - - // Treat Mac line endings ('\r') as regular newlines: - if (cc == '\n' || cc == '\r') { - linenr++; - saved_col = column; - column = 0; - return '\n'; - } - - // Deal with \ line continuations! Must look ahead. - if (cc == '\\') { - // Must look ahead; mind next char might be buffered! 
- if (buffered) - // Never can have \n for next char: - assert(buffer[buffered-1] != '\n'); - else { - // Must get fresh character: - int nc = getchar(); // do not count yet; maybe must unget - - // Maybe \r \n combination? - if (nc == '\r') { - // Look ahead for \n: - int c2 = getchar(); // do not count yet; maybe must unget - if (c2 == '\n') { - // Skip \r but count it: - char_count++; - utf8_count++; - nc = '\n'; - } - else { - unget(c2); - // nc == '\r' - } - } - - if (nc == '\n') { // 1 logical line: discard \\n combo: - char_count++; // counts the newline - linenr++; // on next physical line - // never unget a continuation - //saved_col = column; - column = 0; - - // Still need to get a character. - // Could again start a line continuation! - goto restart; - } - // Mind nc not \n but maybe \ or \r, then goes to buffer. - unget(nc); - } - // cc == '\\' a regular backslash - } - column++; - return cc; -} - -/* Tokenization of C++ programming language source text. - Recognizes: - - identifier - - reserved word/keyword - - binary, octal, decimal, hexadecimal and floating-point numbers - - double-quoted string literal - - single-quoted character literal - - all single, double, and triple operator and punctuation symbols - - the preprocessor tokens # and ## - Skips white-space, control characters and comments and flags anything - left over as illegal characters. - - (In the order of 20 tests per single character worst-case.) - - Returns 0 upon EOF or error. -*/ -int tokenize(char *token, const char **type, unsigned *line, unsigned *col) -{ - unsigned len; - int cc; - *type = ""; - - do { // infinite loop; after token recognized breaks out. - len = 0; - cc = get(); - - restart: - // cc already read. - - /*** WHITE-SPACE ***/ - - // Skip (abutted) space and control chars and comments: - // [ \t\f\v\n] - // while (cc <= ' ' && cc != EOF) - while (isspace(cc) && cc != EOF && cc != '\n') - cc = get(); - if (cc == EOF) - return 0; - if (cc == '\n') { - if (newline_token) { - // token is empty. - *line = linenr-1; - *col = saved_col; - *type = "newline"; - break; - } - cc = get(); - goto restart; - } - // !isspace(cc) && cc != EOF - - /*** OPTIONAL # LINE COMMENT (to ignore preprocessor statements) ***/ - // Java: no preprocessor directives. - - if (cc == '#' && hash_as_comment) { - // Skip till end-of-line (\n exclusive): - while ((cc = get()) != '\n' && cc != EOF) - ; - // cc == '\n' || cc == EOF - goto restart; - } - - /*** LINE COMMENT AND BLOCK COMMENT (C/C++/Java) ***/ - - if (cc == '/') { - cc = get(); - if (cc == '/') { - // Skip till end-of-line (\n exclusive): - while ((cc = get()) != '\n' && cc != EOF) - ; - // cc == '\n' || cc == EOF - goto restart; - } - - if (cc == '*') { - // Remember start position: - unsigned lin = linenr; - - // Skip till */ inclusive: - int nc = get(); // if EOF next get will be EOF too - do { - cc = nc; - nc = get(); - if (nc == EOF) { // Error! - fprintf(stderr, - "(E): [%s:%u] Unexpected end-of-file in /* comment.\n", - filename, lin); - unexpect_eof++; - return 0; - } - } while (cc != '*' || nc != '/'); - // cc == '*' && nc == '/' - cc = get(); - goto restart; - } - // seen / but not // or /* - unget(cc); // char after / - cc = '/'; // restore / - } - - // Start collecting a token. - // Token should finish with cc being last char of token! 
- *line = linenr; - *col = column-1; // 1 char lookahead - - /*** CHAR and STRING PREFIX (C/C++) ***/ - - // Allow u,U,L prefix for string and char - // FIXME: allow u8 as prefix for string - if (cc == 'L' || cc == 'u' || cc == 'U') { - token[len++] = cc; - cc = get(); - if (cc == '"') - goto string_token; - if (cc == '\'') - goto char_token; - // u,U,L will be interpreted as (start of) identifier. - unget(cc); // char after u,U,L - cc = token[--len]; // restore original and remove from token - } - - /*** IDENTIFIER (C/C++/Java) and KEYWORD (C/C++) ***/ - // Java: false, true, null are literals - // FIXME: Flag to allow .letter as part of identifier? - // (compound identifier) - - // Simplistic solution to allowing Unicode: allow any char >= 128 without - // actual checking for UTF-8. - if (isalpha(cc) || cc == '_' || cc == '$' || cc & 0x80) { - // First char always fits. - token[len++] = cc; - while (isalnum(cc = get()) || cc == '_' || cc == '$' || - cc != EOF && (cc & 0x80)) - token_add(cc); - unget(cc); - token[len] = '\0'; - *type = is_keyword(token) ? "keyword" : "identifier"; - break; - } - - /*** INTEGER and FLOATING ***/ - // Java: uses _ in numbers as insignificant separator - // Java: decimal suffix: [lL], float suffix: [fFdD] - // Java: allows hex float - -#if 0 - // Examples: - int bin_num = 0B010101u; - int oct_num = 01234567L; - int hex_num = 0x123ABCLL; - int dec_num = 12345678; - - float flt_num1 = 077.; - float flt_num2 = 077.987; - float flt_num3 = 77.; - float flt_num4 = .77; -#endif - - // . digits ... floating - if (cc == '.') { - // Look ahead for a digit: - int nc; - if (isdigit(nc = get())) { - unget(nc); - goto start_fraction; - } - unget(nc); - // Could go immediately to operator: goto seen_period - } - - if (isdigit(cc)) { // binary, octal, decimal, or hexadecimal literal - // Types of integer literals: - enum { - BIN, OCT, DEC, HEX - } int_lit = cc == '0' ? OCT : DEC; - - // Lookahead: - int nc = get(); - if (int_lit == OCT && (nc == 'x' || nc == 'X')) { - int_lit = HEX; - token_add(cc); // the 0 - cc = nc; // the x or X - } - else - if (int_lit == OCT && (nc == 'b' || nc == 'B')) { - int_lit = BIN; - token_add(cc); // the 0 - cc = nc; // the b or B - } - else - unget(nc); // isdigit(cc) - - do { - token_add(cc); - cc = get(); - - // Allow for ' between `digits': - if (cc == '\'') { - // Keep the ' in the token for now: - token_add(cc); - int nc = get(); - if (isdigit(nc) || int_lit == HEX && isxdigit(nc)) - cc = nc; - else { // Error! - fprintf(stderr, - "(E): [%s:%u] C++14 only allows ' between digits.\n", - filename, linenr); - // what to do? - } - } - } while (isdigit(cc) || int_lit == HEX && isxdigit(cc)); - // !is[x]digit(cc) - - // FIXME: allow hex floats in C - if (int_lit == OCT || int_lit == DEC) { - int floating = 0; - // Seen digits-sequence. Maybe followed by . or e or E? - if (cc == '.') { // fractional part - start_fraction: - floating = 1; - token_add(cc); - // digits? FIXME: again allow ' between digits - while (isdigit(cc = get())) - token_add(cc); - // !isdigit(cc) - } - // cc != '.' 
|| !isdigit(cc) - if (cc == 'e' || cc == 'E') { // exponent - floating = 1; - token_add(cc); - if ((cc = get()) == '-' || cc == '+') { - token_add(cc); - cc = get(); - } - // FIXME: no check for at least 1 digit - // FIXME: again allow ' between digits - while (isdigit(cc)) { - token_add(cc); - cc = get(); - } - // !isdigit(cc) - } - if (floating) { - if (cc == 'f' || cc == 'F' || cc == 'l' || cc == 'L') - token_add(cc); - else - unget(cc); - *type = "floating"; - break; - } - } - - // optional integer suffix: l, ll, lu, llu, u, ul, ull, any case - if (cc == 'l' || cc == 'L') { - token_add(cc); - // maybe another l - cc = get(); - if (cc == 'l' || cc == 'L') { - token_add(cc); - // Here: token is digits[lL][lL] - cc = get(); - } - // maybe a u - if (cc == 'u' || cc == 'U') - // Here: token is digits[lL][lL]?[u|U] - token_add(cc); - else - unget(cc); - } - else if (cc == 'u' || cc == 'U') { - token_add(cc); - // maybe an l - cc = get(); - if (cc == 'l' || cc == 'L') { - token_add(cc); - // Here: token is digits[uU][lL] - cc = get(); - } - // maybe another l - if (cc == 'l' || cc == 'L') - // Here: token is digits[uU][lL]?[lL] - token_add(cc); - else - unget(cc); - } - else - unget(cc); - *type = "integer"; - break; - } - - /*** STRING (C/C++/Java) ***/ - - if (cc == '"') { - string_token: - // First char always fits. - token[len++] = cc; - // Remember start position: - unsigned lin = linenr; - // Watch out for escaped " inside string. - cc = get(); - while (cc != '"') { - if (cc == EOF) { // Error! - fprintf(stderr, - "(E): [%s:%u] Unexpected end-of-file in string literal.\n", - filename, lin); - unexpect_eof++; - return 0; - } - token_add(cc); - int nc = get(); - - if (cc == '\\') { - // FIXME: No check on valid escape char! - // ' " ? \ a b f n r t v - token_add(nc); - cc = get(); - } - else - cc = nc; - } - // cc == '"' - token_add(cc); - *type = "string"; - break; - } - - /*** CHARACTER (C/C++/Java) ***/ - - if (cc == '\'') { - char_token: - // First char always fits. - token[len++] = cc; - // Watch out for escaped ' inside char. - cc = get(); - // FIXME: Cannot have empty char! - while (cc != '\'') { - if (cc == EOF) { // Error! - fprintf(stderr, - "(E): [%s:%u] Unexpected end-of-file in char literal.\n", - filename, linenr); - unexpect_eof++; - return 0; - } - token_add(cc); - int nc = get(); - if (cc == '\\') { - token_add(nc); - cc = get(); - // FIXME: No check on valid escape char! - // ' " ? \ a b f n r t v 0[d[d]] xh* - } - else - cc = nc; - } - // cc == '\'' - token_add(cc); - *type = "character"; - break; - } - - /*** OPERATOR (and PUNCTUATION) (C/C++/Java) ***/ - - // Operator and punctuation symbols. Longest match. - - /* Operator or punctuator Alternative representation - { <% - } %> - [ <: - ] :> - # %: (not supported here) - ## %:%: (not supported here) - */ - - // Single char operator or punctuator (C/C++/Java) - // { } [ ] ( ) ; : ? . ~ ! + - * / % ^ = & | < > , - // Double char operator or punctuator (C/C++) - // <: :> <% %> - // Double char operator or punctuator (C/C++/Java) - // += -= *= /= %= ^= &= |= == != <= >= && || << >> ++ -- -> - // Double char operator or punctuator (C++/Java) - // :: - // Double char operator or punctuator (C++) - // .* - // Triple char operator or punctuator (C/C++/Java) - // ... <<= >>= - // Triple char operator or punctuator (C++) - // ->* <=> - // Java: @ >>> >>>= - - //seen_period: - - // First char always fits. 
- token[len++] = cc;
- token[len] = '\0';
- //token=[cc,0];len=1
-
- if (strstr("{}[]();?~,@", token)) { // allow @ for Java
- // Single char operator/punctuator.
- *type = "operator";
- break;
- }
-
- if (strstr("<:.-+*/%^&|=!>", token)) { // single or start of double/triple
- // Check second char:
- int c2 = get();
- if (c2 != EOF) {
- token[len++] = c2;
- //token=[cc,c2];len=2
-
- // Check third char:
- int c3 = get();
- if (c3 != EOF) {
- token[len++] = c3;
- token[len] = '\0';
- //token=[cc,c2,c3,0];len=3
- if (!strcmp(">>>", token)) { // allow >>> for Java
- //token=[>,>,>,0];len=3
- // Look-ahead for =:
- int c4 = get();
- if (c4 == '=') // >>>= for Java
- token[len++] = c4;
- //token=[>,>,>,=];len=4
- else
- unget(c4);
- //token=[>,>,>,0];len=3
- *type = "operator";
- break;
- }
- //token=[cc,c2,c3,0];len=3
-
- if (!strcmp("...", token) ||
- !strcmp("<=>", token) ||
- !strcmp("->*", token) ||
- !strcmp("<<=", token)) {
- // Triple char operator/punctuator.
- *type = "operator";
- break;
- }
-
- // Maybe double char. Undo the c3 token extension:
- token[--len] = '\0';
- //token=[cc,c2,0];len=2
- }
- else
- token[len] = '\0';
- //token=[cc,c2,0];len=2
- unget(c3);
-
- // Maybe double char.
- static const char * const ops2[] = {
- "<:", "<%", "<=", "<<", ":>",
- "::", ".*", "->", "-=", "--",
- "+=", "++", "*=", "/=", "%>",
- "%=", "^=", "&=", "&&", "|=",
- "||", "==", "!=", ">=", ">>"
- };
- unsigned size = sizeof(ops2) / sizeof(ops2[0]);
- unsigned i;
- for (i = 0; i < size; i++)
- if (!strcmp(ops2[i], token))
- break;
- if (i < size) {
- *type = "operator";
- break;
- }
- //token=[cc,c2,0];len=2
-
- // Must be single char. Undo the c2 token extension:
- token[--len] = '\0';
- //token=[cc,0];len=1
- }
- //else token=[cc,0];len=1
-
- // Must be single char.
- unget(c2);
- *type = "operator";
- break;
- }
- //token=[cc,0];len=1
-
- /*** PREPROCESSOR (C/C++) ***/
-
- if (cc == '#') {
- int nc = get();
- if (nc != '#')
- unget(nc);
- else
- token[len++] = nc;
- *type = "preprocessor";
- break;
- }
-
- // What is left here? Illegal chars!
- if (!nowarn)
- // Mind non-printing chars!
- fprintf(stderr,
- "(W): [%s:%u] Illegal character `%s%c` (0x%02x) skipped.\n",
- filename, linenr, cc<32?"CTRL-":"", cc<32?cc+64:cc, cc);
- // Count them:
- illegals++;
-
- } while (1);
- // len <= MAX_TOKEN
- token[len] = '\0';
- return 1;
-}
-
-// Escape token for output as CSV string.
-void CSV_escape(FILE *out, const char *token)
-{
- const char *p;
- // start CSV string:
- fputc('"', out);
- for (p = token; *p; p++) {
- if (*p == '"')
- fputc('"', out);
- fputc(*p, out);
- }
- // end CSV string:
- fputc('"', out);
-}
-
-// Escape token for output as JSON string.
-void JSON_escape(FILE *out, const char *token)
-{
- // C/C++ has escapes: \' \" \? \a \b \f \n \r \t \v \x \0.
- // To preserve, simply escape the escape and all ":
- const char *p;
- for (p = token; *p; p++) {
- if (*p == '\\' || *p == '"')
- fputc('\\', out);
- fputc(*p, out);
- }
-}
-
-// Escape token for output as XML text.
-void XML_escape(FILE *out, const char *token)
-{
-#if 1
- // Alternative: escape every <, >, and &:
- const char *p;
- for (p = token; *p; p++) {
- if (*p == '<')
- fputs("&lt;", out);
- else
- if (*p == '>')
- fputs("&gt;", out);
- else
- if (*p == '&')
- fputs("&amp;", out);
- else
- fputc(*p, out);
- }
-#else
- // Use CDATA construct for escaping.
- // Impossible to escape ]]> occurring in token!
- // Must chop up the substring ]]> in ]] and >.
- const char *p;
- const char *q = token;
- // "abc]]>hello" => <![CDATA[abc]]]]><![CDATA[>hello]]>
- // "]]>]]>" => <![CDATA[]]]]><![CDATA[>]]]]><![CDATA[>]]>
- while ((p = strstr(q, "]]>"))) {
- int len = p - q; // always > 0
- fprintf(out, "<![CDATA[%.*s]]>", len+2, q);
- q = p+2; // q start at >...
- }
- if (q < token+strlen(token))
- fprintf(out, "<![CDATA[%s]]>", q);
-#endif
-}
+#include "libtoken.h"

 int main(int argc, char *argv[])
 {
@@ -944,19 +104,26 @@
 extern int opterr;
 extern int optind;
 int option;
- char const *opt_str = "1acdhjl:m:no:rsvw";
+ char const *opt_str = "1acdhjkl:m:nNo:rsvwW";
 char usage_str[80];
- char token[MAX_TOKEN+1]; /* leave room for a terminating NUL */
- const char *type;
+ const char *token;
+ enum TokenClass type;
 unsigned line;
 unsigned col;
+ unsigned pos;
+ unsigned token_len;
+ unsigned num_files = 0; // number of files read
+ int start_token = 0; // when 1 start filename pseudo-token
+ int continuous_files = 0; // when 1 do not reset after each file
 char *outfile = 0;
 enum { PLAIN, CSV, JSON, JSONL, XML, RAW } mode = PLAIN;
 int first_time = 1;
+ Language source;
 int explicit_source = 0;
 int append = 0;
+ int suppress_newline = 0;

 sprintf(usage_str, "usage: %%s [ -%s ] [ FILES ]\n", opt_str);
@@ -984,7 +151,7 @@
 fputs(
"A tokenizer for C/C++ (and Java) source code with output in 6 formats.\n"
"Recognizes the following token classes: keyword, identifier, integer,\n"
-"floating, string, character, operator, and preprocessor.\n\n", stdout);
+"floating, string, character, operator, and preprocessor.\n\n", stderr);
 fprintf(stderr, usage_str, basename(argv[0]));
 fputs(
"\nCommand line options are:\n"
@@ -993,34 +160,33 @@
"-d : print debug info to stderr; implies -v.\n"
"-h : print just this text to stderr and stop.\n"
"-j : assume input is Java (deprecated: use -l Java or .java).\n"
+"-k : output line and block comments as tokens.\n"
"-l : specify language explicitly (C, C++, Java).\n"
"-m : output mode either plain (default), csv, json, jsonl, xml, or raw.\n"
"-n : output newlines as a special pseudo token.\n"
+"-N : output line continuations as a special pseudo token.\n"
"-o : write output to this file (instead of stdout).\n"
+"-r : suppress newline after each token in raw mode.\n"
"-s : enable a special start token specifying the filename.\n"
"-1 : treat all filename arguments as a continuous single input.\n"
"-v : print action summary to stderr.\n"
-"-w : suppress all warning messages.\n",
+"-w : suppress all warning messages.\n"
+"-W : output adjacent white-space as a token.\n",
 stderr);
 return 0;

 case 'j':
- source = JAVA;
+ source = set_or_detect_lang("Java");
 explicit_source = 1;
 break;

+ case 'k':
+ comment_token = 1;
+ break;
+
 case 'l':
- if (!strcmp(optarg, "C"))
- source = C;
- else if (!strcmp(optarg, "C++"))
- source = CPP;
- else if (!strcmp(optarg, "Java"))
- source = JAVA;
- else {
- if (!nowarn)
- fprintf(stderr, "(W): Unknown source %s (assuming C++).\n", optarg);
- }
- explicit_source = 1;
+ source = set_or_detect_lang(optarg);
+ explicit_source = 1;
 break;

 case 'm':
@@ -1037,7 +203,7 @@
 else if (!strcmp(optarg, "raw"))
 mode = RAW;
 else {
- if (!nowarn)
+ if (!nowarn)
 fprintf(stderr, "(W): Invalid mode %s (using plain).\n", optarg);
 mode = PLAIN;
 }
@@ -1047,10 +213,18 @@
 newline_token = 1;
 break;

+ case 'N':
+ continuation_token = 1;
+ break;
+
 case 'o':
 outfile = optarg;
 break;

+ case 'r':
+ suppress_newline = 1;
+ break;
+
 case 's':
 start_token = 1;
 break;
@@ -1063,6 +237,10 @@
 nowarn = 1;
 break;

+ case 'W':
+ whitespace_token = 1;
+ break;
break; + case '?': default: fputs("(F): unknown option. Stop.\n", stderr); @@ -1088,34 +266,14 @@ fputs( fprintf(stderr, "(W): Cannot read file %s.\n", filename); continue; } - if (!explicit_source) { - // Determine language from extension: - int len = strlen(filename); - if (len > 2 && !strcmp(filename+len-2, ".c")) - source = C; - else if (len > 4 && !strcmp(filename+len-4, ".cpp")) - source = CPP; - else if (len > 5 && !strcmp(filename+len-5, ".java")) - source = JAVA; - } + + if (!explicit_source) + source = set_or_detect_lang(0); doit: if (verbose) fprintf(stderr, "(I): Processing file %s...\n", filename); num_files++; - // Determine which keyword lookup function to use: - switch (source) { - case C: - is_keyword = C_is_keyword; - break; - case CPP: - is_keyword = CPP_is_keyword; - break; - case JAVA: - is_keyword = Java_is_keyword; - break; - } - // Header: switch (mode) { case RAW: @@ -1127,59 +285,65 @@ fputs( break; case CSV: if (!continuous_files || num_files == 1) - fputs("line,column,class,token\n", stdout); + fputs("line,column,class,token\n", stdout); if (start_token) fprintf(stdout, "0,0,filename,\"%s\"\n", filename); break; case JSON: case JSONL: if (!continuous_files || num_files == 1) { - if (mode == JSON) fputs("[\n", stdout); + if (mode == JSON) fputs("[\n", stdout); } else { - if (mode == JSON) fputc(',', stdout); - fputc('\n', stdout); - first_time = 1; + if (mode == JSON) fputc(',', stdout); + fputc('\n', stdout); + first_time = 1; } if (start_token) { fprintf(stdout, "{ \"line\": 0, \"column\": 0, " - "\"class\": \"filename\", \"token\": \"%s\" }", - filename); - first_time = 0; + "\"class\": \"filename\", \"length\": %d, \"token\": \"%s\" }", + strlen(filename), filename); + first_time = 0; } break; case XML: if (!continuous_files || num_files == 1) { - fputs("\n", stdout); - // standalone="yes" - fputs("\n", stdout); + fputs("\n", stdout); + // standalone='yes' + fputs("\n", stdout); } if (start_token) { - fprintf(stdout, ""); - XML_escape(stdout, filename); + fprintf(stdout, + "", + strlen(filename)); + XML_escape(stdout, filename); fputs("\n", stdout); } break; } - while (tokenize(token, &type, &line, &col)) { + while ((token_len = C_tokenize_int(&token, &type, &line, &col, &pos))) { switch (mode) { case RAW: fputs(token, stdout); - fputc('\n', stdout); - break; + if (!suppress_newline) fputc('\n', stdout); + break; case PLAIN: - fprintf(stdout, "(%4u,%3u) %s: %s\n", line, col, type, token); + fprintf(stdout, "(%4u,%3u;%6u:%3u) %s: %s\n", + line, col, pos, token_len, token_class[type], token); break; case CSV: // Escape , " in token // csvkit treats . as null fields even as ".". - fprintf(stdout, "%u,%u,%s,", line, col, type); - if (!strcmp(type, "string") || + fprintf(stdout, "%u,%u,%s,", line, col, token_class[type]); + if (type == STRING || // Do we need this too? Yes! 
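+           // CSV_escape doubles any embedded double quote and wraps the
+           // whole field in double quotes, the usual CSV quoting convention.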
-	    !strcmp(type, "character") && strchr(token, '"') ||
-	    !strcmp(type, "character") && strchr(token, ','))
+            type == CHARACTER && (strchr(token, '"') || strchr(token, ',')) ||
+            type == WHITESPACE && strchr(token, '\n') ||
+            type == NEWLINE ||
+            type == CONTINUATION ||
+            comment_token && (type == LINE_COMMENT || type == BLOCK_COMMENT))
          CSV_escape(stdout, token);
        else if (!strcmp(token, ","))
          fputs("\",\"", stdout);
@@ -1194,24 +358,28 @@ fputs(
        else {
          if (mode == JSON) fputc(',', stdout);
          fputc('\n', stdout);
-	}
+        }
        fprintf(stdout, "{ \"line\": %u, \"column\": %u, "
-	       "\"class\": \"%s\", \"token\": \"",
-	       line, col, type);
+                "\"class\": \"%s\", \"length\": %u, \"token\": \"",
+                line, col, token_class[type], token_len);
        // token value is always a JSON string.
-	if (!strcmp(type, "string") || !strcmp(type, "character"))
+        if (type == STRING || type == CHARACTER ||
+            type == NEWLINE || type == WHITESPACE ||
+            type == CONTINUATION)
          JSON_escape(stdout, token);
        else
          fputs(token, stdout);
        fputs("\" }", stdout);
        break;
      case XML:
-	fprintf(stdout, "<token line=\"%u\" column=\"%u\" class=\"%s\">",
-		line, col, type);
-	if (!strcmp(type, "string")
-	    || !strcmp(type, "character")
-	    || !strcmp(type, "operator"))
+        fprintf(stdout, "<token line='%u' column='%u' class='%s' length='%u'>",
+                line, col, token_class[type], token_len);
+        if (type == STRING ||
+            type == CHARACTER ||
+            type == OPERATOR ||
+            comment_token && (type == LINE_COMMENT ||
+                              type == BLOCK_COMMENT))
          XML_escape(stdout, token);
        else
          fputs(token, stdout);
@@ -1224,25 +392,25 @@ fputs(
     // Trailer:
     switch (mode) {
     case RAW:
-      break;
+      break;
     case PLAIN:
-      break;
+      break;
     case CSV:
-      break;
+      break;
     case JSON:
-      fputs("\n]", stdout);
-      /*FALL THROUGH*/
+      fputs("\n]", stdout);
+      /*FALL THROUGH*/
     case JSONL:
-      fputc('\n', stdout);
-      break;
+      fputc('\n', stdout);
+      break;
     case XML:
-      fputs("</tokens>\n", stdout);
-      break;
+      fputs("</tokens>\n", stdout);
+      break;
     }
 
     if (verbose)
-      fprintf (stderr, "(I): %u bytes, %u UTF-8 encoded chars.\n",
-	       char_count, utf8_count);
+      fprintf (stderr, "(I): %u bytes, %u UTF-8 encoded chars.\n",
               char_count, utf8_count);
 
     // Reset globals:
     char_count = 0;
@@ -1277,7 +445,7 @@ fputs(
     if (verbose)
       fprintf(stderr,
              "(I): %u bytes, %u (UTF-8 encoded) unicode characters.\n",
-	      char_count, utf8_count);
+              char_count, utf8_count);
   }
 
   if (num_files > 1 && verbose)
diff --git a/tools/tokenizer/tokenize.py b/tools/tokenizer/tokenize.py
new file mode 100755
index 0000000..f5a162c
--- /dev/null
+++ b/tools/tokenizer/tokenize.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+
+# Copyright IBM Corporation 2021, 2022
+# Written by Geert Janssen
+
+# Simple ctypes-based Python wrapper of libtoken.so
+# See ctypes documentation: https://docs.python.org/3/library/ctypes.html
+# This Python script works with Python versions 2.6, 2.7, and 3.5.
+
+import sys
+from ctypes import *
+
+# Load the shared object (expects it in current directory):
+libtoken = CDLL('./libtoken.so')
+
+# Define the exported function signatures:
+libtoken.C_tokenize.argtypes = (POINTER(c_char_p),
+                                POINTER(c_char_p),
+                                POINTER(c_uint),
+                                POINTER(c_uint),
+                                POINTER(c_uint))
+libtoken.open_as_stdin.argtypes = (c_char_p,)
+
+# 'Declare' variables to receive the C function's output arguments:
+_token = c_char_p()
+_kind = c_char_p()
+_linenr = c_uint()
+_column = c_uint()
+_pos = c_uint()
+
+# Token generator:
+def token():
+    global _token, _kind, _linenr, _column, _pos
+
+    # C_tokenize returns 0 upon end-of-file.
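+    # Each successful call yields one (line, column, class, text) tuple;
+    # e.g. for the input "int x;" the first yielded tuple would be
+    # (1, 0, 'keyword', 'int').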
+    while int(libtoken.C_tokenize(byref(_token), byref(_kind), byref(_linenr),
+                                  byref(_column), byref(_pos))):
+        # Turn ctypes into real Python values:
+        lin = _linenr.value
+        col = _column.value
+        pos = _pos.value # not used for now
+        clas = _kind.value.decode()
+        text = _token.value.decode()
+        yield (lin,col,clas,text)
+
+if len(sys.argv) == 1:
+    for tok in token():
+        print('[%u:%u] %s, %s' % tok)
+else:
+    for file in sys.argv[1:]:
+        # Set C filename global and reopen as stdin:
+        b_str = file.encode('utf-8') # need handle b_str to retain as C pointer
+        libtoken.open_as_stdin(b_str)
+
+        # Access C globals:
+        filename = c_char_p.in_dll(libtoken, 'filename')
+        print('[0:0] filename, %s' % filename.value.decode())
+
+        for tok in token():
+            print('[%u:%u] %s, %s' % tok)
+
+        # Reset globals:
+        c_uint.in_dll(libtoken, 'linenr').value = 1
+        c_uint.in_dll(libtoken, 'column').value = 0
+        c_uint.in_dll(libtoken, 'char_count').value = 0
+        c_uint.in_dll(libtoken, 'utf8_count').value = 0
diff --git a/tools/tokenizer/tokml-test.sh b/tools/tokenizer/tokml-test.sh
new file mode 100755
index 0000000..445647f
--- /dev/null
+++ b/tools/tokenizer/tokml-test.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+# Showcasing the use of tokml and xidel.
+# Works for any Java, C, and C++ source file.
+# Extracts certain tokens and statistics of interest.
+
+# Show command and execute it.
+run() {
+  echo "$ $1"
+  eval "$1"
+  [ $? == 0 ] || die "got non-0 program exit code"
+}
+
+die() {
+  echo "(E) ${@}" 1>&2
+  exit 1
+}
+
+# We need an input file:
+[ -z "$1" ] && die "expect a C, C++, or Java file as argument"
+
+# Quick check for availability of tokml and xidel:
+command -v tokml &>/dev/null
+[ $? == 0 ] || die "tokml not available; please install"
+command -v xidel &>/dev/null
+[ $? == 0 ] || die "xidel not available; please install"
+
+# Create temp file:
+XML="$(mktemp /tmp/${1%.*}-XXX.xml)"
+# Ensure clean up when done:
+trap "/bin/rm -f $XML" EXIT
+echo \# Run tokml to obtain the .xml output file:
+run "tokml $1 > $XML"
+
+echo
+echo \# Count the number of tokens in the arg source file:
+run "xidel -s -e 'count(//source/*)' $XML"
+
+echo
+echo \# Show all unique identifiers \(sorted\):
+run "xidel -s -e '//identifier' $XML | sort | uniq"
+
+echo
+echo \# Show the identifier occurrences of length greater than 10:
+run "xidel -s -e '//identifier[@len>10]' $XML"
+
+echo
+echo \# How many block_comment occurrences are there?
+run "xidel -s -e 'count(//block_comment)' $XML"
+
+echo
+echo \# Which tokens immediately follow the keyword static?
+run "xidel -s -e '//keyword[text()=\"static\"]/following-sibling::*[1]' $XML | sort | uniq"
+
+echo
+echo \# What is the value of the first integer number?
+run "xidel -s -e '//integer[1]' $XML"
+
+echo
+echo \# Convert the XML back to the original source and show 20 lines:
+run "xidel -s -e 'source' $XML | head -n20"
diff --git a/tools/tokenizer/tokml.c b/tools/tokenizer/tokml.c
new file mode 100644
index 0000000..3fdc6aa
--- /dev/null
+++ b/tools/tokenizer/tokml.c
@@ -0,0 +1,223 @@
+/* Copyright (c) 2021, 2022 International Business Machines Corporation
+   Prepared by: Geert Janssen
+
+   Tokenizer for C, C++ and Java with output as annotated XML,
+   much like srcML annotates a parse tree. Any white-space (including
+   newlines) is output as is, without any special XML element.
+   All other tokens (even comments) are output as a stream of XML
+   elements with tag names indicating the type/kind/class of the
+   token, whose text is provided as the enclosed text node:
+
+     <@kind@ line='' col='' len=''>...</@kind@>
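+
+   For example, a line such as "int x;" would be annotated roughly as
+   (illustration only, not verbatim program output):
+
+     <keyword line='1' col='0' len='3'>int</keyword> <identifier line='1' col='4' len='1'>x</identifier><operator line='1' col='5' len='1'>;</operator>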
+
+
+   Note that end-of-line characters (\r, \n) and sequences (\r \n) are
+   normalized and will always be output as a LINEFEED (LF, 0x0A).
+
+   The characters <, >, and & will be replaced by the special XML entities
+   &lt;, &gt; and &amp; respectively.
+
+   To undo the XML annotation in <file>.xml use either:
+   (this will also correctly revert the XML entities)
+   xmlstarlet sel -T -t -v 'source' <file>.xml, or
+   xidel -s -e 'source' <file>.xml
+
+   Useful xpath queries:
+   (the results show all occurrences and these are not necessarily unique)
+   - all identifiers: //identifier
+   - the length of the last identifier: //identifier[last()]/@len
+   - the value of the first integer: //integer[1]
+   - all comments starting at the beginning of a line:
+     //line_comment[@col=0]|//block_comment[@col=0]
+   - all while keywords: //keyword[text()="while"]
+   - identifiers of length greater than 10: //identifier[@len>10]
+   - tokens immediately following a long identifier:
+     //identifier[@len>15]/following-sibling::*[1]
+   - tokens immediately following the keyword static:
+     //keyword[text()="static"]/following-sibling::*[1]
+*/
+
+#include <unistd.h>		/* getopt() */
+#include <libgen.h>		/* basename() */
+
+#include "libtoken.h"
+
+int main(int argc, char *argv[])
+{
+  extern char *optarg;
+  extern int opterr;
+  extern int optind;
+  int option;
+  char const *opt_str = "1acdhl:o:vw";
+  char usage_str[80];
+
+  const char *token;
+  enum TokenClass type;
+  unsigned line;
+  unsigned col;
+  unsigned pos;
+  unsigned token_len;
+  unsigned num_files = 0;    // number of files read
+  int continuous_files = 0;  // when 1 do not reset after each file
+
+  char *outfile = 0;
+  Language source;
+  int explicit_source = 0;
+  int append = 0;
+
+  comment_token = 1;
+  whitespace_token = 1;
+
+  sprintf(usage_str, "usage: %%s [ -%s ] [ FILES ]\n", opt_str);
+
+  /* Process arguments: */
+  while ((option = getopt(argc, argv, opt_str)) != EOF) {
+    switch (option) {
+
+    case '1':
+      continuous_files = 1;
+      break;
+
+    case 'a':
+      append = 1;
+      break;
+
+    case 'c':
+      hash_as_comment = 1;
+      break;
+
+    case 'd':
+      debug = verbose = 1;
+      break;
+
+    case 'h':
+fputs(
+"A tokenizer for C/C++ (and Java) source code with output in XML.\n"
+"Recognizes the following token classes: keyword, identifier, integer,\n"
+"floating, string, character, operator, preprocessor, line_comment,\n"
+"and block_comment.\n\n", stderr);
+fprintf(stderr, usage_str, basename(argv[0]));
+fputs(
+"\nCommand line options are:\n"
+"-a : append to output file instead of create or overwrite.\n"
+"-c : treat a # character as the start of a line comment.\n"
+"-d : print debug info to stderr; implies -v.\n"
+"-h : print just this text to stderr and stop.\n"
+"-l : specify language explicitly (C, C++, Java).\n"
+"-o : write output to this file (instead of stdout).\n"
+"-1 : treat all filename arguments as a continuous single input.\n"
+"-v : print action summary to stderr.\n"
+"-w : suppress all warning messages.\n",
+      stderr);
+      return 0;
+
+    case 'l':
+      source = set_or_detect_lang(optarg);
+      explicit_source = 1;
+      break;
+
+    case 'o':
+      outfile = optarg;
+      break;
+
+    case 'v':
+      verbose = 1;
+      break;
+
+    case 'w':
+      nowarn = 1;
+      break;
+
+    case '?':
+    default:
+      fputs("(F): unknown option. Stop.\n", stderr);
+      fprintf(stderr, usage_str, argv[0]);
+      return 1;
+    }
+  }
+
+  if (outfile && outfile[0]) {
+    if (!freopen(outfile, append ? "a" : "w", stdout)) {
+      fprintf(stderr, "(F): cannot open %s for writing.\n", outfile);
+      exit(3);
+    }
+  }
+
+  if (optind == argc)
+    goto doit;
+
+  do {
+    filename = argv[optind];
+    if (!freopen(filename, "r", stdin)) {
+      if (!nowarn)
+        fprintf(stderr, "(W): Cannot read file %s.\n", filename);
+      continue;
+    }
+
+    if (!explicit_source)
+      source = set_or_detect_lang(0);
+
+  doit:
+    if (verbose) fprintf(stderr, "(I): Processing file %s...\n", filename);
+    num_files++;
+
+    // Header:
+    if (!continuous_files || num_files == 1) {
+      fputs("<?xml version='1.0' encoding='UTF-8'?>\n", stdout);
+      // standalone="yes"
+      fprintf(stdout, "<source language='%s' filename='%s'>",
+              lang_name(source), filename);
+    }
+
+    while ((token_len = C_tokenize_int(&token, &type, &line, &col, &pos))) {
+      if (type == WHITESPACE) {
+        fputs(token, stdout);
+        continue;
+      }
+      fprintf(stdout, "<%s line='%u' col='%u' len='%u'>",
+              token_class[type], line, col, token_len);
+      if (type == STRING ||
+          type == CHARACTER ||
+          type == OPERATOR ||
+          type == LINE_COMMENT ||
+          type == BLOCK_COMMENT)
+        XML_escape(stdout, token);
+      else
+        fputs(token, stdout);
+      fprintf(stdout, "</%s>", token_class[type]);
+    }
+
+    if (!continuous_files) {
+      // Trailer:
+      fputs("</source>\n", stdout);
+
+      if (verbose)
+        fprintf (stderr, "(I): %u bytes, %u UTF-8 encoded chars.\n",
+                 char_count, utf8_count);
+
+      // Reset globals:
+      char_count = 0;
+      utf8_count = 0;
+      linenr = 1;
+      column = 0;
+      buffered = 0;
+      saved_col = 0;
+    }
+  } while (++optind < argc);
+
+  if (continuous_files) {
+    // Trailer:
+    fputs("</source>\n", stdout);
+
+    if (verbose)
+      fprintf(stderr, "(I): %u bytes, %u (UTF-8 encoded) unicode characters.\n",
+              char_count, utf8_count);
+  }
+
+  if (num_files > 1 && verbose)
+    fprintf(stderr, "(I): Total number of files processed: %u\n", num_files);
+
+  return (illegals || unexpect_eof) ? 1 : 0;
+}