From 953514dc8d710271ab96a5c41d8d81eadc5a95ab Mon Sep 17 00:00:00 2001
From: Geert Janssen
Date: Mon, 4 Oct 2021 12:23:52 -0400
Subject: [PATCH 01/34] initial commit

---
 tools/tokenizer/filter6.awk              |  222 ++++
 tools/tokenizer/libtoken.c               | 1230 ++++++++++++++++++++++
 tools/tokenizer/libtoken.h               |   72 ++
 tools/tokenizer/schemas/tokml-schema.rnc |   73 ++
 tools/tokenizer/tokenize.py              |   62 ++
 tools/tokenizer/tokml-test.sh            |   61 ++
 tools/tokenizer/tokml.c                  |  224 ++++
 7 files changed, 1944 insertions(+)
 create mode 100755 tools/tokenizer/filter6.awk
 create mode 100644 tools/tokenizer/libtoken.c
 create mode 100644 tools/tokenizer/libtoken.h
 create mode 100644 tools/tokenizer/schemas/tokml-schema.rnc
 create mode 100755 tools/tokenizer/tokenize.py
 create mode 100755 tools/tokenizer/tokml-test.sh
 create mode 100644 tools/tokenizer/tokml.c

diff --git a/tools/tokenizer/filter6.awk b/tools/tokenizer/filter6.awk
new file mode 100755
index 0000000..64061d0
--- /dev/null
+++ b/tools/tokenizer/filter6.awk
@@ -0,0 +1,222 @@
+#!/usr/bin/awk -f
+
+# Copyright (c) 2021 International Business Machines Corporation
+# Prepared by: Geert Janssen
+
+# Expects a C/C++ tokenizer generated CSV file as input with explicit
+# whitespace and separate newline (and continuation) tokens.
+# (tokenize -W -n [-N] -mcsv)
+# Outputs one possibly modified token (class or literal) per line.
+# Tries to use some context to better discriminate the meaning of some
+# otherwise ambiguous tokens.
+
+# Should we use yacc/bison or lemon instead?
+
+# Ambiguous tokens in C/C++:
+# < > delimiters of filename in preprocessor include directive
+# < > delimiters of template parameters
+# <   less than operator
+# >   greater than operator
+# " " delimiters of filename in preprocessor include directive
+# " " delimiters of string literal
+# ( ) expression grouping
+# ( ) argument list
+# { } block
+# { } initializer
+# [ ] indexing
+# [ ] lambda capture
+# ~   destructor
+# ~   unary operator
+# -   unary operator
+# -   binary operator
+# *   unary operator
+# *   binary operator
+# *   pointer declarator
+
+# Simplistic CPP line syntax:
+# "#" directive-name (token)* newline
+
+# #include <filename>
+# #include "local"
+# #define identifier-macro-def
+# #define identifier-macro-const val
+# #define identifier-macro-func( ... )
+
+# Using a stack to remember CSV token lines whose output is temporarily
+# suppressed. That way we can have unbounded lookahead.
+# Use a function to empty and print the stack from bottom to top.
+
+function push(record) {
+  stack[sp++]=record
+}
+
+function empty_out() {
+  for (i=0; i<sp; i++)
+    print stack[i]
+  sp=0
+}
+
+BEGIN { state=0; next_state=-1 }
+
+# Handle # as the start of a preprocessor directive.
+(state == 0 && $4 == "#") {
+  # Note: suppressing this token.
+  push($0)
+  next_state=1
+}
+
+# Handle #include.
+(state == 1 && $4 == "include") {
+  # Note: suppressing this token.
+  push($0)
+  next_state=2
+}
+
+# Handle #define.
+(state == 1 && $4 == "define") {
+  # Note: suppressing this token.
+  push($0)
+  next_state=7
+}
+
+# Handle # ident => stringize to "ident"
+(state == 1 && $3 == "identifier" && $4 != "include" && $4 != "define") {
+  empty_out()
+  next_state=0
+}
+
+# Handle #include <...
+(state == 2 && $4 == "<") {
+  # Note: suppressing this token.
+  next_state=3
+}
+
+# Handle #include "...".
+(state == 2 && $3 == "string") {
+  # $4 has enclosing " doubled!
+  filename=substr($4,3,length($4)-4)
+  empty_out()
+  print $1 "," $2 ",string-local-filename," filename
+  next_state=0
+}
+
+# (state == 2 && anything else) => default action.
+
+# Collect all tokens after the < till >.
+# Treat the first one (assume it's an identifier) specially to get its
+# coordinates.
+(state == 3 && $3 == "identifier") {
+  id_lin=$1
+  id_col=$2
+  filename=$4
+  # Note: modifying this token.
+  next_state=4
+}
+
+# Keep collecting tokens till >.
+(state == 4 && $4 != ">") {
+  filename=filename $4
+  # Note: suppressing this token.
+  next_state=4
+}
+
+# Seen #include <...>.
+(state == 4 && $4 == ">") {
+  empty_out()
+  print id_lin "," id_col ",string-sys-filename,\"" filename "\""
+  # Note: suppressing this token.
+  next_state=0
+}
+
+# States 5 and 6 are not used for now.
+
+# Handle #define name.
+(state == 7 && $3 == "identifier") {
+  id_lin=$1
+  id_col=$2
+  macro_name=$4
+  # Note: modifying this token.
+  next_state=8
+}
+
+# Handle #define name(.
+(state == 8 && $4 == "(") {
+  empty_out()
+  print id_lin "," id_col ",identifier-macro-func," macro_name
+  print $0
+  next_state=0
+}
+
+# Handle #define name (.
+(state == 8 && $3 == "whitespace") {
+  empty_out()
+  print id_lin "," id_col ",identifier-macro-const," macro_name
+  next_state=0
+}
+
+# Handle a bare #define name.
+(state == 8 && $3 != "whitespace" && $4 != "(") {
+  empty_out()
+  print id_lin "," id_col ",identifier-macro-def," macro_name
+
+  if ($4 == "#") { # With -n should never happen.
+    push($0)
+    next_state=1
+  }
+  else { # Most probably a newline.
+    print $0
+    next_state=0
+  }
+}
+
+# Default rule; always executed.
+{
+  if (next_state == -1) {
+    # Echo all other tokens as is (ignore whitespace though):
+    if ($3 != "whitespace")
+      print $0
+    # Do not change state!
+  }
+  else {
+    state=next_state
+    next_state=-1
+  }
+}
+
+END {}
diff --git a/tools/tokenizer/libtoken.c b/tools/tokenizer/libtoken.c
new file mode 100644
index 0000000..36ecc2a
--- /dev/null
+++ b/tools/tokenizer/libtoken.c
@@ -0,0 +1,1230 @@
+/* Copyright (c) 2021 International Business Machines Corporation
+   Prepared by: Geert Janssen
+
+   Code functionality shared by all tokenizers.
+   This obviously avoids code duplication and associated maintenance problems.
+*/
+
+#include "libtoken.h"
+
+// Program globals:
+const char *filename = "stdin"; // current file being parsed
+unsigned linenr = 1;            // physical line number counted from 1
+unsigned column = 0;            // byte position in physical line, from 0
+unsigned char_count = 0;        // total byte count
+unsigned utf8_count = 0;        // total utf-8 encoded unicode codepoints
+
+int buffer[MAX_BUF];            // use buffer as multi-char lookahead.
+unsigned buffered = 0;          // number of buffered bytes
+unsigned saved_col = 0;         // one-place buf for last column on prev line
+
+// Program option settings:
+int debug = 0;                  // when 1 debug output to stderr
+int verbose = 0;                // when 1 info output to stderr
+int nowarn = 0;                 // when 1 warnings are suppressed
+
+unsigned illegals = 0;          // count number of illegal characters
+unsigned unexpect_eof = 0;      // encountered unexpected EOF
+int hash_as_comment = 0;        // when 1 treat # as line comment
+int newline_token = 0;          // when 1 output newline pseudo-token
+int comment_token = 0;          // when 1 output comments as tokens
+int whitespace_token = 0;       // when 1 output adjacent white-space as a token
+int continuation_token = 0;     // when 1 output line continuation pseudo-token
+
+static int logical_lines = 0;   // when 1 ignore line continuations in get()
+
+/* No longer using perfect hash function but simple binary search. */
+
+/* C11 n1570.pdf 6.4.1 (44)
+   C17 n2176.pdf 6.4.1 (A.1.2) (44)
+*/
+static const char *C_keywords[] = {
+  "_Alignas",   "_Alignof",   "_Atomic",    "_Bool",      "_Complex",
+  "_Generic",   "_Imaginary", "_Noreturn",  "_Static_assert",
+  "_Thread_local",
+
+  "auto",       "break",      "case",       "char",       "const",
+  "continue",   "default",    "do",         "double",     "else",
+  "enum",       "extern",     "float",      "for",        "goto",
+  "if",         "inline",     "int",        "long",       "register",
+  "restrict",   "return",     "short",      "signed",     "sizeof",
+  "static",     "struct",     "switch",     "typedef",    "union",
+  "unsigned",   "void",       "volatile",   "while"
+};
+
+#if 0
+/* C++ 2014 n4296.pdf 2.11 (84) */
+static const char *CPP_keywords[] = {
+  "alignas",    "alignof",    "and",        "and_eq",     "asm",
+  "auto",       "bitand",     "bitor",      "bool",       "break",
+  "case",       "catch",      "char",       "char16_t",   "char32_t",
+  "class",      "compl",      "const",      "const_cast", "constexpr",
+  "continue",   "decltype",   "default",    "delete",     "do",
+  "double",     "dynamic_cast", "else",     "enum",       "explicit",
+  "export",     "extern",     "false",      "float",      "for",
+  "friend",     "goto",       "if",         "inline",     "int",
+  "long",       "mutable",    "namespace",  "new",        "noexcept",
+  "not",        "not_eq",     "nullptr",    "operator",   "or",
+  "or_eq",      "private",    "protected",  "public",     "register",
+  "reinterpret_cast", "return", "short",    "signed",     "sizeof",
+  "static",     "static_assert", "static_cast", "struct", "switch",
+  "template",   "this",       "thread_local", "throw",    "true",
+  "try",        "typedef",    "typeid",     "typename",   "union",
+  "unsigned",   "using",      "virtual",    "void",       "volatile",
+  "wchar_t",    "while",      "xor",        "xor_eq"
+};
+#endif
+
+/* C++23 n4885.pdf 5.11 (92) */
+static const char *CPP_keywords[] = {
+  "alignas",    "alignof",    "and",        "and_eq",     "asm",
+  "auto",       "bitand",     "bitor",      "bool",       "break",
+  "case",       "catch",      "char",       "char16_t",   "char32_t",
+  "char8_t",    "class",      "co_await",   "co_return",  "co_yield",
+  "compl",      "concept",    "const",      "const_cast", "consteval",
+  "constexpr",  "constinit",  "continue",   "decltype",   "default",
+  "delete",     "do",         "double",     "dynamic_cast", "else",
+  "enum",       "explicit",   "export",     "extern",     "false",
+  "float",      "for",        "friend",     "goto",       "if",
+  "inline",     "int",        "long",       "mutable",    "namespace",
+  "new",        "noexcept",   "not",        "not_eq",     "nullptr",
+  "operator",   "or",         "or_eq",      "private",    "protected",
+  "public",     "register",   "reinterpret_cast", "requires", "return",
+  "short",      "signed",     "sizeof",     "static",     "static_assert",
+  "static_cast", "struct",    "switch",     "template",   "this",
+  "thread_local", "throw",    "true",       "try",        "typedef",
+  "typeid",     "typename",   "union",      "unsigned",   "using",
+  "virtual",    "void",       "volatile",   "wchar_t",    "while",
+  "xor",        "xor_eq"
+};
+
+/* Java SE 8 (50) (false, true, null are literals) */
+/* https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.9 */
+static const char *Java_keywords[] = {
+  "abstract",   "assert",     "boolean",   "break",     "byte",     "case",
+  "catch",      "char",       "class",     "const",     "continue", "default",
+  "do",         "double",     "else",      "enum",      "extends",  "final",
+  "finally",    "float",      "for",       "goto",      "if",       "implements",
+  "import",     "instanceof", "int",       "interface", "long",     "native",
+  "new",        "package",    "private",   "protected", "public",   "return",
+  "short",      "static",     "strictfp",  "super",     "switch",   "synchronized",
+  "this",       "throw",      "throws",    "transient", "try",      "void",
+  "volatile",   "while"
+};
+
+static const char *Python_keywords[] = {
+  "False",      "None",       "True",      "and",       "as",       "assert", "async",
+  "await",      "break",      "class",     "continue",  "def",      "del",    "elif",
+  "else",       "except",     "finally",   "for",       "from",     "global", "if",
+  "import",     "in",         "is",
"lambda", "nonlocal", "not", "or", + "pass", "raise", "return", "try", "while", "with", "yield" +}; + +/* Includes future reserved keywords, strict mode reserved words and module + code reserved words, as well as all the older standards future reserved + words, and the literals null, false, and true. +*/ +static const char *JavaScript_keywords[] = { + "abstract", "await", "boolean", "break", "byte", + "case", "catch", "char", "class", "const", + "continue", "debugger", "default", "delete", "do", + "double", "else", "enum", "export", "extends", + "false", "final", "finally", "float", "for", + "function", "goto", "if", "implements", "import", + "in", "instanceof", "int", "interface", "let", + "long", "native", "new", "null", "package", + "private", "protected", "public", "return", "short", + "static", "super", "switch", "synchronized", "this", + "throw", "throws", "transient", "true", "try", + "typeof", "var", "void", "volatile", "while", + "with", "yield" +}; + +#define num_keywords(lang) sizeof(lang##_keywords)/sizeof(lang##_keywords[0]); + +/* Generic binary search lookup in some keyword table. + `word' to be searched must be NUL-terminated C string. + `table' is array of const char * of `size' sorted alphabetically. + Returns word found (i.e., pointer value in table) or 0. +*/ +#define lang_is_keyword(lang) \ + static const char *lang##_is_keyword(const char *word) \ + { \ + int i = 0, j = num_keywords(lang); \ + while (i < j) { \ + int k = (i + j) >> 1 /* / 2 */; \ + const char *kw = lang##_keywords[k]; \ + int cmp = strcmp(word, kw); \ + if (!cmp) \ + return kw; \ + if (cmp < 0) j = k; else i = k + 1; \ + } \ + return 0; \ + } + +/* Define individual is_keyword functions per language: */ +/* C_is_keyword */ +lang_is_keyword(C) +/* CPP_is_keyword */ +lang_is_keyword(CPP) +/* Java_is_keyword */ +lang_is_keyword(Java) +/* Python_is_keyword */ +lang_is_keyword(Python) +/* JavaScript_is_keyword */ +lang_is_keyword(JavaScript) + +const char *(*is_keyword)(const char *) = C_is_keyword; + +/* Conversion table from filename extension to language code. + To find language code, consider all entries and check each ext + against filename; matched language is langs[i].lang. + Invariant: langs[X].lang == X for every Language value. + String representation of language code is langs[X].name. + + Have certain config settings depend on the language. + Use 2 step: + 1. determine language from name/extension + 2. look up language config +*/ +static const struct { + const char *ext; + Language lang; + const char *name; +} + langs[] = { + { ".c", C, "C" }, + { ".cpp", CPP, "C++" }, + { ".java", JAVA, "Java" }, + { ".js", JAVASCRIPT, "JavaScript" }, + { ".py", PYTHON, "Python" }, + + // Alternatives: + { ".h", C, "" }, + { ".C", CPP, "" }, + { ".cc", CPP, "" }, + { ".hh", CPP, "" }, +}; + +const char *lang_name(Language lang) +{ + return langs[lang].name; +} + +static const struct { + //Language lang; implicit + const char *(*is_keyword)(const char *); +} + lang_configs[] = { + { C_is_keyword, }, + { CPP_is_keyword, }, + { Java_is_keyword, }, + { JavaScript_is_keyword, }, + { Python_is_keyword, }, +}; + +/* Must be called right after a file is opened as stdin. + Will attempt to remove any UTF-8 unicode signature (byte-order mark, BOM) + at the beginning of the file. + Unicode: U+FEFF + UTF-8: EF BB BF + + First bytes Encoding Must remove? 
+ 00 00 FE FF UTF-32 big endian Yes + FF FE 00 00 UTF-32 little endian Yes + FE FF UTF-16 big endian Yes + FF FE UTF-16 little endian Yes + 00 00 00 xx UTF-32 big endian No + xx 00 00 00 UTF-32 little endian No + 00 xx UTF-16 big endian No + xx 00 UTF-16 little endian No + otherwise UTF-8 No +*/ +static void remove_BOM(void) +{ + int c1 = getchar(); + if (c1 == 0xEF) { + int c2 = getchar(); + if (c2 == 0xBB) { + int c3 = getchar(); + if (c3 == 0xBF) { + return; + } + if (c3 != EOF) buffer[buffered++] = c3; + } + if (c2 != EOF) buffer[buffered++] = c2; + } + if (c1 != EOF) buffer[buffered++] = c1; +} + +int open_as_stdin(const char *file) +{ + filename = file; + if (!freopen(filename, "r", stdin)) { + if (!nowarn) + fprintf(stderr, "(W): Cannot read file %s.\n", filename); + return -1; + } + return set_or_detect_lang(0); +} + +/* Deal with DOS (\r \n) and classic Mac OS (\r) (physical) line endings. + In case of CR LF skip (but count) the CR and return LF. + In case of CR not followed by LF turns the CR into LF and returns that. + All other chars are returned as is. + Note: never returns a CR (\r). Line/column counts are not affected here. +*/ +static int normalize_newline(void) +{ + /* No need to recognize Unicode code points here. */ + int cc = getchar(); + + if (cc == '\r') { + // Maybe \r \n (CR NL) combination? + int nc = getchar(); + if (nc == '\n') { + char_count++; // counts the carriage return + utf8_count++; + // No use incrementing column. + return nc; // return \n; effectively skipping the \r + } + // Mind nc not \n. ungetc(EOF) is Okay. + ungetc(nc, stdin); + // cc == '\r'; consider a newline as well, so turn into \n: + cc = '\n'; + } + return cc; +} + +/* Detects escaped newlines (line continuations) and signals them with the + special '\r' character (that otherwise is not used). + Keeps track of physical coordinates and absolute location for each character. +*/ +int get(void) +{ + int cc; + + restart: + // Get the next character: + if (buffered) { // chars available in lookahead buffer + cc = buffer[--buffered]; // never EOF + // cc maybe '\r' (line continuation); act like '\n': + if (cc == '\n' || cc == '\r') { + linenr++; + saved_col = column; + column = 0; + return cc; + } + column++; + return cc; + } + + // Read a fresh char: + cc = normalize_newline(); // cc != '\r' + if (cc == EOF) return EOF; + char_count++; + if (utf8_start(cc)) utf8_count++; + + if (cc == '\n') { // a normalized end-of-line (\r|\r?\n) + linenr++; + saved_col = column; + column = 0; + return cc; // \n here signals a logical end-of-line + } + + // Deal with explicit \ line continuations! + if (cc == '\\') { + // Must look ahead (never maintained across get calls!): + int nc = normalize_newline(); // cc != '\r' + if (nc == '\n') { + char_count++; // counts the newline + utf8_count++; + linenr++; // on next physical line + saved_col = column+1; // +1 for backslash + column = 0; + + if (logical_lines) + // Still need to get a character. + // Could again start a line continuation! + goto restart; + + // Signal that this was an escaped newline (= line continuation): + return '\r'; + } + // Mind nc not \n. ungetc(EOF) is Okay. + ungetc(nc, stdin); + // cc == '\\' a regular backslash + } + column++; + return cc; +} + +/* Undo action of a get() lookahead call. + An attempt at undoing an EOF read has no effect. + Since get() encodes logical line endings with \n and continuation + line endings with \r, both could be subject to an unget(). 
+*/ +void unget(int cc) +{ + if (cc == EOF) return; + if (buffered < MAX_BUF) { + if (cc == '\n' || cc == '\r') { + linenr--; + // column was 0 right after getting the \n + // hopefully there are no multiple ungets of \n + column = saved_col; + } + else + column--; + buffer[buffered++] = cc; + } + else { + fprintf(stderr, "(F): Lookahead buffer overflow (MAX=%u).\n", MAX_BUF); + exit(2); + } +} + +/* Either set this file's input language explicitly via a string or + use the filename extension to determine the language. + If neither works out, use the default language C. + Uses global filename (maybe stdin). + Once the language is known, configs for that language are applied, + e.g. the correct keyword table to use. +*/ +Language set_or_detect_lang(const char *source) +{ + int i; + Language lang = C; // default language + + if (source) { + /* Check if explicit language is known: */ + for (i = 0; i < sizeof(langs)/sizeof(langs[0]); i++) + if (!strcmp(source, langs[i].name)) { + lang = langs[i].lang; + goto done; + } + fprintf(stderr, "(E): No support for language `%s'.\n", source); + } + + char *p; + if (p = strrchr(filename, '.')) { + for (i = 0; i < sizeof(langs)/sizeof(langs[0]); i++) + if (!strcmp(p, langs[i].ext)) { + lang = langs[i].lang; + goto done; + } + fprintf(stderr, "(E): Unknown filename extension `%s'.\n", p); + } + if (!nowarn) + fprintf(stderr, "(W): Assuming default language C.\n"); + + done: + is_keyword = lang_configs[lang].is_keyword; + return lang; +} + +// Dynamically sized token buffer: +static char *token_buf = 0; +static unsigned token_alloc = 0; +static unsigned token_len = 0; + +// Resets the token buffer cursor. +static void token_buf_reset(void) +{ + token_len = 0; +} + +// Makes sure there is room in the token buffer. +static void token_buf_room(void) +{ + if (token_len == token_alloc) { // all space used up + if (!token_alloc) { // first time allocation + token_alloc = 65536; + if (!(token_buf = malloc(token_alloc))) { + fprintf(stderr, "(F): Allocation of token buffer failed.\n"); + exit(4); + } + token_buf[0] = '\0'; // for safety + return; + } + + token_alloc <<= 1; + if (!(token_buf = realloc(token_buf, token_alloc))) { + fprintf(stderr, "(F): Reallocation of token buffer failed.\n"); + exit(4); + } + //fprintf(stderr, "Realloc-ed token buf.\n"); + } +} + +// Appends a character to the token buffer, always making sure there is room. +static void token_buf_push(int cc) +{ + token_buf_room(); + // There is room: token_len < token_alloc + token_buf[token_len++] = cc; +} + +// Undoes the push action but only if there is some content. +static int token_buf_pop(void) +{ + return token_len ? token_buf[--token_len] : 0; +} + +// Adds a terminating NUL character which does not change the token length. +static void token_buf_close(void) +{ + token_buf_room(); + token_buf[token_len] = '\0'; // Note: no advance +} + +/* Tokenization of C++ programming language source text. + Recognizes: + - identifier + - reserved word/keyword + - binary, octal, decimal, hexadecimal and floating-point numbers + - double-quoted string literal + - single-quoted character literal + - all single, double, and triple operator and punctuation symbols + - the preprocessor tokens # and ## + Optionally: + - filename start_token + - line_comment comment_token + - block_comment comment_token + - newline newline_token + - continuation continuation_token + - whitespace whitespace_token + + Normally skips white-space and comments and flags anything + left over as illegal characters. 
+ + (Approximately 20 tests per single character worst-case.) + + Returns 0 upon EOF else the token length in bytes. + (There are no 0-length tokens!) +*/ +unsigned C_tokenize(const char **token, const char **type, + unsigned *line, unsigned *col) +{ + int cc; + *type = ""; + + do { // infinite loop; after token recognized breaks out. + // Start collecting a token. + token_buf_reset(); + *line = linenr; + *col = column; + // white-space tokens see continuation lines: + logical_lines = 0; + cc = get(); + + restart: + // cc already read; coordinates for it are correct. + + /*** WHITE-SPACE ***/ + + /* In principle all consecutive white-space including \n and \r (and some + other control chars) are collected and form a single whitespace token. + However, when newlines are requested to be reported as separate tokens, + they break this pattern. Note that we cannot issues multiple tokens + in a single call to this function. + + Token buf will only hold some white-space chars when implicitly + requested via whitespace_token; otherwise stays empty. + Same for the \n and \r requests. + */ + + if (cc == '\n' && newline_token) { // end of a logical line + // Here we assume the buf is empty. + token_buf_push(cc); + *type = "newline"; + break; + } + + if (cc == '\r' && continuation_token) { // end of a physical line + // Here we assume the buf is empty. + token_buf_push('\\'); + token_buf_push('\n'); + *type = "continuation"; + break; + } + + // Aggregate as much white-space as possible. + // FIXME: officially a NUL should be considered white-space. + while (isspace(cc)) { // i.e., cc in [ \f\n\r\t\v] + // Here: !newline_token (!continuation_token) + if (whitespace_token) + if (cc == '\r') { // line continuation + // Convert back to original char sequence: + token_buf_push('\\'); + token_buf_push('\n'); + } + else + token_buf_push(cc); // perhaps \n + //else: white-space is discarded + + // Here: whitespace_token implies token_len > 0 + + cc = get(); + if (cc == '\n' && newline_token) { + // Must issue whitespace token if so requested. + if (whitespace_token) { + // Undo lookahead (unget(EOF) has no effect!): + unget(cc); // next token will be newline + *type = "whitespace"; + token_buf_close(); + *token = token_buf; + return token_len; + } + // Issue newline token right away: + goto restart; + } + + if (cc == '\r' && continuation_token) { + // Must issue whitespace token if so requested. + if (whitespace_token) { + // Undo lookahead (unget(EOF) has no effect!): + unget(cc); // next token will be continuation + *type = "whitespace"; + token_buf_close(); + *token = token_buf; + return token_len; + } + // Issue continuation token right away: + goto restart; + } + } + // Here: !isspace: must break or start real token. + + if (whitespace_token && token_len) { + // Undo lookahead (unget(EOF) has no effect!): + unget(cc); + *type = "whitespace"; + break; + } + + if (cc == EOF) + return 0; + + // Rest of tokens treat line continuations as non-existent: + logical_lines = 1; + + // If white-space skipped must reset coordinates: + *line = linenr; + *col = column-1; + + /*** OPTIONAL # LINE COMMENT (to ignore preprocessor statements) ***/ + // Java: no preprocessor directives. + + // NULs (like many other chars) in comments are silently ignored! 
+ + if (cc == '#' && hash_as_comment) { + if (comment_token) + token_buf_push(cc); + // Skip till end-of-line (\n exclusive): + while ((cc = get()) != '\n' && cc != EOF) + if (comment_token) + token_buf_push(cc); + // cc == '\n' || cc == EOF + // Don't consider \n part of comment. + if (comment_token) { + // Undo lookahead (unget(EOF) has no effect!): + unget(cc); + *type = "line_comment"; + break; + } + *line = linenr-1; + *col = saved_col; + goto restart; + } + + /*** LINE COMMENT AND BLOCK COMMENT (C/C++/Java) ***/ + + if (cc == '/') { + cc = get(); + if (cc == '/') { + if (comment_token) { + token_buf_push(cc); + token_buf_push(cc); + } + // Skip till end-of-line (\n exclusive): + while ((cc = get()) != '\n' && cc != EOF) + if (comment_token) + token_buf_push(cc); + // cc == '\n' || cc == EOF + // Don't consider \n part of comment. + if (comment_token) { + // Undo lookahead (unget(EOF) has no effect!): + unget(cc); + *type = "line_comment"; + break; + } + *line = linenr-1; + *col = saved_col; + goto restart; + } + + if (cc == '*') { + if (comment_token) { + token_buf_push('/'); + token_buf_push(cc); + } + // Skip till */ inclusive: + int nc = get(); // if EOF next get will be EOF too + if (comment_token && nc != EOF) + token_buf_push(nc); + do { + cc = nc; + nc = get(); + if (nc == EOF) { // Error! + fprintf(stderr, + "(E): [%s:%u] Unexpected end-of-file in /* comment.\n", + filename, *line); + unexpect_eof++; + return 0; + } + if (comment_token) + token_buf_push(nc); + } while (cc != '*' || nc != '/'); + // cc == '*' && nc == '/' + // Don't consider char right after */ as part of comment. + if (comment_token) { + *type = "block_comment"; + break; + } + *line = linenr; + *col = column; + cc = get(); + goto restart; + } + // seen / but not // or /* + unget(cc); // char after / + cc = '/'; // restore / + } + + // If white-space and/or comments skipped must reset coordinates: + *line = linenr; + *col = column-1; + + /*** CHAR and STRING PREFIX (C/C++) ***/ + + // Allow u,U,L prefix for string and char + // FIXME: allow u8 as prefix for string + if (cc == 'L' || cc == 'u' || cc == 'U') { + token_buf_push(cc); + cc = get(); + if (cc == '"') + goto string_token; + if (cc == '\'') + goto char_token; + // u,U,L will be interpreted as (start of) identifier. + unget(cc); // char after u,U,L + cc = token_buf_pop(); // restore original and remove from token + } + + /*** IDENTIFIER (C/C++/Java) and KEYWORD (C/C++) ***/ + // Java: false, true, null are literals + // FIXME: Flag to allow .letter as part of identifier? + // (compound identifier) + + // Simplistic solution to allowing Unicode: allow any char >= 128 without + // actual checking for UTF-8. + if (isalpha(cc) || cc == '_' || cc == '$' || (cc & 0x80)) { + token_buf_push(cc); + while (isalnum(cc = get()) || cc == '_' || cc == '$' || + cc != EOF && (cc & 0x80)) + token_buf_push(cc); + unget(cc); + token_buf_close(); + *type = is_keyword(token_buf) ? "keyword" : "identifier"; + break; + } + + /*** INTEGER and FLOATING ***/ + // Java: uses _ in numbers as insignificant separator + // Java: decimal suffix: [lL], float suffix: [fFdD] + // Java: allows hex float + +#if 0 + // Examples: + int bin_num = 0B010101u; + int oct_num = 01234567L; + int hex_num = 0x123ABCLL; + int dec_num = 12345678; + + float flt_num1 = 077.; + float flt_num2 = 077.987; + float flt_num3 = 77.; + float flt_num4 = .77; +#endif + + // . digits ... 
floating + if (cc == '.') { + // Look ahead for a digit: + int nc; + if (isdigit(nc = get())) { + unget(nc); + goto start_fraction; + } + unget(nc); + // Could go immediately to operator: goto seen_period + } + + if (isdigit(cc)) { // binary, octal, decimal, or hexadecimal literal + // Types of integer literals: + enum { + BIN, OCT, DEC, HEX + } int_lit = cc == '0' ? OCT : DEC; + + // Lookahead: + int nc = get(); + if (int_lit == OCT && (nc == 'x' || nc == 'X')) { + int_lit = HEX; + token_buf_push(cc); // the 0 + cc = nc; // the x or X + } + else + if (int_lit == OCT && (nc == 'b' || nc == 'B')) { + int_lit = BIN; + token_buf_push(cc); // the 0 + cc = nc; // the b or B + } + else + unget(nc); // isdigit(cc) + + do { + token_buf_push(cc); + cc = get(); + + // Allow for ' between `digits': + if (cc == '\'') { + // Keep the ' in the token for now: + token_buf_push(cc); + int nc = get(); + if (isdigit(nc) || int_lit == HEX && isxdigit(nc)) + cc = nc; + else { // Error! + fprintf(stderr, + "(E): [%s:%u] C++14 only allows ' between digits.\n", + filename, linenr); + // what to do? + } + } + } while (isdigit(cc) || int_lit == HEX && isxdigit(cc)); + // !is[x]digit(cc) + + // FIXME: allow hex floats in C + if (int_lit == OCT || int_lit == DEC) { + int floating = 0; + // Seen digits-sequence. Maybe followed by . or e or E? + if (cc == '.') { // fractional part + start_fraction: + floating = 1; + token_buf_push(cc); + // digits? FIXME: again allow ' between digits + while (isdigit(cc = get())) + token_buf_push(cc); + // !isdigit(cc) + } + // cc != '.' || !isdigit(cc) + if (cc == 'e' || cc == 'E') { // exponent + floating = 1; + token_buf_push(cc); + if ((cc = get()) == '-' || cc == '+') { + token_buf_push(cc); + cc = get(); + } + // FIXME: no check for at least 1 digit + // FIXME: again allow ' between digits + while (isdigit(cc)) { + token_buf_push(cc); + cc = get(); + } + // !isdigit(cc) + } + if (floating) { + if (cc == 'f' || cc == 'F' || cc == 'l' || cc == 'L') + token_buf_push(cc); + else + unget(cc); + *type = "floating"; + break; + } + } + + // optional integer suffix: l, ll, lu, llu, u, ul, ull, any case + if (cc == 'l' || cc == 'L') { + token_buf_push(cc); + // maybe another l + cc = get(); + if (cc == 'l' || cc == 'L') { + token_buf_push(cc); + // Here: token is digits[lL][lL] + cc = get(); + } + // maybe a u + if (cc == 'u' || cc == 'U') + // Here: token is digits[lL][lL]?[u|U] + token_buf_push(cc); + else + unget(cc); + } + else if (cc == 'u' || cc == 'U') { + token_buf_push(cc); + // maybe an l + cc = get(); + if (cc == 'l' || cc == 'L') { + token_buf_push(cc); + // Here: token is digits[uU][lL] + cc = get(); + } + // maybe another l + if (cc == 'l' || cc == 'L') + // Here: token is digits[uU][lL]?[lL] + token_buf_push(cc); + else + unget(cc); + } + else + unget(cc); + *type = "integer"; + break; + } + + /*** STRING (C/C++/Java) ***/ + + if (cc == '"') { + string_token: + token_buf_push(cc); + // Watch out for escaped " inside string. + cc = get(); + while (cc != '"') { + if (cc == EOF) { // Error! + fprintf(stderr, + "(E): [%s:%u] Unexpected end-of-file in string literal.\n", + filename, *line); + unexpect_eof++; + return 0; + } + token_buf_push(cc); + int nc = get(); + + if (cc == '\\') { + // FIXME: No check on valid escape char! + // ' " ? 
\ a b f n r t v + token_buf_push(nc); + cc = get(); + } + else + cc = nc; + } + // cc == '"' + token_buf_push(cc); + *type = "string"; + break; + } + + /*** CHARACTER (C/C++/Java) ***/ + + if (cc == '\'') { + char_token: + token_buf_push(cc); + // Watch out for escaped ' inside char. + cc = get(); + // Cannot have empty char! + if (cc == '\'') { + fprintf(stderr, + "(E): [%s:%u] Cannot have an empty character literal.\n", + filename, linenr); + // Output as token anyway, but count as illegal: + token_buf_push(cc); + *type = "character"; + illegals++; + break; + } + + // FIXME: Avoid including too many chars. + while (cc != '\'') { + if (cc == EOF) { // Error! + fprintf(stderr, + "(E): [%s:%u] Unexpected end-of-file in character literal.\n", + filename, linenr); + unexpect_eof++; + // Note: partial character literal is lost. + return 0; + } + if (cc == '\n') { // Error! + fprintf(stderr, + "(E): [%s:%u] Cannot have end-of-line in character literal.\n", + filename, linenr); + illegals++; + // Immediately terminate character literal as if ' present. + // cc = '\''; make into valid literal??? No! + break; + } + token_buf_push(cc); + int nc = get(); + if (cc == '\\') { + token_buf_push(nc); + cc = get(); + // FIXME: No check on valid escape char! + // ' " ? \ a b f n r t v 0[d[d]] xh* + } + else { + cc = nc; + // If first char then expect no more. + if (token_len == 2) { + if (nc != '\'') { + fprintf(stderr, + "(E): [%s:%u] Cannot have multi-character literal.\n", + filename, linenr); + illegals++; + // Immediately terminate character literal as if ' present. + // cc = '\''; make into valid literal??? + break; + } + } + } + } + if (cc == '\'') + token_buf_push(cc); + else + unget(cc); + *type = "character"; + break; + } + + /*** OPERATOR (and PUNCTUATION) (C/C++/Java) ***/ + + // Operator and punctuation symbols. Longest match. + + /* Operator or punctuator Alternative representation + { <% + } %> + [ <: + ] :> + # %: (not supported here) + ## %:%: (not supported here) + */ + + // Single char operator or punctuator (C/C++/Java) + // { } [ ] ( ) ; : ? . ~ ! + - * / % ^ = & | < > , + // Double char operator or punctuator (C/C++) + // <: :> <% %> + // Double char operator or punctuator (C/C++/Java) + // += -= *= /= %= ^= &= |= == != <= >= && || << >> ++ -- -> + // Double char operator or punctuator (C++/Java) + // :: + // Double char operator or punctuator (C++) + // .* + // Triple char operator or punctuator (C/C++/Java) + // ... <<= >>= + // Triple char operator or punctuator (C++) + // ->* <=> + // Java: @ >>> >>>= + + //seen_period: + + token_buf_push(cc); + token_buf_close(); + //token=[cc,0];len=1 + + if (strstr("{}[]();?~,@", token_buf)) { // allow @ for Java + // Single char operator/punctuator. 
+ *type = "operator"; + break; + } + + if (strstr("<:.-+*/%^&|=!>", token_buf)) { // single or start of double/triple + // Check second char: + int c2 = get(); + if (c2 != EOF) { + token_buf_push(c2); + //token=[cc,c2];len=2 + + // Check third char: + int c3 = get(); + if (c3 != EOF) { + token_buf_push(c3); + token_buf_close(); + //token=[cc,c2,c3,0];len=3 + if (!strcmp(">>>", token_buf)) { // allow >>> for Java + //token=[>,>,>,0];len=3 + // Look-ahead for =: + int c4 = get(); + if (c4 == '=') // >>>= for Java + token_buf_push(c4); + //token=[>,>,>,=];len=4 + else + unget(c4); + //token=[>,>,>,0];len=3 + *type = "operator"; + break; + } + //token=[cc,c2,c3,0];len=3 + + if (!strcmp("...", token_buf) || + !strcmp("<=>", token_buf) || + !strcmp("->*", token_buf) || + !strcmp("<<=", token_buf)) { + // Triple char operator/punctuator. + *type = "operator"; + break; + } + + // Maybe double char. Undo the c3 token extension: + token_buf_pop(); + token_buf_close(); + //token=[cc,c2,0];len=2 + } + else + token_buf_close(); + //token=[cc,c2,0];len=2 + unget(c3); + + // Maybe double char. + static const char * const ops2[] = { + "<:", "<%", "<=", "<<", ":>", + "::", ".*", "->", "-=", "--", + "+=", "++", "*=", "/=", "%>", + "%=", "^=", "&=", "&&", "|=", + "||", "==", "!=", ">=", ">>" + }; + unsigned size = sizeof(ops2) / sizeof(ops2[0]); + unsigned i; + for (i = 0; i < size; i++) + if (!strcmp(ops2[i], token_buf)) + break; + if (i < size) { + *type = "operator"; + break; + } + //token=[cc,c2,0];len=2 + + // Must be single char. Undo the c2 token extension: + token_buf_pop(); + token_buf_close(); + //token=[cc,0];len=1 + } + //else token=[cc,0];len=1 + + // Must be single char. + unget(c2); + *type = "operator"; + break; + } + //token=[cc,0];len=1 + + /*** PREPROCESSOR (C/C++) ***/ + + if (cc == '#') { + int nc = get(); + if (nc != '#') + unget(nc); + else + token_buf_push(nc); + *type = "preprocessor"; + break; + } + + // What is left here? Illegal chars! + if (!nowarn) + // Mind non-printing chars! + fprintf(stderr, + "(W): [%s:%u] Illegal character `%s%c` (0x%02x) skipped.\n", + filename, linenr, cc<32?"CTRL-":"", cc<32?cc+64:cc, cc); + // Count them: + illegals++; + + } while (1); + token_buf_close(); + *token = token_buf; + return token_len; +} + +// Escape hard newlines in a string. +void RAW_escape(FILE *out, const char *token) +{ + const char *p; + for (p = token; *p; p++) { + if (*p == '\n') { + fputs("\\n", out); + continue; + } + fputc(*p, out); + } +} + +// Escape token for output as CSV string. +void CSV_escape(FILE *out, const char *token) +{ + const char *p; + // start CSV string: + fputc('"', out); + for (p = token; *p; p++) { + if (*p == '\n') { // escape embedded real new lines + fputs("\\n", out); + continue; + } + if (*p == '"') + fputc('"', out); + fputc(*p, out); + } + // end CSV string: + fputc('"', out); +} + +// Escape token for output as JSON string. +void JSON_escape(FILE *out, const char *token) +{ + // C/C++ has escapes: \' \" \? \a \b \f \n \r \t \v \x \0. + // To preserve, simply escape the backslash and all ": + const char *p; + for (p = token; *p; p++) { + if (*p == '\n') { // escape embedded real new lines + fputs("\\n", out); + continue; + } + if (*p == '\\' || *p == '"') + fputc('\\', out); + fputc(*p, out); + } +} + +// Escape token for output as XML text. 
+void XML_escape(FILE *out, const char *token) +{ +#if 1 + // Alternative: escape every <, >, and &: + const char *p; + for (p = token; *p; p++) { + if (*p == '<') + fputs("<", out); + else + if (*p == '>') + fputs(">", out); + else + if (*p == '&') + fputs("&", out); + else + fputc(*p, out); + } +#else + // User CDATA construct for escaping. + // Impossible to escape ]]> occurring in token! + // Must chop up the substring ]]> in ]] and >. + const char *p; + const char *q = token; + // "abc]]>hello" => hello"]]> + // "]]>]]>" => ]]]]>"]]> + while ((p = strstr(q, "]]>"))) { + int len = p - q; // always > 0 + fputs("", out); + q = p+2; // q start at >... + } + if (q < token+strlen(token)) + fprintf(out, "", q); +#endif +} diff --git a/tools/tokenizer/libtoken.h b/tools/tokenizer/libtoken.h new file mode 100644 index 0000000..94f0195 --- /dev/null +++ b/tools/tokenizer/libtoken.h @@ -0,0 +1,72 @@ +/* Copyright (c) 2021 International Business Machines Corporation + Prepared by: Geert Janssen + + Code functionality shared by all tokenizers. +*/ + +#ifndef LIBTOKEN_H +#define LIBTOKEN_H + +#include +#include +#include +#include +#include + +#define MAX_BUF 8 // maximum lookahead in chars + +/* Let's assume UTF-8 encoding. + https://www.cprogramming.com/tutorial/unicode.html + https://opensource.apple.com/source/tidy/tidy-2.2/tidy/src/utf8.c.auto.html +*/ + +// Test for start of UTF-8 sequence. +#define utf8_start(cc) (((cc)&0xC0)!=0x80) +#define utf8_follow(cc) (((cc)&0xC0)==0x80) + +#define utf8_len(cc) \ + (((cc)&0xF8)==0xF0 ? 4 : ((cc)&0xF0)==0xE0 ? 3 : ((cc)&0xE0)==0xC0 ? 2 : 1) + +typedef enum { C, CPP, JAVA, JAVASCRIPT, PYTHON } Language; + +// Program globals: +extern const char *filename/*= "stdin"*/; // current file being parsed +extern unsigned linenr/*= 1*/; // physical line number counted from 1 +extern unsigned column/*= 0*/; // char position in physical line, from 0 +extern unsigned saved_col/*= 0*/; // 1-place buf for last column on prev line +extern unsigned char_count/*= 0*/; // total char/byte count +extern unsigned utf8_count/*= 0*/; // total utf-8 char count +extern unsigned buffered/*= 0*/; // number of buffered chars +extern int buffer[MAX_BUF]; // use buffer as multi-char lookahead. 
+ +// Program option settings: +extern int debug/*= 0*/; // when 1 debug output to stderr +extern int verbose/*= 0*/; // when 1 info output to stderr +extern int nowarn/*= 0*/; // when 1 warnings are suppressed + +extern unsigned illegals/*= 0*/; // count number of illegal characters +extern unsigned unexpect_eof/*= 0*/; // encountered unexpected EOF +extern int hash_as_comment/*= 0*/; // when 1 treat # as line comment +extern int newline_token/*= 0*/; // when 1 output newline pseudo-token +extern int comment_token/*= 0*/; // when 1 output comments as tokens +extern int whitespace_token/*= 0*/; // when 1 output adjacent white-space as a token +extern int continuation_token/*= 0*/; // when 1 output line continuation pseudo-token + +// keyword lookup function (pointer variable): +// (initialized by set_or_detect_lang()) +extern const char *(*is_keyword)(const char *); + +extern int get(void); +extern void unget(int cc); +extern Language set_or_detect_lang(const char *source); +extern const char *lang_name(Language lang); +extern int open_as_stdin(const char *file); + +extern unsigned C_tokenize(const char **token, const char **type, unsigned *line, unsigned *col); + +extern void RAW_escape(FILE *out, const char *token); +extern void CSV_escape(FILE *out, const char *token); +extern void JSON_escape(FILE *out, const char *token); +extern void XML_escape(FILE *out, const char *token); + +#endif /* LIBTOKEN_H */ diff --git a/tools/tokenizer/schemas/tokml-schema.rnc b/tools/tokenizer/schemas/tokml-schema.rnc new file mode 100644 index 0000000..55e1165 --- /dev/null +++ b/tools/tokenizer/schemas/tokml-schema.rnc @@ -0,0 +1,73 @@ +# XML RNC schema for tokML 1.0 +# Copyright IBM Corporation 2021 +# Prepared by Geert Janssen + +datatypes xsd = 'http://www.w3.org/2001/XMLSchema-datatypes' + +#default namespace = "https://www.ibm.com/tokml" + +start = source + +# Children are token elements interspersed with white-space. +source = element source { + attribute language { "C" | "C++" | "Java" }, + attribute filename { xsd:string }?, + ( line_comment | + block_comment | + keyword | + identifier | + integer | + floating | + \string | + character | + operator | + preprocessor | + text )* +} + +# Attributes common to all token elements. 
+common-attrs = + ( attribute line { xsd:unsignedInt }, + attribute col { xsd:unsignedInt }, + attribute len { xsd:unsignedInt } ) + +line_comment = element line_comment { + common-attrs, + text +} +block_comment = element block_comment { + common-attrs, + text +} +keyword = element keyword { + common-attrs, + text +} +identifier = element identifier { + common-attrs, + text +} +integer = element integer { + common-attrs, + text +} +floating = element floating { + common-attrs, + text +} +\string = element string { + common-attrs, + text +} +character = element character { + common-attrs, + text +} +operator = element operator { + common-attrs, + text +} +preprocessor = element preprocessor { + common-attrs, + text +} diff --git a/tools/tokenizer/tokenize.py b/tools/tokenizer/tokenize.py new file mode 100755 index 0000000..e848a51 --- /dev/null +++ b/tools/tokenizer/tokenize.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python + +# Copyright IBM Corporation 2021 +# Written by Geert Janssen + +# Simple ctypes-based Python wrapper of libtoken.so +# See ctypes documentation: https://docs.python.org/3/library/ctypes.html +# This Python script works with versions 2.6, 2.7, and 3.5 + +import sys +from ctypes import * + +# Load the shared object (expects it in current directory): +libtoken = CDLL('./libtoken.so') + +# Define the exported function signatures: +libtoken.C_tokenize.argtypes = (POINTER(c_char_p), + POINTER(c_char_p), + POINTER(c_uint), + POINTER(c_uint)) +libtoken.open_as_stdin.argtypes = (c_char_p,) + +# 'Declare' the C function argument types: +_token = c_char_p() +_kind = c_char_p() +_linenr = c_uint() +_column = c_uint() + +# Token generator: +def token(): + global _token, _kind, _linenr, _column + + # C_tokenize returns 0 upon end-of-file. + while int(libtoken.C_tokenize(byref(_token), byref(_kind), byref(_linenr), byref(_column))): + # Turn ctypes into real Python values: + lin = _linenr.value + col = _column.value + clas = _kind.value.decode() + text = _token.value.decode() + yield (lin,col,clas,text) + +if len(sys.argv) == 1: + for tok in token(): + print('[%u:%u] %s, %s' % tok) +else: + for file in sys.argv[1:]: + # Set C filename global and reopen as stdin: + b_str = file.encode('utf-8') # need handle b_str to retain as C pointer + libtoken.open_as_stdin(b_str) + + # Access C globals: + filename = c_char_p.in_dll(libtoken, 'filename') + print('[0:0] filename, %s' % filename.value.decode()) + + for tok in token(): + print('[%u:%u] %s, %s' % tok) + + # Reset globals: + c_uint.in_dll(libtoken, 'linenr').value = 1 + c_uint.in_dll(libtoken, 'column').value = 0 + c_uint.in_dll(libtoken, 'char_count').value = 0 + c_uint.in_dll(libtoken, 'utf8_count').value = 0 diff --git a/tools/tokenizer/tokml-test.sh b/tools/tokenizer/tokml-test.sh new file mode 100755 index 0000000..445647f --- /dev/null +++ b/tools/tokenizer/tokml-test.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +# Showcasing the use of tokml and xidel. +# Works for any Java, C, and C++ source file. +# Extracts certain tokens and statistics of interest. + +# Show command and execute it. +run() { + echo "$ $1" + eval "$1" + [ $? == 0 ] || die "got non-0 program exit code" +} + +die() { + echo "(E) ${@}" 1>&2 + exit 1 +} + +# We need an input file: +[ -z "$1" ] && die "expect a C, C++, or Java file as argument" + +# Quick check for availabilty of tokml and xidel: +command -v tokml &>/dev/null +[ $? == 0 ] || die "tokml not available; please install" +command -v xidel &>/dev/null +[ $? 
== 0 ] || die "xidel not available; please install" + +# Create temp file: +XML="$(mktemp /tmp/${1%.*}-XXX.xml)" +# Ensure clean up when done: +trap "/bin/rm -f $XML" EXIT +echo \# Run tokml to obtain the .xml output file: +run "tokml $1 > $XML" + +echo +echo \# Count the number of tokens in the arg source file: +run "xidel -s -e 'count(//source/*)' $XML" + +echo +echo \# Show all unique identifiers \(sorted\): +run "xidel -s -e '//identifier' $XML | sort | uniq" + +echo +echo \# Show the identifier occurrences of length greater than 10: +run "xidel -s -e '//identifier[@len>10]' $XML" + +echo +echo \# How many block_comment occurrences are there? +run "xidel -s -e 'count(//block_comment)' $XML" + +echo +echo \# Which tokens immediately follow the keyword static? +run "xidel -s -e '//keyword[text()=\"static\"]/following-sibling::*[1]' $XML | sort | uniq" + +echo +echo \# What is the value of the first integer number? +run "xidel -s -e '//integer[1]' $XML" + +echo +echo \# Convert the XML back to the original source and show 20 lines: +run "xidel -s -e 'source' $XML | head -n20" diff --git a/tools/tokenizer/tokml.c b/tools/tokenizer/tokml.c new file mode 100644 index 0000000..7fb96d9 --- /dev/null +++ b/tools/tokenizer/tokml.c @@ -0,0 +1,224 @@ +/* Copyright (c) 2021 International Business Machines Corporation + Prepared by: Geert Janssen + + Tokenizer for C, C++ and Java with output as annotated XML, + much like srcML annotates a parse tree. Any white-space (including + newlines) is output as is, without any special XML element. + All other tokens (even comments) are output as a stream of XML + elements with tag names indicating the type/kind/class of + token provided as the enclosed text node. + + + + <@kind@ line='' col='' len=''>... + + + Note that end-of-line characters (\r, \n) and sequences (\r \n) are + normalized and will always be output as a LINEFEED (LF, 0x0A). + + The characters <, >, and & will be replaced by the special XML entities + <, > and & respectively. 
+ + To undo the XML annotation use either: + (this will also correctly revert the XML entities) + xmlstarlet sel -T -t -v 'source' libtoken.xml + xidel -s -e 'source' + + Useful xpath queries: + (the results show all occurrences and these are not necessarily unique) + - all identifiers: //identifier + - the length of the last identifier: //identifier[last()]/@len + - the value of the first integer: //integer[1] + - all comments starting at the beginning of a line: + //line_comment[@col=0]|//block_comment[@col=0] + - all while keywords: /keyword[text()="while"] + - identifiers of length greater than 10: //identifier[@len>10] + - tokens immediately following a long identifier: + //identifier[@len>15]/following-sibling::*[1] + - tokens immediately following the keyword static: + //keyword[text()="static"]/following-sibling::*[1] +*/ + +#include /* getopt() */ +#include /* basename() */ + +#include "libtoken.h" + +int main(int argc, char *argv[]) +{ + extern char *optarg; + extern int opterr; + extern int optind; + int option; + char const *opt_str = "1acdhl:o:rvw"; + char usage_str[80]; + + const char *token; + const char *type; + unsigned line; + unsigned col; + unsigned token_len; + unsigned num_files = 0; // number of files read + int continuous_files = 0; // when 1 do not reset after each file + + char *outfile = 0; + int first_time = 1; + Language source; + int explicit_source = 0; + int append = 0; + + comment_token = 1; + whitespace_token = 1; + + sprintf(usage_str, "usage: %%s [ -%s ] [ FILES ]\n", opt_str); + + /* Process arguments: */ + while ((option = getopt(argc, argv, opt_str)) != EOF) { + switch (option) { + + case '1': + continuous_files = 1; + break; + + case 'a': + append = 1; + break; + + case 'c': + hash_as_comment = 1; + break; + + case 'd': + debug = verbose = 1; + break; + + case 'h': +fputs( +"A tokenizer for C/C++ (and Java) source code with output in XML.\n" +"Recognizes the following token classes: keyword, identifier, integer,\n" +"floating, string, character, operator, preprocessor, line_comment,\n" +"and block_comment.\n\n", stderr); +fprintf(stderr, usage_str, basename(argv[0])); +fputs( +"\nCommand line options are:\n" +"-a : append to output file instead of create or overwrite.\n" +"-c : treat a # character as the start of a line comment.\n" +"-d : print debug info to stderr; implies -v.\n" +"-h : print just this text to stderr and stop.\n" +"-l : specify language explicitly (C, C++, Java).\n" +"-o : write output to this file (instead of stdout).\n" +"-1 : treat all filename arguments as a continuous single input.\n" +"-v : print action summary to stderr.\n" +"-w : suppress all warning messages.\n", + stderr); + return 0; + + case 'l': + source = set_or_detect_lang(optarg); + explicit_source = 1; + break; + + case 'o': + outfile = optarg; + break; + + case 'v': + verbose = 1; + break; + + case 'w': + nowarn = 1; + break; + + case '?': + default: + fputs("(F): unknown option. Stop.\n", stderr); + fprintf(stderr, usage_str, argv[0]); + return 1; + } + } + + if (outfile && outfile[0]) { + if (!freopen(outfile, append ? 
"a" : "w", stdout)) { + fprintf(stderr, "(F): cannot open %s for writing.\n", outfile); + exit(3); + } + } + + if (optind == argc) + goto doit; + + do { + filename = argv[optind]; + if (!freopen(filename, "r", stdin)) { + if (!nowarn) + fprintf(stderr, "(W): Cannot read file %s.\n", filename); + continue; + } + + if (!explicit_source) + source = set_or_detect_lang(0); + + doit: + if (verbose) fprintf(stderr, "(I): Processing file %s...\n", filename); + num_files++; + + // Header: + if (!continuous_files || num_files == 1) { + fputs("\n", stdout); + // standalone="yes" + fprintf(stdout, "", + lang_name(source), filename); + } + + while ((token_len = C_tokenize(&token, &type, &line, &col))) { + if (!strcmp(type, "whitespace")) { + fputs(token, stdout); + continue; + } + fprintf(stdout, "<%s line='%u' col='%u' len='%u'>", + type, line, col, token_len); + if (!strcmp(type, "string") + || !strcmp(type, "character") + || !strcmp(type, "operator") + || !strcmp(type, "line_comment") + || !strcmp(type, "block_comment")) + XML_escape(stdout, token); + else + fputs(token, stdout); + fprintf(stdout, "", type); + } + + if (!continuous_files) { + // Trailer: + fputs("\n", stdout); + + if (verbose) + fprintf (stderr, "(I): %u bytes, %u UTF-8 encoded chars.\n", + char_count, utf8_count); + + // Reset globals: + char_count = 0; + utf8_count = 0; + linenr = 1; + column = 0; + buffered = 0; + saved_col = 0; + first_time = 1; + } + } while (++optind < argc); + + if (continuous_files) { + // Trailer: + fputs("\n", stdout); + + if (verbose) + fprintf(stderr, "(I): %u bytes, %u (UTF-8 encoded) unicode characters.\n", + char_count, utf8_count); + } + + if (num_files > 1 && verbose) + fprintf(stderr, "(I): Total number of files processed: %u\n", num_files); + + return (illegals || unexpect_eof) ? 
1 : 0; +} From 18d7fc76c3ef63dc1d7ffb393397fd8c57c0773a Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Mon, 4 Oct 2021 12:25:36 -0400 Subject: [PATCH 02/34] added the new pseudo tokens --- tools/tokenizer/schemas/schema.json | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/tokenizer/schemas/schema.json b/tools/tokenizer/schemas/schema.json index 61909db..0723d70 100644 --- a/tools/tokenizer/schemas/schema.json +++ b/tools/tokenizer/schemas/schema.json @@ -1,7 +1,7 @@ { "$schema": "http://json-schema.org/draft-04/schema#", "title": "JSON Schema for Tokenizer JSON Output", - "description": "Prepared by Geert Janssen \nCopyright IBM Corporation 2020.", + "description": "Prepared by Geert Janssen \nCopyright IBM Corporation 2020, 2021.", "type": "array", "items": { @@ -12,8 +12,10 @@ "class": { "enum": [ "identifier", "keyword", "integer", "floating", "string", "character", "operator", "preprocessor", - "filename" + "filename", "line_comment", "block_comment", "newline", + "continuation", "whitespace" ] }, + "length": { "$ref": "#/definitions/unsignedInt" }, "token": { "type": "string" } }, "required": [ "line", "column", "class", "token" ], From b219a6b8f976f5926b890619af47ce133f4b1239 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Mon, 4 Oct 2021 12:26:15 -0400 Subject: [PATCH 03/34] added the new pseudo tokens --- tools/tokenizer/schemas/schema.rnc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/tokenizer/schemas/schema.rnc b/tools/tokenizer/schemas/schema.rnc index 3adaad8..1eb43d7 100644 --- a/tools/tokenizer/schemas/schema.rnc +++ b/tools/tokenizer/schemas/schema.rnc @@ -1,5 +1,5 @@ # Compact RELAX NG (RNC) Schema for Tokenizer XML Output -# Copyright IBM Corporation 2020 +# Copyright IBM Corporation 2020, 2021 # Prepared by Geert Janssen datatypes xsd = 'http://www.w3.org/2001/XMLSchema-datatypes' @@ -16,9 +16,11 @@ doc = attribute line { xsd:unsignedInt }, attribute column { xsd:unsignedInt }, attribute class { token-classes }, + attribute length { xsd:unsignedInt }, text } token-classes = "identifier" | "keyword" | "integer" | "floating" | "string" | - "character" | "operator" | "preprocessor" | "filename" + "character" | "operator" | "preprocessor" | "filename" | + "line_comment" | "block_comment" | "newline" | "continuation" | "whitespace" From c5cff7c74465f9cd0153a2a7c498565ca7ff874e Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Mon, 4 Oct 2021 12:27:02 -0400 Subject: [PATCH 04/34] added building of tokml and libtoken.so --- tools/tokenizer/Makefile | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/tools/tokenizer/Makefile b/tools/tokenizer/Makefile index b6ca860..9851522 100644 --- a/tools/tokenizer/Makefile +++ b/tools/tokenizer/Makefile @@ -3,16 +3,31 @@ INCLUDES = CPPFLAGS = $(INCLUDES) -CFLAGS = -O2 +CFLAGS = -g -O2 -fPIC LDFLAGS = -PROGS = tokenize antlr4tojson pytokenize jstokenize +PROGS = tokenize antlr4tojson pytokenize jstokenize tokml libtoken.so .PHONY: all all: $(PROGS) -tokenize: tokenize.o -tokenize.o: tokenize.c +tokenize: tokenize.o libtoken.a +tokenize.o: tokenize.c libtoken.h + +tokml: tokml.o libtoken.a +tokml.o: tokml.c libtoken.h + +libtoken.o: libtoken.c libtoken.h + +.PHONY: lib +lib: libtoken.a libtoken.so + +libtoken.a: libtoken.o + ar r $@ $^ + ranlib $@ + +libtoken.so: libtoken.o + $(CC) -shared -Wl,-soname,$@.1 -o $@ $^ antlr4tojson: antlr4tojson.o antlr4tojson.o: antlr4tojson.c @@ -27,5 +42,5 @@ token_common.o: 
token_common.c token_common.h
 
 .PHONY: clean
 clean:
-	@-rm -f *.o
+	@-rm -f *.o *.a
 	@-rm -f $(PROGS)

From a82adcc1d89709b5ade2dccb73a346357fe8268c Mon Sep 17 00:00:00 2001
From: Geert Janssen
Date: Mon, 4 Oct 2021 12:27:41 -0400
Subject: [PATCH 05/34] new section on tokml; more info on new pseudo tokens;
 new program options

---
 tools/tokenizer/README.md | 107 +++++++++++++++++++++++++++-----------
 1 file changed, 76 insertions(+), 31 deletions(-)

diff --git a/tools/tokenizer/README.md b/tools/tokenizer/README.md
index 6008eec..4c3a8f6 100644
--- a/tools/tokenizer/README.md
+++ b/tools/tokenizer/README.md
@@ -8,13 +8,13 @@ This same repository also offers separate programs for a Python tokenizer
 of the command-line options and have the same output formats.
 
 Here we focus on the C/C++/Java tokenizer (`tokenize`), but most of this
-documentation equally applies to the other tokenizer program. The `Makefile`
-builds them all.
+documentation equally applies to the other tokenizer programs.
+The `Makefile` builds them all.
 
 The following lexeme classes are recognized:
 
 - identifier
-- reserved word/keyword
+- reserved word/keyword of the language of the input source
 - binary, octal, decimal, hexadecimal and floating-point numbers
 - double-quoted string literal
 - single-quoted character literal
 - all single, double, and triple operator and punctuation symbols
 - the preprocessor tokens # and ##

For each correctly recognized token, the program determines its class/type
and the exact coordinates (line number and column) in the input text of its
starting character. All token literals are output exactly as they appear in
the source text, without any interpretation of possibly escaped characters.

A newline is defined as a single linefeed character `\n`, a carriage return
`\r`, or the combination carriage return `\r` followed by linefeed `\n`.
Line continuations, i.e., a backslash immediately followed by a newline, are
handled at the character input level, so the token recognizers will only see
logical lines. Line and column coordinates reflect positions in the physical
line structure, not the logical one. When requested, logical line endings are
output as `newline` pseudo tokens and will be represented by a linefeed
character. Similarly, when requested, continuations are output as
`continuation` pseudo tokens and will be represented by a backslash-escaped
linefeed `\\n`.

For instance, the appearance of a line continuation inside a string literal:

```
"A long string literal that is broken here \
to stretch over two lines."
```

upon output as a token becomes:

```
"A long string literal that is broken here to stretch over two lines."
```

White-space (SPACE and TAB characters), certain control characters, and
comments are normally skipped and anything left over is flagged as illegal
characters.

Since Java at the lexical level is very close to C and C++, this tokenizer
can also be used for Java, albeit some literal peculiarities are not
recognized.
The program looks at the file name extension to determine the language.
 This can be overridden (and must be specified in case of using standard
 input) by the `-l` option.
 Depending on the language setting, the proper set of keywords will be
-recognized. For C and C++ their
-combined set of (95) keywords is recognized, assuming that a C program will not
-inadvertently use C++ keywords as regular identifiers.
+recognized.

 ## Program options

@@ -68,22 +71,24 @@ A tokenizer for C/C++ (and Java) source code with output in 6 formats.
 Recognizes the following token classes: keyword, identifier, integer,
 floating, string, character, operator, and preprocessor.

-usage: tokenize [ -1acdhjl:m:no:rsvw ] [ FILES ]
+usage: tokenize [ -1acdhjkl:m:nNo:rsvwW ] [ FILES ]

 Command line options are:
 -a : append to output file instead of create or overwrite.
 -c : treat a # character as the start of a line comment.
 -d : print debug info to stderr; implies -v.
 -h : print just this text to stderr and stop.
--j : assume input is Java (deprecated: use -l Java or .java).
+-k : output line and block comments as tokens.
 -l : specify language explicitly (C, C++, Java).
 -m : output mode either plain (default), csv, json, jsonl, xml, or raw.
 -n : output newlines as a special pseudo token.
+-N : output line continuations as a special pseudo token.
 -o : write output to this file (instead of stdout).
 -s : enable a special start token specifying the filename.
 -1 : treat all filename arguments as a continuous single input.
 -v : print action summary to stderr.
 -w : suppress all warning messages.
+-W : output adjacent white-space as a token.
 ```

 The program reads multiple files. Depending on the `-1` option, the files
@@ -95,7 +100,7 @@ the mode setting.

 ## Multiple output modes

 The tokenizer has multiple output modes. They are plain text, CSV, JSON, JSONL,
-and XML. A sample of plain text output looks like this:
+XML, and RAW. A sample of plain text output looks like this:

 ```text
 ( 62,  0) preprocessor: #
@@ -139,26 +144,52 @@ and XML. A sample of plain text output looks like this:

 Line numbers are 1-based, columns start at 0 (Emacs-style).
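Since this patch series factors the scanning core into `libtoken`, output like
the sample above can also be produced programmatically. A minimal sketch,
assuming the `C_tokenize` interface that `tokenize.c` switches to in
PATCH 10/34 of this series and that input defaults to stdin:

```c
#include <stdio.h>
#include "libtoken.h"

int main(void)
{
    const char *token, *type;
    unsigned line, col;

    /* C_tokenize yields one token per call and returns 0 at end of input. */
    while (C_tokenize(&token, &type, &line, &col))
        printf("(%4u,%3u) %s: %s\n", line, col, type, token);
    return 0;
}
```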
 The token classes are:

-| Class:       | Description:
-|--------------|------------
-| identifier   | any identifier
-| keyword      | a reserved word
-| integer      | integer number irrespective of notation
-| floating     | a floating-point number
-| string       | a double-quoted string (maybe empty)
-| character    | a single-quoted character
-| operator     | any operator or punctuator symbol
-| preprocessor | either # or ##
-| filename     | pseudo token: start of a new file
-| newline      | pseudo token: end of logical line
+| Class:        | Description:
+|---------------|------------
+| identifier    | any identifier
+| keyword       | a reserved word
+| integer       | integer number irrespective of notation
+| floating      | a floating-point number
+| string        | a double-quoted string (maybe empty)
+| character     | a single-quoted character
+| operator      | any operator or punctuator symbol
+| preprocessor  | either `#` or `##`
+
+The following classes are only recognized when the appropriate switch has been set:
+
+| Class:        | Description:                           | Switch:
+|---------------|----------------------------------------|---------
+| line_comment  | treat `#` till end of line as comment  | -c -k
+| line_comment  | a comment that starts with `//`        | -k
+| block_comment | a comment enclosed in `/*` and `*/`    | -k
+| filename      | pseudo token: start of a new file      | -s
+| newline       | pseudo token `\n`: end of logical line | -n
+| continuation  | pseudo token `\\n`: line continuation  | -N
+| whitespace    | adjacent white-space                   | -W

 The `filename` token is optional. It will be included when the `-s` option
 is provided. It is a pseudo token that provides the filename of the input as
 the first token. Similarly, the `newline` is a pseudo token and appears only
 with the `-n` option. It signals the end of a logical line. Mind that multiple
-newlines occurring in sequence are not suppressed. The `newline` token has no
-textual representation, e.g. in XML mode output it will appear as an empty
-text element.
+newlines occurring in sequence are neither suppressed nor aggregated; they
+appear as separate newline tokens (the same holds for continuations).
+The `newline` token will
+be represented by a linefeed character (LF). Depending on the output mode this
+will be escaped appropriately. The `-W` option would normally also collect
+newlines, except when `-n` is also set, and continuations, except when `-N` is
+set, in which case they are treated as separate tokens. To summarize, the valid
+combinations of these options and their effect are:
+
+| Switches: | Effect on output:
+|-----------|------------------
+|           | all white-space, line endings included, discarded
+| -n        | newline tokens for logical lines
+| -N        | continuation tokens
+| -W        | whitespace tokens, all physical line endings included
+| -n -N     | newline and continuation tokens
+| -W -n     | whitespace tokens and newline tokens separately
+| -W -N     | whitespace tokens and continuation tokens separately
+| -W -n -N  | whitespace, newline, and continuation all separately

 ### CSV output

 ```text
 line,column,class,token
@@ -178,6 +209,8 @@ line,column,class,token

 The operator token `,` is escaped with double quotes, like so `","`. String
 tokens are escaped as well and any original double quote is doubled.
+A newline on its own or as part of whitespace will appear escaped as `\n`.
+The text of a whitespace token will appear inside double quotes.

 ### JSON output

@@ -222,6 +255,18 @@ tokens. (An alternative would be to use the CDATA construct.)
 ```
+
+## tokML
+
+Recently a new program has been added: `tokml`.
As the name suggests the +output is in XML format but unlike the `-mxml` option to `tokenize`, `tokml` +outputs the original source code annotated with XML elements that supply the +token information. This is an approach identical to what `srcML` does for a +parse tree. The precise XML syntax used is defined by the RelaxNG schema in +the file `tokml-schema.rnc`. + +The XML annotation makes it very convenient to apply XPath and XQuery queries +to the token stream, e.g. by using tools like `xidel` and `xmlstarlet`. + ## References > [1] From 70d3033fd9ea1e2591f8ef9b89775569c512195c Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Mon, 4 Oct 2021 12:28:57 -0400 Subject: [PATCH 06/34] untabify --- tools/tokenizer/antlr4tojson.c | 100 ++++++++++++++++----------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/tools/tokenizer/antlr4tojson.c b/tools/tokenizer/antlr4tojson.c index f2d2b9c..7cdb254 100644 --- a/tools/tokenizer/antlr4tojson.c +++ b/tools/tokenizer/antlr4tojson.c @@ -45,9 +45,9 @@ #include #include #include -#include /* getopt() */ -#include /* basename() */ -#include /* tolower() */ +#include /* getopt() */ +#include /* basename() */ +#include /* tolower() */ // POSIX Extended Regular Expressions for all parts of token output. @@ -87,15 +87,15 @@ class_RE "),(channel=(" posint_RE "),)?(" line_RE "):(" column_RE ")\\]$" // Program option settings: -static int debug = 0; // when 1 debug output to stderr +static int debug = 0; // when 1 debug output to stderr static int verbose = 0; // when 1 info output to stderr -static int nowarn = 0; // when 1 warnings are suppressed -static int start_token = 0; // when 1 start filename pseudo-token +static int nowarn = 0; // when 1 warnings are suppressed +static int start_token = 0; // when 1 start filename pseudo-token static int continuous_files = 0;// when 1 do not reset after each file // Program globals: static char *filename = "stdin";// current file being parsed -static unsigned num_files = 0; // number of files read +static unsigned num_files = 0; // number of files read static unsigned linenr = 1; // line number counted from 1 static enum { CSV, JSON, JSONL, RAW } mode = JSON; @@ -179,10 +179,10 @@ static void JSON_escape(FILE *out, const char *p, unsigned len) const char peek = len ? *(p+1) : anything_but_valid_escape; // look ahead fputc('\\', out); if (strchr("\\\"bfnrt", peek)) { - // An valid JSON escape. Output it and skip peek: - c = peek; - p++; - len--; + // An valid JSON escape. Output it and skip peek: + c = peek; + p++; + len--; } //else Not a correct JSON escape, a standalone backslash; double it. } @@ -214,7 +214,7 @@ static unsigned get(char const *text) if (regexec(re, text, nmatch, pmatch, REG_NOTEOL) == REG_NOMATCH) { // Warn about the failed match: fprintf(stderr, "(W) [%s:%u] not a valid token; skipped.\n", - filename, linenr); + filename, linenr); // Cannot recover; no more input. return 0; } @@ -256,22 +256,22 @@ static unsigned get(char const *text) case CLASS_IDENT: // CSV output does not need the quoting. if (mode == JSON || mode == JSONL) - fputc('"', stdout); + fputc('"', stdout); // Undo the capitalization? fputc(tolower(*p), stdout); fwrite(p+1, 1, len-1, stdout); if (mode == JSON || mode == JSONL) - fputc('"', stdout); + fputc('"', stdout); break; case TEXT: // CSV output benefits from quoting; must escape the " fputc('"', stdout); // Strip off the enclosing single quotes. 
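       // (p+1 and len-2 below drop those enclosing quotes before the
       // text is escaped for the selected output mode.)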
if (mode == JSON || mode == JSONL) - JSON_escape(stdout, p+1, len-2); + JSON_escape(stdout, p+1, len-2); else if (mode == CSV) - CSV_escape(stdout, p+1, len-2); + CSV_escape(stdout, p+1, len-2); fputc('"', stdout); break; case CLASS_STRING: @@ -281,10 +281,10 @@ static unsigned get(char const *text) // Keep the enclosing single quotes! fputc('"', stdout); if (mode == JSON || mode == JSONL) - JSON_escape(stdout, p, len); + JSON_escape(stdout, p, len); else if (mode == CSV) - CSV_escape(stdout, p, len); + CSV_escape(stdout, p, len); fputc('"', stdout); break; case CHANNEL: @@ -333,7 +333,7 @@ main(int argc, char *argv[]) case 'h': fputs( -"A converter for the ANTLR4 token output format.\n\n", stdout); +"A converter for the ANTLR4 token output format.\n\n", stderr); fprintf(stderr, usage_str, basename(argv[0])); fputs( "\nCommand line options are:\n" @@ -358,7 +358,7 @@ fputs( else if (!strcmp(optarg, "raw")) mode = RAW; else { - if (!nowarn) + if (!nowarn) fprintf(stderr, "(W): Invalid mode %s (using csv).\n", optarg); mode = CSV; } @@ -419,10 +419,10 @@ fputs( break; case CSV: if (!continuous_files || num_files == 1) - fputs("seqnr,start,stop,text,class,channel,line,column\n", stdout); + fputs("seqnr,start,stop,text,class,channel,line,column\n", stdout); else { - fputc('\n', stdout); - first_time = 1; + fputc('\n', stdout); + first_time = 1; } if (start_token) { fprintf(stdout, "0,0,0,%s,File,0,1,0\n", filename); @@ -431,20 +431,20 @@ fputs( case JSON: case JSONL: if (!continuous_files || num_files == 1) { - if (mode == JSON) fputs("[\n", stdout); + if (mode == JSON) fputs("[\n", stdout); } else { - if (mode == JSON) fputc(',', stdout); - fputc('\n', stdout); - first_time = 1; + if (mode == JSON) fputc(',', stdout); + fputc('\n', stdout); + first_time = 1; } if (start_token) { - // Must quote filename: + // Must quote filename: fprintf(stdout, - "{\"seqnr\":0, \"start\":0, \"stop\":0, \"text\":\"%s\"," - " \"class\":\"File\", \"line\":1, \"column\":0}", - filename); - first_time = 0; + "{\"seqnr\":0, \"start\":0, \"stop\":0, \"text\":\"%s\"," + " \"class\":\"File\", \"line\":1, \"column\":0}", + filename); + first_time = 0; } break; } @@ -452,19 +452,19 @@ fputs( while (getline(&line, &len, stdin) != -1) { // If already did some output must close that previous line: if (first_time) - first_time = 0; + first_time = 0; else { - switch (mode) { - case RAW: - break; - case JSON: - fputc(',', stdout); - /*FALL THROUGH*/ - case CSV: - case JSONL: - fputc('\n', stdout); - break; - } + switch (mode) { + case RAW: + break; + case JSON: + fputc(',', stdout); + /*FALL THROUGH*/ + case CSV: + case JSONL: + fputc('\n', stdout); + break; + } } get(line); // no , and/or \n output yet linenr++; @@ -476,15 +476,15 @@ fputs( // Trailer: switch (mode) { case RAW: - break; + break; case JSON: - // no last comma! - fputs("\n]", stdout); - /*FALL THROUGH*/ + // no last comma! 
+ fputs("\n]", stdout); + /*FALL THROUGH*/ case CSV: case JSONL: - fputc('\n', stdout); - break; + fputc('\n', stdout); + break; } first_time = 1; } From 101cec453a8d6916e9bb149f5b982e848678cb5f Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Mon, 4 Oct 2021 12:29:52 -0400 Subject: [PATCH 07/34] untabify --- tools/tokenizer/jstokenize.c | 220 +++++++++++++++++------------------ 1 file changed, 110 insertions(+), 110 deletions(-) diff --git a/tools/tokenizer/jstokenize.c b/tools/tokenizer/jstokenize.c index c935837..e9604a8 100644 --- a/tools/tokenizer/jstokenize.c +++ b/tools/tokenizer/jstokenize.c @@ -108,20 +108,20 @@ static int tokenize(char *token, const char **type, // Skip till end-of-line (\n exclusive): while ((cc = get()) != EOF && cc != '\n' && cc != '\r') ; - // cc == '\n' || cc == '\r' || cc == EOF - if (cc == '\r') { - if (!nowarn) - fprintf(stderr, - "(W): Unexpected continuation in line comment.\n"); - // Effectively ignore any \ and terminate logical line: - cc == '\n'; - } + // cc == '\n' || cc == '\r' || cc == EOF + if (cc == '\r') { + if (!nowarn) + fprintf(stderr, + "(W): Unexpected continuation in line comment.\n"); + // Effectively ignore any \ and terminate logical line: + cc == '\n'; + } goto restart; } if (cc == '*') { - // Remember start position: - unsigned lin = linenr; + // Remember start position: + unsigned lin = linenr; // Skip till */ inclusive: int nc = get(); // if EOF next get will be EOF too @@ -130,9 +130,9 @@ static int tokenize(char *token, const char **type, nc = get(); if (nc == EOF) { // Error! fprintf(stderr, - "(E): [%s:%u] Unexpected end-of-file in /* comment.\n", - filename, lin); - unexpect_eof++; + "(E): [%s:%u] Unexpected end-of-file in /* comment.\n", + filename, lin); + unexpect_eof++; return 0; } } while (cc != '*' || nc != '/'); @@ -153,13 +153,13 @@ static int tokenize(char *token, const char **type, // Skip till end-of-line (\n exclusive): while ((cc = get()) != EOF && cc != '\n' && cc != '\r') ; - if (cc == '\r') { - if (!nowarn) - fprintf(stderr, - "(W): Unexpected continuation in hashbang comment.\n"); - // Effectively ignore any \ and terminate logical line: - cc == '\n'; - } + if (cc == '\r') { + if (!nowarn) + fprintf(stderr, + "(W): Unexpected continuation in hashbang comment.\n"); + // Effectively ignore any \ and terminate logical line: + cc == '\n'; + } goto restart; } // seen # but not #! 
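The hunks above only reindent this logic, but one detail deserves a note: the
statement `cc == '\n';` is a comparison whose result is discarded. A minimal
sketch of the apparent intent, assuming the assignment `cc = '\n';` was meant
and using this file's `get()`:

```c
/* Sketch, not part of the patch: skip a // line comment and fold a stray
   continuation marker ('\r' as delivered by get()) into a logical newline. */
static int skip_line_comment(void)
{
    int cc;
    while ((cc = get()) != EOF && cc != '\n' && cc != '\r')
        ;               /* consume the comment body */
    if (cc == '\r')     /* continuation: terminate the logical line */
        cc = '\n';      /* assignment, where the original compares */
    return cc;          /* '\n' or EOF */
}
```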
@@ -201,37 +201,37 @@ static int tokenize(char *token, const char **type, int pc; do { token_add(cc); - pc = cc; - cc = get(); - if (cc == '\r') { - if (!nowarn) - fprintf(stderr, - "(W): Unexpected continuation in regex literal.\n"); - // Effectively ignore: - cc = get(); - } - - if (cc == '\n') { + pc = cc; + cc = get(); + if (cc == '\r') { if (!nowarn) - fprintf(stderr, - "(W): Unexpected newline in regular expression literal.\n"); - // discard: - cc = get(); - } + fprintf(stderr, + "(W): Unexpected continuation in regex literal.\n"); + // Effectively ignore: + cc = get(); + } + + if (cc == '\n') { + if (!nowarn) + fprintf(stderr, + "(W): Unexpected newline in regular expression literal.\n"); + // discard: + cc = get(); + } - if (cc == EOF) { + if (cc == EOF) { if (!nowarn) - fprintf(stderr, - "(W): Unexpected EOF in regular expression literal.\n"); + fprintf(stderr, + "(W): Unexpected EOF in regular expression literal.\n"); unexpect_eof++; - break; - } + break; + } } while (cc != '/' || pc == '\\'); token_add(cc); // the / cc = get(); while (strchr("gimsuy", cc)) { token_add(cc); - cc = get(); + cc = get(); } unget(cc); *type = "regex"; @@ -259,15 +259,15 @@ static int tokenize(char *token, const char **type, int nesting = 0; // keep track of ${} nesting do { token_add(cc); - // For template can have nesting inside placeholder ${...} - // FIXME: no check for nested paired ``; same for {} - if (qc == '`') { - if (pc == '$' && cc == '{') - nesting++; - else - if (cc == '}') - nesting--; - } + // For template can have nesting inside placeholder ${...} + // FIXME: no check for nested paired ``; same for {} + if (qc == '`') { + if (pc == '$' && cc == '{') + nesting++; + else + if (cc == '}') + nesting--; + } // Assume \ is not escaped itself. if (pc != '\\' && cc == qc && !nesting) { // unescaped quote @@ -283,16 +283,16 @@ static int tokenize(char *token, const char **type, if (cc == '\n' && qc != '`') { // Ok in template if (!nowarn) - fprintf(stderr, - "(W): Unexpected unescaped newline in string.\n"); + fprintf(stderr, + "(W): Unexpected unescaped newline in string.\n"); // discard cc = get(); } if (cc == EOF) { if (!nowarn) - fprintf(stderr, - "(W): Unexpected EOF in string/template.\n"); + fprintf(stderr, + "(W): Unexpected EOF in string/template.\n"); unexpect_eof++; break; } @@ -312,11 +312,11 @@ static int tokenize(char *token, const char **type, unget(cc); token[len] = '\0'; if (is_keyword(token, keywords, num_keywords)) { - *type = "keyword"; - regex_ok = !!is_keyword(token, regex_preceders, num_preceders); + *type = "keyword"; + regex_ok = !!is_keyword(token, regex_preceders, num_preceders); } else - *type = "identifier"; + *type = "identifier"; break; } @@ -340,16 +340,16 @@ static int tokenize(char *token, const char **type, } int_lit = DEC; // assume decimal number /* BIN: 0[bB][01](_?[01])* - LEGACY_OCT: 0[0-7]+ - OCT: 0[oO][0-7](_?[0-7])* - DEC: 0|[1-9](_?[0-9])* - HEX: 0[xX][0-9a-fA-F](_?[0-9a-fA-F])* + LEGACY_OCT: 0[0-7]+ + OCT: 0[oO][0-7](_?[0-7])* + DEC: 0|[1-9](_?[0-9])* + HEX: 0[xX][0-9a-fA-F](_?[0-9a-fA-F])* - EXP: [eE][+-]?[0-9](_?[0-9])* + EXP: [eE][+-]?[0-9](_?[0-9])* - FLOATING: .[0-9][_0-9]*EXP? - | DEC.([0-9][_0-9]*)?EXP? - | DEC EXP + FLOATING: .[0-9][_0-9]*EXP? + | DEC.([0-9][_0-9]*)?EXP? 
+ | DEC EXP */ if (cc == '0') { @@ -368,14 +368,14 @@ static int tokenize(char *token, const char **type, int_lit = HEX; break; default: - if ('0' <= nc && nc <= '7') { - token_add(cc); // the 0 - int_lit = LEGACY_OCT; - } - else { - unget(nc); - nc = cc; - } + if ('0' <= nc && nc <= '7') { + token_add(cc); // the 0 + int_lit = LEGACY_OCT; + } + else { + unget(nc); + nc = cc; + } break; } cc = nc; @@ -454,9 +454,9 @@ static int tokenize(char *token, const char **type, } if (cc == 'n') // BigInt - token_add(cc); + token_add(cc); else - unget(cc); + unget(cc); *type = "integer"; break; @@ -492,28 +492,28 @@ static int tokenize(char *token, const char **type, if (strchr("*+-<>&|?.=", cc) && c2 == cc) { // double or triple // ** ++ -- << >> && || ?? .. == - // special case ++ and -- - if (c2 == '+' || c2 == '-') { + // special case ++ and -- + if (c2 == '+' || c2 == '-') { token_add(c2); *type = "operator"; break; - } + } // ** << >> && || ?? .. == int c3 = get(); - // special case . and ... + // special case . and ... if (c2 == '.') { if (c3 == '.') { // ... token_add(c2); token_add(c3); } - else { - // ..x - unget(c3); - unget(c2); - } + else { + // ..x + unget(c3); + unget(c2); + } // . *type = "operator"; break; @@ -530,18 +530,18 @@ static int tokenize(char *token, const char **type, // ** << >> && || ?? == - if (c2 == '>' && c3 == c2) { - // >>> - int c4 = get(); + if (c2 == '>' && c3 == c2) { + // >>> + int c4 = get(); token_add(c3); - if (c4 == '=') - // >>>= - token_add(c4); - else - unget(c4); - } - else - unget(c3); + if (c4 == '=') + // >>>= + token_add(c4); + else + unget(c4); + } + else + unget(c3); // ** << >> && || ?? == *type = "operator"; @@ -552,7 +552,7 @@ static int tokenize(char *token, const char **type, // also missing => ?. !== <= >= == != += -= *= %= &= |= ^= /= if (cc == '?' && c2 == '.' || - cc == '=' && c2 == '>') { + cc == '=' && c2 == '>') { // ?. 
=> token_add(c2); *type = "operator"; @@ -562,20 +562,20 @@ static int tokenize(char *token, const char **type, // still missing !== <= >= == != += -= *= %= &= |= ^= /= if (c2 == '=') { - // <= >= == != += -= *= %= &= |= ^= /= - token_add(c2); - if (cc == '!') { - // != - int c3 = get(); - if (c3 == '=') - // !== - token_add(c3); - else - unget(c3); - } + // <= >= == != += -= *= %= &= |= ^= /= + token_add(c2); + if (cc == '!') { + // != + int c3 = get(); + if (c3 == '=') + // !== + token_add(c3); + else + unget(c3); + } } else - unget(c2); + unget(c2); *type = "operator"; break; } @@ -711,7 +711,7 @@ int main(int argc, char *argv[]) fputs( "A tokenizer for JavaScript source code with output in 6 formats.\n" "Recognizes the following token classes: keyword, identifier, integer,\n" -"floating, string, regex, and operator.\n\n", stdout); +"floating, string, regex, and operator.\n\n", stderr); fprintf(stderr, usage_str, basename(argv[0])); fputs( "\nCommand line options are:\n" From 6de081222dbe522d6cce78ff456ccf1222a7d6d2 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Mon, 4 Oct 2021 12:30:10 -0400 Subject: [PATCH 08/34] untabify --- tools/tokenizer/ntokenize.c | 99 ++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 50 deletions(-) diff --git a/tools/tokenizer/ntokenize.c b/tools/tokenizer/ntokenize.c index b5625fe..3d1e67e 100644 --- a/tools/tokenizer/ntokenize.c +++ b/tools/tokenizer/ntokenize.c @@ -58,60 +58,60 @@ // 96 chars (omitted are e.g.: @ $ `) // 3 5 67 8 9 9 // 1234 5 6 7 3 9 9012345678901234567890123 4 56 -#define basic_char0_RE "[][ \t\v\f\na-zA-Z0-9_{}#()<>%:;.?*+/^&|~!=,\\\"'-]" +#define basic_char0_RE "[][ \t\v\f\na-zA-Z0-9_{}#()<>%:;.?*+/^&|~!=,\\\"'-]" // all basic chars except \n and > -#define h_chars_RE "[][ \t\v\fa-zA-Z0-9_{}#()<%:;.?*+/^&|~!=,\\\"'-]+" +#define h_chars_RE "[][ \t\v\fa-zA-Z0-9_{}#()<%:;.?*+/^&|~!=,\\\"'-]+" // all basic chars except \n and \" -#define q_chars_RE "[][ \t\v\fa-zA-Z0-9_{}#()<>%:;.?*+/^&|~!=,\\'-]+" -#define header_RE "<"h_chars_RE">|\""q_chars_RE"\"" -#define pp_number_RE "\\.?[0-9]('?[a-zA-Z_0-9]|[eE][+-]|\\.)*" - -#define unichar_RE "\\\\u[0-9a-fA-F]{4}|\\\\U[0-9a-fA-F]{8}" - -//#define identifier_RE "[_a-zA-Z][_a-zA-Z0-9]*" -#define identifier_RE "([_a-zA-Z]|"unichar_RE")([_a-zA-Z0-9]|"unichar_RE")*" - -#define suffix_RE "([uU]ll?|[uU]LL?|ll?[uU]?|LL?[uU]?)?" -#define binary_RE "0[bB][01]('?[01])*"suffix_RE -#define octal_RE "0('?[0-7])*"suffix_RE -#define decimal_RE "[1-9]('?[0-9])*"suffix_RE -#define hexadecimal_RE "0[xX][0-9a-fA-F]('?[0-9a-fA-F])*"suffix_RE -#define integer_RE binary_RE"|"octal_RE"|"decimal_RE"|"hexadecimal_RE - -#define dec_part_RE "[0-9]('?[0-9])*" -#define exponent_RE "[eE][-+]?[0-9]('?[0-9])*" -#define floating_RE "(\\."dec_part_RE"("exponent_RE")?|"\ - dec_part_RE"\\.("dec_part_RE")?("exponent_RE")?|"\ - dec_part_RE exponent_RE")[fFlL]?" 
- -#define oct_char_RE "\\\\[0-7]{1,3}" -#define hex_char_RE "\\\\x[0-9a-fA-F]+" -#define escape_RE "\\\\['\"?abfnrtv\\]|"oct_char_RE"|"hex_char_RE -#define character_RE "[uUL]?'([^'\\\n]|"escape_RE"|"unichar_RE")'" -#define string_RE "[uUL]?\"([^\"\\\n]|"escape_RE"|"unichar_RE")*\"" +#define q_chars_RE "[][ \t\v\fa-zA-Z0-9_{}#()<>%:;.?*+/^&|~!=,\\'-]+" +#define header_RE "<"h_chars_RE">|\""q_chars_RE"\"" +#define pp_number_RE "\\.?[0-9]('?[a-zA-Z_0-9]|[eE][+-]|\\.)*" + +#define unichar_RE "\\\\u[0-9a-fA-F]{4}|\\\\U[0-9a-fA-F]{8}" + +//#define identifier_RE "[_a-zA-Z][_a-zA-Z0-9]*" +#define identifier_RE "([_a-zA-Z]|"unichar_RE")([_a-zA-Z0-9]|"unichar_RE")*" + +#define suffix_RE "([uU]ll?|[uU]LL?|ll?[uU]?|LL?[uU]?)?" +#define binary_RE "0[bB][01]('?[01])*"suffix_RE +#define octal_RE "0('?[0-7])*"suffix_RE +#define decimal_RE "[1-9]('?[0-9])*"suffix_RE +#define hexadecimal_RE "0[xX][0-9a-fA-F]('?[0-9a-fA-F])*"suffix_RE +#define integer_RE binary_RE"|"octal_RE"|"decimal_RE"|"hexadecimal_RE + +#define dec_part_RE "[0-9]('?[0-9])*" +#define exponent_RE "[eE][-+]?[0-9]('?[0-9])*" +#define floating_RE "(\\."dec_part_RE"("exponent_RE")?|"\ + dec_part_RE"\\.("dec_part_RE")?("exponent_RE")?|"\ + dec_part_RE exponent_RE")[fFlL]?" + +#define oct_char_RE "\\\\[0-7]{1,3}" +#define hex_char_RE "\\\\x[0-9a-fA-F]+" +#define escape_RE "\\\\['\"?abfnrtv\\]|"oct_char_RE"|"hex_char_RE +#define character_RE "[uUL]?'([^'\\\n]|"escape_RE"|"unichar_RE")'" +#define string_RE "[uUL]?\"([^\"\\\n]|"escape_RE"|"unichar_RE")*\"" // should really be: any basic source char except ) followed by delimiter -#define r_chars_RE "[^)]*" +#define r_chars_RE "[^)]*" // delimiter; first and second occurrence in rawstring must be the same // use back reference \3: -#define d_chars_RE "([^ ()\\\t\v\f\n]{0,16})" -#define rawstring_RE "[uUL]?R\""d_chars_RE"\\("r_chars_RE"\\)\\3\"" +#define d_chars_RE "([^ ()\\\t\v\f\n]{0,16})" +#define rawstring_RE "[uUL]?R\""d_chars_RE"\\("r_chars_RE"\\)\\3\"" -#define operator_RE "[][{}();?~,]|<=>|<<=|\\.\\.\\.|->\\*|>>=|"\ - "[*/!=^]=?|<[:%=<]?|:[:>]?|\\.[*]?|-[->=]?|\\+[=+]?|"\ - "%[>=]?|&[=&]?|>[>=]?|\\|[|=]?" +#define operator_RE "[][{}();?~,]|<=>|<<=|\\.\\.\\.|->\\*|>>=|"\ + "[*/!=^]=?|<[:%=<]?|:[:>]?|\\.[*]?|-[->=]?|\\+[=+]?|"\ + "%[>=]?|&[=&]?|>[>=]?|\\|[|=]?" #define preprocessor_RE "##?" -#define token_RE "^"ws_RE"(("rawstring_RE")|("identifier_RE")|("\ +#define token_RE "^"ws_RE"(("rawstring_RE")|("identifier_RE")|("\ integer_RE")|("floating_RE")|("string_RE")|("\ character_RE")|("operator_RE")|("preprocessor_RE"))" #define NMATCH 34 // Guarded against overflow but not full-proof! -#define MAX_LINE 4096 // maximum logical line length in chars (\0 exclusive) +#define MAX_LINE 4096 // maximum logical line length in chars (\0 exclusive) #define utf8_start(cc) (((cc)&0xC0)!=0x80) @@ -204,7 +204,7 @@ unsigned get_token(char const *text, unsigned start) if (regexec(re, text, nmatch, pmatch, REG_NOTEOL) == REG_NOMATCH) { // Warn about the failed match: fprintf(stderr, "(W) [%u:%u] not a valid token; skipped.\n", - linenrs[start],columns[start]); + linenrs[start],columns[start]); // Cannot recover; no more input. return 0; } @@ -239,7 +239,6 @@ unsigned get_token(char const *text, unsigned start) int normalize_newline(void) { int cc = getchar(); - if (cc == EOF || cc == '\n') return cc; if (cc == '\r') { // Maybe \r \n (CR NL) combination? 
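For reference, the normalization that `normalize_newline()` begins here fits
in a few self-contained lines; a sketch over plain `stdio`, using `ungetc` in
place of this tokenizer's own pushback buffer:

```c
#include <stdio.h>

/* Fold '\n', '\r', and "\r\n" into a single '\n'; pass the rest through. */
static int normalized_getchar(void)
{
    int cc = getchar();
    if (cc != '\r')
        return cc;             /* covers '\n' and EOF as well */
    int nc = getchar();        /* CR seen: check for a following LF */
    if (nc != '\n' && nc != EOF)
        ungetc(nc, stdin);     /* lone CR: push the next character back */
    return '\n';
}
```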
@@ -397,22 +396,22 @@ int buffer_fill(void) if (cc == '"') { // Switch to unfiltered input till unescaped closing ": if ((cc = get()) == '"') { - buffer_add(cc); - // An empty string literal. - continue; + buffer_add(cc); + // An empty string literal. + continue; } if (cc == EOF || cc == '\n') - // unexpected EOF or newline in string - break; + // unexpected EOF or newline in string + break; buffer_add(cc); int pc; do { - pc = cc; - cc = get(); - if (cc == EOF || cc == '\n') - // unexpected EOF or newline in string - goto break_outer; - buffer_add(cc); + pc = cc; + cc = get(); + if (cc == EOF || cc == '\n') + // unexpected EOF or newline in string + goto break_outer; + buffer_add(cc); } while (pc == '\\' || cc != '"'); // pc != '\\' && cc == '"' } From 9e73e0588f6dcc04f7a6cfae6007c8ff866f04e6 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Mon, 4 Oct 2021 12:30:13 -0400 Subject: [PATCH 09/34] untabify --- tools/tokenizer/pytokenize.c | 86 ++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/tools/tokenizer/pytokenize.c b/tools/tokenizer/pytokenize.c index fb787a9..96dd491 100644 --- a/tools/tokenizer/pytokenize.c +++ b/tools/tokenizer/pytokenize.c @@ -39,7 +39,7 @@ static int first_time = 1; static int start_token = 0; // when 1 start filename pseudo-token static int continuous_files = 0; // when 1 do not reset after each file static enum { PLAIN, CSV, JSON, JSONL, XML, RAW } mode = PLAIN; -static int output_layout = 0; // when 1 output layout pseudo tokens +static int output_layout = 0; // when 1 output layout pseudo tokens static const char *keywords[] = { "False", "None", "True", "and", "as", "assert", "async", @@ -67,18 +67,18 @@ static void emit(const char *s, unsigned line, unsigned col) case JSON: case JSONL: if (first_time) - first_time = 0; + first_time = 0; else { - if (mode == JSON) fputc(',', stdout); - fputc('\n', stdout); + if (mode == JSON) fputc(',', stdout); + fputc('\n', stdout); } fprintf(stdout, "{ \"line\": %u, \"column\": %u, " - "\"class\": \"layout\", \"token\": \"%s\" }", line, col, s); + "\"class\": \"layout\", \"token\": \"%s\" }", line, col, s); break; case XML: fprintf(stdout, - "%s\n", - line, col, s); + "%s\n", + line, col, s); break; } } @@ -88,12 +88,12 @@ static void emit(const char *s, unsigned line, unsigned col) #define MAX_INDENTS 128 static unsigned indents[MAX_INDENTS]; static unsigned *sp = indents; -#define indents_reset() do { sp = indents; } while(0) -#define indents_empty() (sp == indents) -#define indents_full() (sp == indents+MAX_INDENTS) -#define indents_top() (indents_empty() ? 0 : *(sp-1)) -#define indents_push(i) do { assert(!indents_full()); *sp++ = (i); } while(0) -#define indents_pop() do { assert(!indents_empty()); sp--; } while(0) +#define indents_reset() do { sp = indents; } while(0) +#define indents_empty() (sp == indents) +#define indents_full() (sp == indents+MAX_INDENTS) +#define indents_top() (indents_empty() ? 
0 : *(sp-1)) +#define indents_push(i) do { assert(!indents_full()); *sp++ = (i); } while(0) +#define indents_pop() do { assert(!indents_empty()); sp--; } while(0) // emit NEWLINE and deal with indentation static void process_newline(unsigned indent) @@ -171,7 +171,7 @@ static int utf8_codepoint(int cc, int *len, int bytes[4]) else { /* invalid utf-8 start byte */ if (!nowarn) fprintf(stderr, "(W): [%s:%u] Invalid UTF-8 start byte 0x%02x.\n", - filename, linenr, cc); + filename, linenr, cc); return cc; } /* collect all follow bytes: */ @@ -179,15 +179,15 @@ static int utf8_codepoint(int cc, int *len, int bytes[4]) cc = get(); if (cc == EOF) { /* unexpected EOF in utf-8 sequence */ if (!nowarn) - fprintf(stderr, "(W): [%s:%u] Unexpected EOF in UTF-8 sequence.\n", - filename, linenr); + fprintf(stderr, "(W): [%s:%u] Unexpected EOF in UTF-8 sequence.\n", + filename, linenr); return EOF; } bytes[(*len)++] = cc; if ((cc & 0xC0) != 0x80) { /* invalid utf-8 follow byte */ if (!nowarn) - fprintf(stderr, "(W): [%s:%u] Invalid UTF-8 follow byte 0x%02x.\n", - filename, linenr, cc); + fprintf(stderr, "(W): [%s:%u] Invalid UTF-8 follow byte 0x%02x.\n", + filename, linenr, cc); return cc; } cp <<= 6; @@ -199,7 +199,7 @@ static int utf8_codepoint(int cc, int *len, int bytes[4]) /* invalid Unicode code point. */ if (!nowarn) fprintf(stderr, "(W): [%s:%u] Invalid Unicode code point 0x%04x.\n", - filename, linenr, cp); + filename, linenr, cp); } return cp; } @@ -261,7 +261,7 @@ static int tokenize(char *token, const char **type, cc = get(); // Maybe EOF! if (!brackets_opened && !strchr(" \t\n#\r\f", cc)) - process_newline(0); + process_newline(0); goto restart; } @@ -274,8 +274,8 @@ static int tokenize(char *token, const char **type, if (cc == EOF) { // Undo any outstanding indents: while (!indents_empty()) { - emit("DEDENT", linenr, column); - indents_pop(); + emit("DEDENT", linenr, column); + indents_pop(); } return 0; } @@ -385,9 +385,9 @@ static int tokenize(char *token, const char **type, token_add(cc); // Assume \ is not escaped itself. Happens though! - if (pc == '\\') // escape next char; no check - cc = '\0'; - else + if (pc == '\\') // escape next char; no check + cc = '\0'; + else if (cc == qc) { // a first unescaped quote int q2 = get(); token_add(q2); @@ -419,8 +419,8 @@ static int tokenize(char *token, const char **type, do { token_add(cc); if (pc == '\\') // escape next char; no check - cc = '\0'; - else + cc = '\0'; + else if (cc == qc) { // unescaped quote *type = "string"; break; @@ -459,29 +459,29 @@ static int tokenize(char *token, const char **type, if (is_id_start(cp, utf8_len)) { int i; for (i = 0; i < utf8_len; i++) - token_add(utf8_bytes[i]); + token_add(utf8_bytes[i]); ident_token: cc = get(); cp = utf8_codepoint(cc, &utf8_len, utf8_bytes); if (cp == EOF) // bad code point; already reported. - break; + break; all_ascii &= utf8_len == 1; while (is_id_follow(cp, utf8_len)) { - int i; - for (i = 0; i < utf8_len; i++) - token_add(utf8_bytes[i]); - cc = get(); - cp = utf8_codepoint(cc, &utf8_len, utf8_bytes); - if (cp == EOF) // bad code point; already reported. - break; - all_ascii &= utf8_len == 1; + int i; + for (i = 0; i < utf8_len; i++) + token_add(utf8_bytes[i]); + cc = get(); + cp = utf8_codepoint(cc, &utf8_len, utf8_bytes); + if (cp == EOF) // bad code point; already reported. 
+ break; + all_ascii &= utf8_len == 1; } // Undo look ahead: while (utf8_len) - unget(utf8_bytes[--utf8_len]); + unget(utf8_bytes[--utf8_len]); token[len] = '\0'; *type = all_ascii && is_keyword(token, keywords, num_keywords) - ? "keyword" : "identifier"; + ? "keyword" : "identifier"; break; } @@ -827,7 +827,7 @@ int main(int argc, char *argv[]) fputs( "A tokenizer for Python (3) source code with output in 6 formats.\n" "Recognizes the following token classes: keyword, identifier, integer,\n" -"floating, imaginary, string, and operator.\n\n", stdout); +"floating, imaginary, string, and operator.\n\n", stderr); fprintf(stderr, usage_str, basename(argv[0])); fputs( "\nCommand line options are:\n" @@ -963,11 +963,11 @@ fputs( while (tokenize(token, &type, &line, &col)) { switch (mode) { case RAW: - // Watch out for multi-line strings + // Watch out for multi-line strings if (!strcmp(type, "string")) RAW_escape(stdout, token); - else - fputs(token, stdout); + else + fputs(token, stdout); fputc('\n', stdout); break; case PLAIN: From 3166e88c3ebcd4d00a1232fc182d2bf3f0995742 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Mon, 4 Oct 2021 12:31:26 -0400 Subject: [PATCH 10/34] moved major code to libtoken.c; new options --- tools/tokenizer/tokenize.c | 1030 ++++-------------------------------- 1 file changed, 96 insertions(+), 934 deletions(-) diff --git a/tools/tokenizer/tokenize.c b/tools/tokenizer/tokenize.c index b1dc55f..7822a30 100644 --- a/tools/tokenizer/tokenize.c +++ b/tools/tokenizer/tokenize.c @@ -1,11 +1,11 @@ -/* Copyright (c) 2020, 2021 International Business Machines Corporation +/* Copyright (c) 2021 International Business Machines Corporation Prepared by: Geert Janssen - Simple C/C++ (and Java) Tokenizer. + Simple C/C++ and Java Tokenizer. For the most part assumes that the input source text is grammatically - correct C or C++ code. - (Since Java at the lexical level is very close, could in principle - also be used as Java tokenizer, albeit that not all of its keywords + correct C, C++, or Java code. + (Since Java at the lexical level is very close to C, we here sort of misuse + it as a Java tokenizer, albeit that not all of its keywords and some literal pecularities are not recognized.) Recognizes the following lexeme classes: @@ -30,10 +30,11 @@ its starting character. Line and column reflect positions in the physical line structure, not the logical one. All token literals are output exactly as they appear in the source text, - without any interpretation of escaped characters etc. + without any interpretation of escaped characters etc. However, the particular + output format will enforce certain escaping as needed. - Moreover, skips white-space, control characters and comments and - flags anything left over as illegal characters. + Moreover, white-space, control characters and comments are normally skipped + and anything left over is flagged as illegal characters. See these refs for details on the lexical definitions: C++14 Final Working Draft: n4140.pdf @@ -44,7 +45,6 @@ (A TAB is counted as a single character position. A CR causes a transition to a new line.) No trigraph sequences (??x) are recognized. - No alternative tokens except keyword ones for certain operators. No universal characters (\u and \U) in an identifier. Raw strings with R prefix are not supported. 
No preprocessing is attempted: phrases like #include are @@ -69,6 +69,7 @@ 1: illegal character(s) or premature EOF detected 2: look-ahead buffer overflow 3: output file cannot be opened + 4: could not (re-)allocate token buffer C++ Token categories as Regular Expressions: (\b = [01], \o = [0-7], \d = [0-9], \x = [a-fA-F0-9], @@ -78,11 +79,11 @@ - identifier: [_a-zA-Z][_a-zA-Z0-9]* - integer : 0[bB]\b('?\b])*\s? | 0('?\o)*\s? - | 0[xX]\x('?\x)*\s? - | [1-9]('?\d)*\s? + | 0[xX]\x('?\x)*\s? + | [1-9]('?\d)*\s? - floating : .\d('?\d)*([eE][-+]?\d('?\d)*)?[fFlL]? | \d('?\d)*.(\d('?\d)*)?([eE][-+]?\d('?\d)*)?[fFlL]? - | \d('?\d)*[eE][-+]?\d('?\d)*[fFlL]? + | \d('?\d)*[eE][-+]?\d('?\d)*[fFlL]? - string : [uUL]?"([^"\\\n]|\\.|\\\n)*" - character : [uUL]?'([^']|\\.)' - operator : one of these operator and punctuation symbols: @@ -92,851 +93,10 @@ - preprocessor : # | ## */ -#include -#include -#include -#include -#include -#include /* getopt() */ -#include /* basename() */ +#include /* getopt() */ +#include /* basename() */ -/* Let's introduce more parameters so that it becomes easier to - configure the state-machines for the various tokens. - Use a NUL character to disable the parameter, i.e., a NUL value - means "this char is not in effect; a test for it fails". - - FIXME: not yet used! -*/ -// Character that may be used to group digits in a number: -#define CFG_DIGITS_SEP '\'' -// Extra character that may start an identifier: -#define CFG_ID_START_EXTRA '_' -// Extra character that may continue an identifier: -// Maybe allows a set of characters, like also $? -#define CFG_ID_CONT_EXTRA '_' -// May a floating-point number start with a decimal point: -//#define CFG_FLOAT_DOT - -// FIXME: make token size dynamic. -#define MAX_TOKEN 65535 // maximum token length in chars (\0 exclusive) -#define MAX_BUF 8 // maximum buffer size in chars - -// Program globals: -static char *filename = "stdin";// current file being parsed -static unsigned linenr = 1; // line number counted from 1 -static unsigned column = 0; // char position in line, counted from 0 -static unsigned char_count = 0; // total char/byte count -static unsigned utf8_count = 0; // total utf-8 char count -static char buffer[MAX_BUF]; // use buffer as multi-char lookahead. -static unsigned buffered = 0; // number of buffered chars -static unsigned saved_col = 0; // one-place buf for last column on prev line -static unsigned illegals = 0; // count number of illegal characters -static unsigned unexpect_eof = 0; // encountered unexpected EOF -static unsigned num_files = 0; // number of files read -// keyword lookup function: -static const char *(*is_keyword)(const char *); - -// Program option settings: -static int debug = 0; // when 1 debug output to stderr -static int verbose = 0; // when 1 info output to stderr -static int nowarn = 0; // when 1 warnings are suppressed -static int hash_as_comment = 0; // when 1 treat # as line comment -static int start_token = 0; // when 1 start filename pseudo-token -static int newline_token = 0; // when 1 output newline pseudo-token -static int continuous_files = 0;// when 1 do not reset after each file -static enum { C, CPP, JAVA } source = CPP; - -/* No longer using perfect hash function but simple binary search. 
*/ - -/* C11 n1570.pdf 6.4.1 (44) - C17 n2176.pdf 6.4.1 (A.1.2) (44) -*/ -static const char *C_keywords[] = { - "_Alignas", "_Alignof", "_Atomic", "_Bool", "_Complex", - "_Generic", "_Imaginary", "_Noreturn", "_Static_assert", - "_Thread_local", - - "auto", "break", "case", "char", "const", - "continue", "default", "do", "double", "else", - "enum", "extern", "float", "for", "goto", - "if", "inline", "int", "long", "register", - "restrict", "return", "short", "signed", "sizeof", - "static", "struct", "switch", "typedef", "union", - "unsigned", "void", "volatile", "while" -}; - -#if 0 -/* C++ 2014 n4296.pdf 2.11 (84) */ -static const char *CPP_keywords[] = { - "alignas", "alignof", "and", "and_eq", "asm", - "auto", "bitand", "bitor", "bool", "break", - "case", "catch", "char", "char16_t", "char32_t", - "class", "compl", "const", "const_cast", "constexpr", - "continue", "decltype", "default", "delete", "do", - "double", "dynamic_cast", "else", "enum", "explicit", - "export", "extern", "false", "float", "for", - "friend", "goto", "if", "inline", "int", - "long", "mutable", "namespace", "new", "noexcept", - "not", "not_eq", "nullptr", "operator", "or", - "or_eq" "private", "protected", "public", "register", - "reinterpret_cast", "return", "short", "signed", "sizeof", - "static", "static_assert", "static_cast", "struct", "switch", - "template", "this", "thread_local", "throw", "true", - "try", "typedef", "typeid", "typename", "union", - "unsigned", "using", "virtual", "void", "volatile", - "wchar_t", "while", "xor", "xor_eq" -}; -#endif - -/* C++23 n4885.pdf 5.11 (92) */ -static const char *CPP_keywords[] = { - "alignas", "alignof", "and", "and_eq", "asm", - "auto", "bitand", "bitor", "bool", "break", - "case", "catch", "char", "char16_t", "char32_t", - "char8_t", "class", "co_await", "co_return", "co_yield", - "compl", "concept", "const", "const_cast", "consteval", - "constexpr", "constinit", "continue", "decltype", "default", - "delete", "do", "double", "dynamic_cast", "else", - "enum", "explicit", "export", "extern", "false", - "float", "for", "friend", "goto", "if", - "inline", "int", "long", "mutable", "namespace", - "new", "noexcept", "not", "not_eq", "nullptr", - "operator", "or", "or_eq" "private", "protected", - "public", "register", "reinterpret_cast", "requires","return", - "short", "signed", "sizeof", "static", "static_assert", - "static_cast", "struct", "switch", "template", "this", - "thread_local", "throw", "true", "try", "typedef", - "typeid", "typename", "union", "unsigned", "using", - "virtual", "void", "volatile", "wchar_t", "while", - "xor", "xor_eq" -}; - -/* Java SE 8 (50) (false, true, null are literals) */ -/* https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.9 */ -static const char *Java_keywords[] = { - "abstract", "assert", "boolean", "break", "byte", "case", - "catch", "char", "class", "const", "continue", "default", - "do", "double", "else", "enum", "extends", "final", - "finally", "float", "for", "goto", "if", "implements", - "import", "instanceof", "int", "interface", "long", "native", - "new", "package", "private", "protected", "public", "return", - "short", "static", "strictfp","super", "switch", "synchronized", - "this", "throw", "throws", "transient", "try", "void", - "volatile", "while" -}; - -#define num_keywords(lang) sizeof(lang##_keywords)/sizeof(lang##_keywords[0]); - -/* Generic binary search lookup in some keyword table. - `word' to be searched must be NUL-terminated C string. 
- `table' is array of const char * of `size' sorted alphabetically. - Returns word found (i.e., pointer value in table) or 0. -*/ -#define lang_is_keyword(lang) \ - static const char *lang##_is_keyword(const char *word) \ - { \ - int i = 0, j = num_keywords(lang); \ - while (i < j) { \ - int k = (i + j) >> 1 /* / 2 */; \ - const char *kw = lang##_keywords[k]; \ - int cmp = strcmp(word, kw); \ - if (!cmp) \ - return kw; \ - if (cmp < 0) j = k; else i = k + 1; \ - } \ - return 0; \ - } - -/* Define individual is_keyword functions per language: */ -/* C_is_keyword */ -lang_is_keyword(C) -/* CPP_is_keyword */ -lang_is_keyword(CPP) -/* Java_is_keyword */ -lang_is_keyword(Java) - -// Append char cc to token; discard when no more room: -#define token_add(cc) \ - do { if (len < MAX_TOKEN) token[len++] = (cc); } while(0) - -#define utf8_start(cc) (((cc)&0xC0)!=0x80) -#define utf8_follow(cc) (((cc)&0xC0)==0x80) - -#define utf8_len(cc) \ - (((cc)&0xF8)==0xF0 ? 4 : ((cc)&0xF0)==0xE0 ? 3 : ((cc)&0xE0)==0xC0 ? 2 : 1) - -/* Let's assume UTF-8 encoding. - https://www.cprogramming.com/tutorial/unicode.html - https://opensource.apple.com/source/tidy/tidy-2.2/tidy/src/utf8.c.auto.html -*/ - -void unget(int cc) -{ - if (cc == EOF) return; - if (buffered < MAX_BUF) { - if (cc == '\n') { - linenr--; - // column was 0 right after getting the \n - // hopefully there are no multiple ungets of \n - column = saved_col; - } - else - column--; - buffer[buffered++] = cc; - } - else { - fprintf(stderr, "(F): Lookahead buffer overflow (MAX=%u).\n", MAX_BUF); - exit(2); - } -} - -// Act like getchar(). -// Mind linenr,column apply to physical lines not logical ones. -int get(void) -{ - int cc; - - restart: - // Get the next character: - if (buffered) // chars available in lookahead buffer - cc = buffer[--buffered]; // never EOF - // cc might be \ and followed by fresh \n - // Note: never can have buffered line continuation, i.e., \ \n. - else { // must read fresh char - cc = getchar(); - if (cc == EOF) return EOF; - // Count all chars, even the \ of a line continuation: - char_count++; - if (utf8_start(cc)) utf8_count++; - } - - // Treat Mac line endings ('\r') as regular newlines: - if (cc == '\n' || cc == '\r') { - linenr++; - saved_col = column; - column = 0; - return '\n'; - } - - // Deal with \ line continuations! Must look ahead. - if (cc == '\\') { - // Must look ahead; mind next char might be buffered! - if (buffered) - // Never can have \n for next char: - assert(buffer[buffered-1] != '\n'); - else { - // Must get fresh character: - int nc = getchar(); // do not count yet; maybe must unget - - // Maybe \r \n combination? - if (nc == '\r') { - // Look ahead for \n: - int c2 = getchar(); // do not count yet; maybe must unget - if (c2 == '\n') { - // Skip \r but count it: - char_count++; - utf8_count++; - nc = '\n'; - } - else { - unget(c2); - // nc == '\r' - } - } - - if (nc == '\n') { // 1 logical line: discard \\n combo: - char_count++; // counts the newline - linenr++; // on next physical line - // never unget a continuation - //saved_col = column; - column = 0; - - // Still need to get a character. - // Could again start a line continuation! - goto restart; - } - // Mind nc not \n but maybe \ or \r, then goes to buffer. - unget(nc); - } - // cc == '\\' a regular backslash - } - column++; - return cc; -} - -/* Tokenization of C++ programming language source text. 
- Recognizes: - - identifier - - reserved word/keyword - - binary, octal, decimal, hexadecimal and floating-point numbers - - double-quoted string literal - - single-quoted character literal - - all single, double, and triple operator and punctuation symbols - - the preprocessor tokens # and ## - Skips white-space, control characters and comments and flags anything - left over as illegal characters. - - (In the order of 20 tests per single character worst-case.) - - Returns 0 upon EOF or error. -*/ -int tokenize(char *token, const char **type, unsigned *line, unsigned *col) -{ - unsigned len; - int cc; - *type = ""; - - do { // infinite loop; after token recognized breaks out. - len = 0; - cc = get(); - - restart: - // cc already read. - - /*** WHITE-SPACE ***/ - - // Skip (abutted) space and control chars and comments: - // [ \t\f\v\n] - // while (cc <= ' ' && cc != EOF) - while (isspace(cc) && cc != EOF && cc != '\n') - cc = get(); - if (cc == EOF) - return 0; - if (cc == '\n') { - if (newline_token) { - // token is empty. - *line = linenr-1; - *col = saved_col; - *type = "newline"; - break; - } - cc = get(); - goto restart; - } - // !isspace(cc) && cc != EOF - - /*** OPTIONAL # LINE COMMENT (to ignore preprocessor statements) ***/ - // Java: no preprocessor directives. - - if (cc == '#' && hash_as_comment) { - // Skip till end-of-line (\n exclusive): - while ((cc = get()) != '\n' && cc != EOF) - ; - // cc == '\n' || cc == EOF - goto restart; - } - - /*** LINE COMMENT AND BLOCK COMMENT (C/C++/Java) ***/ - - if (cc == '/') { - cc = get(); - if (cc == '/') { - // Skip till end-of-line (\n exclusive): - while ((cc = get()) != '\n' && cc != EOF) - ; - // cc == '\n' || cc == EOF - goto restart; - } - - if (cc == '*') { - // Remember start position: - unsigned lin = linenr; - - // Skip till */ inclusive: - int nc = get(); // if EOF next get will be EOF too - do { - cc = nc; - nc = get(); - if (nc == EOF) { // Error! - fprintf(stderr, - "(E): [%s:%u] Unexpected end-of-file in /* comment.\n", - filename, lin); - unexpect_eof++; - return 0; - } - } while (cc != '*' || nc != '/'); - // cc == '*' && nc == '/' - cc = get(); - goto restart; - } - // seen / but not // or /* - unget(cc); // char after / - cc = '/'; // restore / - } - - // Start collecting a token. - // Token should finish with cc being last char of token! - *line = linenr; - *col = column-1; // 1 char lookahead - - /*** CHAR and STRING PREFIX (C/C++) ***/ - - // Allow u,U,L prefix for string and char - // FIXME: allow u8 as prefix for string - if (cc == 'L' || cc == 'u' || cc == 'U') { - token[len++] = cc; - cc = get(); - if (cc == '"') - goto string_token; - if (cc == '\'') - goto char_token; - // u,U,L will be interpreted as (start of) identifier. - unget(cc); // char after u,U,L - cc = token[--len]; // restore original and remove from token - } - - /*** IDENTIFIER (C/C++/Java) and KEYWORD (C/C++) ***/ - // Java: false, true, null are literals - // FIXME: Flag to allow .letter as part of identifier? - // (compound identifier) - - // Simplistic solution to allowing Unicode: allow any char >= 128 without - // actual checking for UTF-8. - if (isalpha(cc) || cc == '_' || cc == '$' || cc & 0x80) { - // First char always fits. - token[len++] = cc; - while (isalnum(cc = get()) || cc == '_' || cc == '$' || - cc != EOF && (cc & 0x80)) - token_add(cc); - unget(cc); - token[len] = '\0'; - *type = is_keyword(token) ? 
"keyword" : "identifier"; - break; - } - - /*** INTEGER and FLOATING ***/ - // Java: uses _ in numbers as insignificant separator - // Java: decimal suffix: [lL], float suffix: [fFdD] - // Java: allows hex float - -#if 0 - // Examples: - int bin_num = 0B010101u; - int oct_num = 01234567L; - int hex_num = 0x123ABCLL; - int dec_num = 12345678; - - float flt_num1 = 077.; - float flt_num2 = 077.987; - float flt_num3 = 77.; - float flt_num4 = .77; -#endif - - // . digits ... floating - if (cc == '.') { - // Look ahead for a digit: - int nc; - if (isdigit(nc = get())) { - unget(nc); - goto start_fraction; - } - unget(nc); - // Could go immediately to operator: goto seen_period - } - - if (isdigit(cc)) { // binary, octal, decimal, or hexadecimal literal - // Types of integer literals: - enum { - BIN, OCT, DEC, HEX - } int_lit = cc == '0' ? OCT : DEC; - - // Lookahead: - int nc = get(); - if (int_lit == OCT && (nc == 'x' || nc == 'X')) { - int_lit = HEX; - token_add(cc); // the 0 - cc = nc; // the x or X - } - else - if (int_lit == OCT && (nc == 'b' || nc == 'B')) { - int_lit = BIN; - token_add(cc); // the 0 - cc = nc; // the b or B - } - else - unget(nc); // isdigit(cc) - - do { - token_add(cc); - cc = get(); - - // Allow for ' between `digits': - if (cc == '\'') { - // Keep the ' in the token for now: - token_add(cc); - int nc = get(); - if (isdigit(nc) || int_lit == HEX && isxdigit(nc)) - cc = nc; - else { // Error! - fprintf(stderr, - "(E): [%s:%u] C++14 only allows ' between digits.\n", - filename, linenr); - // what to do? - } - } - } while (isdigit(cc) || int_lit == HEX && isxdigit(cc)); - // !is[x]digit(cc) - - // FIXME: allow hex floats in C - if (int_lit == OCT || int_lit == DEC) { - int floating = 0; - // Seen digits-sequence. Maybe followed by . or e or E? - if (cc == '.') { // fractional part - start_fraction: - floating = 1; - token_add(cc); - // digits? FIXME: again allow ' between digits - while (isdigit(cc = get())) - token_add(cc); - // !isdigit(cc) - } - // cc != '.' || !isdigit(cc) - if (cc == 'e' || cc == 'E') { // exponent - floating = 1; - token_add(cc); - if ((cc = get()) == '-' || cc == '+') { - token_add(cc); - cc = get(); - } - // FIXME: no check for at least 1 digit - // FIXME: again allow ' between digits - while (isdigit(cc)) { - token_add(cc); - cc = get(); - } - // !isdigit(cc) - } - if (floating) { - if (cc == 'f' || cc == 'F' || cc == 'l' || cc == 'L') - token_add(cc); - else - unget(cc); - *type = "floating"; - break; - } - } - - // optional integer suffix: l, ll, lu, llu, u, ul, ull, any case - if (cc == 'l' || cc == 'L') { - token_add(cc); - // maybe another l - cc = get(); - if (cc == 'l' || cc == 'L') { - token_add(cc); - // Here: token is digits[lL][lL] - cc = get(); - } - // maybe a u - if (cc == 'u' || cc == 'U') - // Here: token is digits[lL][lL]?[u|U] - token_add(cc); - else - unget(cc); - } - else if (cc == 'u' || cc == 'U') { - token_add(cc); - // maybe an l - cc = get(); - if (cc == 'l' || cc == 'L') { - token_add(cc); - // Here: token is digits[uU][lL] - cc = get(); - } - // maybe another l - if (cc == 'l' || cc == 'L') - // Here: token is digits[uU][lL]?[lL] - token_add(cc); - else - unget(cc); - } - else - unget(cc); - *type = "integer"; - break; - } - - /*** STRING (C/C++/Java) ***/ - - if (cc == '"') { - string_token: - // First char always fits. - token[len++] = cc; - // Remember start position: - unsigned lin = linenr; - // Watch out for escaped " inside string. - cc = get(); - while (cc != '"') { - if (cc == EOF) { // Error! 
- fprintf(stderr, - "(E): [%s:%u] Unexpected end-of-file in string literal.\n", - filename, lin); - unexpect_eof++; - return 0; - } - token_add(cc); - int nc = get(); - - if (cc == '\\') { - // FIXME: No check on valid escape char! - // ' " ? \ a b f n r t v - token_add(nc); - cc = get(); - } - else - cc = nc; - } - // cc == '"' - token_add(cc); - *type = "string"; - break; - } - - /*** CHARACTER (C/C++/Java) ***/ - - if (cc == '\'') { - char_token: - // First char always fits. - token[len++] = cc; - // Watch out for escaped ' inside char. - cc = get(); - // FIXME: Cannot have empty char! - while (cc != '\'') { - if (cc == EOF) { // Error! - fprintf(stderr, - "(E): [%s:%u] Unexpected end-of-file in char literal.\n", - filename, linenr); - unexpect_eof++; - return 0; - } - token_add(cc); - int nc = get(); - if (cc == '\\') { - token_add(nc); - cc = get(); - // FIXME: No check on valid escape char! - // ' " ? \ a b f n r t v 0[d[d]] xh* - } - else - cc = nc; - } - // cc == '\'' - token_add(cc); - *type = "character"; - break; - } - - /*** OPERATOR (and PUNCTUATION) (C/C++/Java) ***/ - - // Operator and punctuation symbols. Longest match. - - /* Operator or punctuator Alternative representation - { <% - } %> - [ <: - ] :> - # %: (not supported here) - ## %:%: (not supported here) - */ - - // Single char operator or punctuator (C/C++/Java) - // { } [ ] ( ) ; : ? . ~ ! + - * / % ^ = & | < > , - // Double char operator or punctuator (C/C++) - // <: :> <% %> - // Double char operator or punctuator (C/C++/Java) - // += -= *= /= %= ^= &= |= == != <= >= && || << >> ++ -- -> - // Double char operator or punctuator (C++/Java) - // :: - // Double char operator or punctuator (C++) - // .* - // Triple char operator or punctuator (C/C++/Java) - // ... <<= >>= - // Triple char operator or punctuator (C++) - // ->* <=> - // Java: @ >>> >>>= - - //seen_period: - - // First char always fits. - token[len++] = cc; - token[len] = '\0'; - //token=[cc,0];len=1 - - if (strstr("{}[]();?~,@", token)) { // allow @ for Java - // Single char operator/punctuator. - *type = "operator"; - break; - } - - if (strstr("<:.-+*/%^&|=!>", token)) { // single or start of double/triple - // Check second char: - int c2 = get(); - if (c2 != EOF) { - token[len++] = c2; - //token=[cc,c2];len=2 - - // Check third char: - int c3 = get(); - if (c3 != EOF) { - token[len++] = c3; - token[len] = '\0'; - //token=[cc,c2,c3,0];len=3 - if (!strcmp(">>>", token)) { // allow >>> for Java - //token=[>,>,>,0];len=3 - // Look-ahead for =: - int c4 = get(); - if (c4 == '=') // >>>= for Java - token[len++] = c4; - //token=[>,>,>,=];len=4 - else - unget(c4); - //token=[>,>,>,0];len=3 - *type = "operator"; - break; - } - //token=[cc,c2,c3,0];len=3 - - if (!strcmp("...", token) || - !strcmp("<=>", token) || - !strcmp("->*", token) || - !strcmp("<<=", token)) { - // Triple char operator/punctuator. - *type = "operator"; - break; - } - - // Maybe double char. Undo the c3 token extension: - token[--len] = '\0'; - //token=[cc,c2,0];len=2 - } - else - token[len] = '\0'; - //token=[cc,c2,0];len=2 - unget(c3); - - // Maybe double char. 
- static const char * const ops2[] = { - "<:", "<%", "<=", "<<", ":>", - "::", ".*", "->", "-=", "--", - "+=", "++", "*=", "/=", "%>", - "%=", "^=", "&=", "&&", "|=", - "||", "==", "!=", ">=", ">>" - }; - unsigned size = sizeof(ops2) / sizeof(ops2[0]); - unsigned i; - for (i = 0; i < size; i++) - if (!strcmp(ops2[i], token)) - break; - if (i < size) { - *type = "operator"; - break; - } - //token=[cc,c2,0];len=2 - - // Must be single char. Undo the c2 token extension: - token[--len] = '\0'; - //token=[cc,0];len=1 - } - //else token=[cc,0];len=1 - - // Must be single char. - unget(c2); - *type = "operator"; - break; - } - //token=[cc,0];len=1 - - /*** PREPROCESSOR (C/C++) ***/ - - if (cc == '#') { - int nc = get(); - if (nc != '#') - unget(nc); - else - token[len++] = nc; - *type = "preprocessor"; - break; - } - - // What is left here? Illegal chars! - if (!nowarn) - // Mind non-printing chars! - fprintf(stderr, - "(W): [%s:%u] Illegal character `%s%c` (0x%02x) skipped.\n", - filename, linenr, cc<32?"CTRL-":"", cc<32?cc+64:cc, cc); - // Count them: - illegals++; - - } while (1); - // len <= MAX_TOKEN - token[len] = '\0'; - return 1; -} - -// Escape token for output as CSV string. -void CSV_escape(FILE *out, const char *token) -{ - const char *p; - // start CSV string: - fputc('"', out); - for (p = token; *p; p++) { - if (*p == '"') - fputc('"', out); - fputc(*p, out); - } - // end CSV string: - fputc('"', out); -} - -// Escape token for output as JSON string. -void JSON_escape(FILE *out, const char *token) -{ - // C/C++ has escapes: \' \" \? \a \b \f \n \r \t \v \x \0. - // To preserve, simply escape the escape and all ": - const char *p; - for (p = token; *p; p++) { - if (*p == '\\' || *p == '"') - fputc('\\', out); - fputc(*p, out); - } -} - -// Escape token for output as XML text. -void XML_escape(FILE *out, const char *token) -{ -#if 1 - // Alternative: escape every <, >, and &: - const char *p; - for (p = token; *p; p++) { - if (*p == '<') - fputs("<", out); - else - if (*p == '>') - fputs(">", out); - else - if (*p == '&') - fputs("&", out); - else - fputc(*p, out); - } -#else - // User CDATA construct for escaping. - // Impossible to escape ]]> occurring in token! - // Must chop up the substring ]]> in ]] and >. - const char *p; - const char *q = token; - // "abc]]>hello" => hello"]]> - // "]]>]]>" => ]]]]>"]]> - while ((p = strstr(q, "]]>"))) { - int len = p - q; // always > 0 - fputs("", out); - q = p+2; // q start at >... 
- } - if (q < token+strlen(token)) - fprintf(out, "", q); -#endif -} +#include "libtoken.h" int main(int argc, char *argv[]) { @@ -944,17 +104,22 @@ int main(int argc, char *argv[]) extern int opterr; extern int optind; int option; - char const *opt_str = "1acdhjl:m:no:rsvw"; + char const *opt_str = "1acdhjkl:m:nNo:rsvwW"; char usage_str[80]; - char token[MAX_TOKEN+1]; /* leave room for a terminating NUL */ + const char *token; const char *type; unsigned line; unsigned col; + unsigned token_len; + unsigned num_files = 0; // number of files read + int start_token = 0; // when 1 start filename pseudo-token + int continuous_files = 0; // when 1 do not reset after each file char *outfile = 0; enum { PLAIN, CSV, JSON, JSONL, XML, RAW } mode = PLAIN; int first_time = 1; + Language source; int explicit_source = 0; int append = 0; @@ -984,7 +149,7 @@ int main(int argc, char *argv[]) fputs( "A tokenizer for C/C++ (and Java) source code with output in 6 formats.\n" "Recognizes the following token classes: keyword, identifier, integer,\n" -"floating, string, character, operator, and preprocessor.\n\n", stdout); +"floating, string, character, operator, and preprocessor.\n\n", stderr); fprintf(stderr, usage_str, basename(argv[0])); fputs( "\nCommand line options are:\n" @@ -993,34 +158,32 @@ fputs( "-d : print debug info to stderr; implies -v.\n" "-h : print just this text to stderr and stop.\n" "-j : assume input is Java (deprecated: use -l Java or .java).\n" +"-k : output line and block comments as tokens.\n" "-l : specify language explicitly (C, C++, Java).\n" "-m : output mode either plain (default), csv, json, jsonl, xml, or raw.\n" "-n : output newlines as a special pseudo token.\n" +"-N : output line continuations as a special pseudo token.\n" "-o : write output to this file (instead of stdout).\n" "-s : enable a special start token specifying the filename.\n" "-1 : treat all filename arguments as a continuous single input.\n" "-v : print action summary to stderr.\n" -"-w : suppress all warning messages.\n", +"-w : suppress all warning messages.\n" +"-W : output adjacent white-space as a token.\n", stderr); return 0; case 'j': - source = JAVA; + source = set_or_detect_lang("Java"); explicit_source = 1; break; + case 'k': + comment_token = 1; + break; + case 'l': - if (!strcmp(optarg, "C")) - source = C; - else if (!strcmp(optarg, "C++")) - source = CPP; - else if (!strcmp(optarg, "Java")) - source = JAVA; - else { - if (!nowarn) - fprintf(stderr, "(W): Unknown source %s (assuming C++).\n", optarg); - } - explicit_source = 1; + source = set_or_detect_lang(optarg); + explicit_source = 1; break; case 'm': @@ -1037,7 +200,7 @@ fputs( else if (!strcmp(optarg, "raw")) mode = RAW; else { - if (!nowarn) + if (!nowarn) fprintf(stderr, "(W): Invalid mode %s (using plain).\n", optarg); mode = PLAIN; } @@ -1047,6 +210,10 @@ fputs( newline_token = 1; break; + case 'N': + continuation_token = 1; + break; + case 'o': outfile = optarg; break; @@ -1063,6 +230,10 @@ fputs( nowarn = 1; break; + case 'W': + whitespace_token = 1; + break; + case '?': default: fputs("(F): unknown option. 
Stop.\n", stderr); @@ -1088,34 +259,14 @@ fputs( fprintf(stderr, "(W): Cannot read file %s.\n", filename); continue; } - if (!explicit_source) { - // Determine language from extension: - int len = strlen(filename); - if (len > 2 && !strcmp(filename+len-2, ".c")) - source = C; - else if (len > 4 && !strcmp(filename+len-4, ".cpp")) - source = CPP; - else if (len > 5 && !strcmp(filename+len-5, ".java")) - source = JAVA; - } + + if (!explicit_source) + source = set_or_detect_lang(0); doit: if (verbose) fprintf(stderr, "(I): Processing file %s...\n", filename); num_files++; - // Determine which keyword lookup function to use: - switch (source) { - case C: - is_keyword = C_is_keyword; - break; - case CPP: - is_keyword = CPP_is_keyword; - break; - case JAVA: - is_keyword = Java_is_keyword; - break; - } - // Header: switch (mode) { case RAW: @@ -1127,48 +278,50 @@ fputs( break; case CSV: if (!continuous_files || num_files == 1) - fputs("line,column,class,token\n", stdout); + fputs("line,column,class,token\n", stdout); if (start_token) fprintf(stdout, "0,0,filename,\"%s\"\n", filename); break; case JSON: case JSONL: if (!continuous_files || num_files == 1) { - if (mode == JSON) fputs("[\n", stdout); + if (mode == JSON) fputs("[\n", stdout); } else { - if (mode == JSON) fputc(',', stdout); - fputc('\n', stdout); - first_time = 1; + if (mode == JSON) fputc(',', stdout); + fputc('\n', stdout); + first_time = 1; } if (start_token) { fprintf(stdout, "{ \"line\": 0, \"column\": 0, " - "\"class\": \"filename\", \"token\": \"%s\" }", - filename); - first_time = 0; + "\"class\": \"filename\", \"length\": %d, \"token\": \"%s\" }", + strlen(filename), filename); + first_time = 0; } break; case XML: if (!continuous_files || num_files == 1) { - fputs("\n", stdout); - // standalone="yes" - fputs("\n", stdout); + fputs("\n", stdout); + // standalone='yes' + fputs("\n", stdout); } if (start_token) { - fprintf(stdout, ""); - XML_escape(stdout, filename); + fprintf(stdout, + "", + strlen(filename)); + XML_escape(stdout, filename); fputs("\n", stdout); } break; } - while (tokenize(token, &type, &line, &col)) { + while ((token_len = C_tokenize(&token, &type, &line, &col))) { switch (mode) { case RAW: fputs(token, stdout); - fputc('\n', stdout); - break; + fputc('\n', stdout); + break; case PLAIN: fprintf(stdout, "(%4u,%3u) %s: %s\n", line, col, type, token); break; @@ -1179,7 +332,12 @@ fputs( if (!strcmp(type, "string") || // Do we need this too? Yes! !strcmp(type, "character") && strchr(token, '"') || - !strcmp(type, "character") && strchr(token, ',')) + !strcmp(type, "character") && strchr(token, ',') || + !strcmp(type, "whitespace") && strchr(token, '\n') || + !strcmp(type, "newline") || + !strcmp(type, "continuation") || + comment_token && (!strcmp(type, "line_comment") || + !strcmp(type, "block_comment"))) CSV_escape(stdout, token); else if (!strcmp(token, ",")) fputs("\",\"", stdout); @@ -1194,24 +352,28 @@ fputs( else { if (mode == JSON) fputc(',', stdout); fputc('\n', stdout); - } + } fprintf(stdout, "{ \"line\": %u, \"column\": %u, " - "\"class\": \"%s\", \"token\": \"", - line, col, type); + "\"class\": \"%s\", \"length\": %u, \"token\": \"", + line, col, type, token_len); // token value is always a JSON string. 
- if (!strcmp(type, "string") || !strcmp(type, "character")) + if (!strcmp(type, "string") || !strcmp(type, "character") || + !strcmp(type, "newline") || !strcmp(type, "whitespace") || + !strcmp(type, "continuation")) JSON_escape(stdout, token); else fputs(token, stdout); fputs("\" }", stdout); break; case XML: - fprintf(stdout, "", - line, col, type); - if (!strcmp(type, "string") - || !strcmp(type, "character") - || !strcmp(type, "operator")) + fprintf(stdout, "", + line, col, type, token_len); + if (!strcmp(type, "string") || + !strcmp(type, "character") || + !strcmp(type, "operator") || + comment_token && (!strcmp(type, "line_comment") || + !strcmp(type, "block_comment"))) XML_escape(stdout, token); else fputs(token, stdout); @@ -1224,25 +386,25 @@ fputs( // Trailer: switch (mode) { case RAW: - break; + break; case PLAIN: - break; + break; case CSV: - break; + break; case JSON: - fputs("\n]", stdout); - /*FALL THROUGH*/ + fputs("\n]", stdout); + /*FALL THROUGH*/ case JSONL: - fputc('\n', stdout); - break; + fputc('\n', stdout); + break; case XML: - fputs("\n", stdout); - break; + fputs("\n", stdout); + break; } if (verbose) - fprintf (stderr, "(I): %u bytes, %u UTF-8 encoded chars.\n", - char_count, utf8_count); + fprintf (stderr, "(I): %u bytes, %u UTF-8 encoded chars.\n", + char_count, utf8_count); // Reset globals: char_count = 0; @@ -1277,7 +439,7 @@ fputs( if (verbose) fprintf(stderr, "(I): %u bytes, %u (UTF-8 encoded) unicode characters.\n", - char_count, utf8_count); + char_count, utf8_count); } if (num_files > 1 && verbose) From 2623241f9d048d054bfdc2ea3af1027efb21c58a Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Mon, 4 Oct 2021 12:32:18 -0400 Subject: [PATCH 11/34] untabify --- tools/tokenizer/token_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/tokenizer/token_common.h b/tools/tokenizer/token_common.h index 1ea4706..2cfe81e 100644 --- a/tools/tokenizer/token_common.h +++ b/tools/tokenizer/token_common.h @@ -47,7 +47,7 @@ extern int nowarn/*= 0*/; // when 1 warnings are suppressed extern Language source/*= C*/; // language mode extern const char *is_keyword(const char *word, - const char *table[], unsigned size); + const char *table[], unsigned size); extern int get(void); extern void unget(int cc); From 21b15b571c4f9ab3699d039c149af3aaf464e057 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Mon, 4 Oct 2021 12:32:42 -0400 Subject: [PATCH 12/34] untabify --- tools/tokenizer/token_common.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/tokenizer/token_common.c b/tools/tokenizer/token_common.c index 6eefa7e..ba57ba1 100644 --- a/tools/tokenizer/token_common.c +++ b/tools/tokenizer/token_common.c @@ -25,7 +25,7 @@ unsigned num_files = 0; // number of files read int debug = 0; // when 1 debug output to stderr int verbose = 0; // when 1 info output to stderr int nowarn = 0; // when 1 warnings are suppressed -Language source = C; // language mode +Language source = C; // language mode /* Conversion table from filename extension to language code. To find language code, consider all entries and check each ext @@ -52,7 +52,7 @@ static const struct { const char *ext; Language lang; const char *name; } Returns word found (i.e., pointer value in table) or 0. 
*/ const char *is_keyword(const char *word, - const char *table[], unsigned size) + const char *table[], unsigned size) { int i = 0, j = size; while (i < j) { @@ -90,7 +90,7 @@ void remove_BOM(void) if (c2 == 0xBB) { int c3 = getchar(); if (c3 == 0xBF) { - return; + return; } if (c3 != EOF) buffer[buffered++] = c3; } @@ -217,7 +217,7 @@ Language detect_lang(void) int i; for (i = 0; i < sizeof(langs)/sizeof(langs[0]); i++) if (!strcmp(p, langs[i].ext)) - return langs[i].lang; + return langs[i].lang; } return C; } From 23652eb33453338fde0c9b3362e90cd90af47350 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Mon, 4 Oct 2021 16:31:43 -0400 Subject: [PATCH 13/34] more comments; better #define handling; started handling template < --- tools/tokenizer/filter6.awk | 92 +++++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 29 deletions(-) diff --git a/tools/tokenizer/filter6.awk b/tools/tokenizer/filter6.awk index 64061d0..6d5e3ea 100755 --- a/tools/tokenizer/filter6.awk +++ b/tools/tokenizer/filter6.awk @@ -14,11 +14,15 @@ # Ambiguous tokens in C/C++: # < > delimiters of filename in preprocessor include directive +# Resolved by using preceding #include context # < > delimiters of template parameters # < less than operator +# Resolve: preceding context keyword template, template <+ # > greater than operator +# Resolve: preceding context keyword template < # " " delimiters of filename in preprocessor include directive # " " delimiters of string literal +# Resolved by using preceding #include context # ( ) expression grouping # ( ) argument list # { } block @@ -29,9 +33,11 @@ # ~ unary operator # - unary operator # - binary operator -# * unary operator -# * binary operator +# Resolve: no white-space after - then unary? +# * unary operator (dereference pointer) +# * binary operator (multiplication) # * pointer declarator +# Can of worms: overloaded operator symbols # Simplistic CPP line syntax: # "#" directive-name (token)* newline @@ -91,7 +97,7 @@ BEGIN { # Make sure all conditions are mutually exclusive, except last one. # Last one is made exclusive by next_state==-1. # Must use next_state to avoid immediate action on current line. - +# All rules that match must set next_state to something other than -1. # Instead of composing new CSV record could also modify $0 via # assignments to its fields (like $3="identifier"). @@ -102,6 +108,12 @@ BEGIN { next_state=1 } +# The keyword template provides context for some < and > disambiguation. +(state == 0 && $4 == "template") { + print $0 + next_state=0 # switched off for now +} + # # seen; expect directive or identifier. (state == 1 && $3 == "identifier") { push($0) @@ -137,8 +149,6 @@ BEGIN { next_state=0 } -# (state == 2 && anything else) => default action. - # Collect all tokens after the < till >. # Treat first (assume its an identifier) specially to get its coordinates. (state == 3 && $3 == "identifier") { @@ -149,29 +159,42 @@ BEGIN { next_state=4 } -# Keep collecting tokens till >. -(state == 4 && $4 != ">") { +# Keep collecting tokens till > or newline. +(state == 4 && $3 != "newline" && $4 != ">") { # eats up anything filename=filename $4 # Note: suppressing this token. next_state=4 } -# Seen #include <...>. -(state == 4 && $4 == ">") { +# Seen #include <...>, or #include <...newline. +(state == 4 && ($3 == "newline" || $4 == ">")) { + # When newline it's an error, but act as if > was present: empty_out() print id_lin "," id_col ",string-sys-filename,\"" filename "\"" # Note: suppressing this token. 
next_state=0 } -# states 5, 6 not used for now. +# Handle template <. +(state == 5 && $4 == "<") { + $3="start-template-paramlist" + print $0 + next_state=6 +} + +# Handle template < >, explicit specialization. +(state == 6 && $4 == ">") { + $3="end-template-paramlist" + print $0 + next_state=0 +} # Handle #define name. (state == 7 && $3 == "identifier") { id_lin=$1 id_col=$2 macro_name=$4 - # Note: modifying this token. + # Note: modifying this token later. next_state=8 } @@ -183,35 +206,46 @@ BEGIN { next_state=0 } -# Handle #define name (. +# Handle #define name whitespace (state == 8 && $3 == "whitespace") { - empty_out() - print id_lin "," id_col ",identifier-macro-const," macro_name - next_state=0 + # Note: suppressing this token. + next_state=9 } -# Handle #define name. -(state == 8 && $3 != "whitespace" && $4 != "(") { +# Handle #define name [whitespace] newline +((state == 8 || state == 9) && $3 == "newline") { empty_out() print id_lin "," id_col ",identifier-macro-def," macro_name + print $0 + next_state=0 +} - if ($4 == "#") { # With -n should never happen. - push($0) - next_state=1 - } - else { # Most probably a newline. - print $0 - next_state=0 - } +# Handle #define name whitespace !newline. +(state == 9 && $3 != "newline") { + empty_out() + print id_lin "," id_col ",identifier-macro-const," macro_name + print $0 + next_state=0 } -# Default rule; always executed. +# Default rule; always executed: +# 1. no prior rule matched: +# - print current token except for whitespace +# - back to state 0 to quickly recover for any errors in input +# - stay in same state only for whitespace, newline, and continuation; +# this allows for their presence without explicit mention in rules +# 2. some rule matched: +# - simply move on to next state as stated in that rule +# - reset next_state to -1 { if (next_state == -1) { - # Echo all other tokens as is (ignore whitespace though): - if ($3 != "whitespace") + # Echo the current token as is (ignore whitespace though): + if ($3 != "whitespace") { print $0 - # Do not change state! + if ($3 != "newline" && $3 != "continuation") + state=0 + } + # otherwise: Do not change state! } else { state=next_state From f601a6040cec92d91e61df00b507a5d9b8d5a81f Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Mon, 4 Oct 2021 17:32:25 -0400 Subject: [PATCH 14/34] minor changes in comments; empty_out when error --- tools/tokenizer/filter6.awk | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tools/tokenizer/filter6.awk b/tools/tokenizer/filter6.awk index 6d5e3ea..c803030 100755 --- a/tools/tokenizer/filter6.awk +++ b/tools/tokenizer/filter6.awk @@ -97,7 +97,7 @@ BEGIN { # Make sure all conditions are mutually exclusive, except last one. # Last one is made exclusive by next_state==-1. # Must use next_state to avoid immediate action on current line. -# All rules that match must set next_state to something other than -1. +# All rules when matched must set next_state to something other than -1. # Instead of composing new CSV record could also modify $0 via # assignments to its fields (like $3="identifier"). @@ -166,12 +166,14 @@ BEGIN { next_state=4 } -# Seen #include <...>, or #include <...newline. +# Handling #include <...>, or #include <...newline. (state == 4 && ($3 == "newline" || $4 == ">")) { # When newline it's an error, but act as if > was present: empty_out() print id_lin "," id_col ",string-sys-filename,\"" filename "\"" - # Note: suppressing this token. 
+ if ($3 == "newline") + print $0 + # else suppressing the > token. next_state=0 } @@ -212,7 +214,7 @@ BEGIN { next_state=9 } -# Handle #define name [whitespace] newline +# Handle #define name whitespace? newline ((state == 8 || state == 9) && $3 == "newline") { empty_out() print id_lin "," id_col ",identifier-macro-def," macro_name @@ -230,10 +232,11 @@ BEGIN { # Default rule; always executed: # 1. no prior rule matched: -# - print current token except for whitespace -# - back to state 0 to quickly recover for any errors in input # - stay in same state only for whitespace, newline, and continuation; # this allows for their presence without explicit mention in rules +# - output any previously suppressed tokens (to not lose them) +# - print current token except for whitespace +# - back to state 0 to quickly recover for any errors in input # 2. some rule matched: # - simply move on to next state as stated in that rule # - reset next_state to -1 @@ -241,9 +244,11 @@ BEGIN { if (next_state == -1) { # Echo the current token as is (ignore whitespace though): if ($3 != "whitespace") { - print $0 - if ($3 != "newline" && $3 != "continuation") + if ($3 != "newline" && $3 != "continuation") { + empty_out() state=0 + } + print $0 } # otherwise: Do not change state! } From 11db8b84a756735c14a6bdad0f5cfcb0359e9289 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Mon, 4 Oct 2021 22:21:19 -0400 Subject: [PATCH 15/34] missed >>= as operator, output as >> =; fixed --- tools/tokenizer/libtoken.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/tokenizer/libtoken.c b/tools/tokenizer/libtoken.c index 36ecc2a..f6b188e 100644 --- a/tools/tokenizer/libtoken.c +++ b/tools/tokenizer/libtoken.c @@ -1066,7 +1066,8 @@ unsigned C_tokenize(const char **token, const char **type, if (!strcmp("...", token_buf) || !strcmp("<=>", token_buf) || !strcmp("->*", token_buf) || - !strcmp("<<=", token_buf)) { + !strcmp("<<=", token_buf) || + !strcmp(">>=", token_buf)) { // Triple char operator/punctuator. *type = "operator"; break; From 0179b7e4d7691365c5380a4cdf591dd67b106cf5 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Wed, 6 Oct 2021 12:38:09 -0400 Subject: [PATCH 16/34] #include <> can have keyword inside; fixed --- tools/tokenizer/filter6.awk | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/tokenizer/filter6.awk b/tools/tokenizer/filter6.awk index c803030..2e26d54 100755 --- a/tools/tokenizer/filter6.awk +++ b/tools/tokenizer/filter6.awk @@ -37,6 +37,8 @@ # * unary operator (dereference pointer) # * binary operator (multiplication) # * pointer declarator +# & bitwise and operator +# & address of operator # Can of worms: overloaded operator symbols # Simplistic CPP line syntax: @@ -150,8 +152,8 @@ BEGIN { } # Collect all tokens after the < till >. -# Treat first (assume its an identifier) specially to get its coordinates. -(state == 3 && $3 == "identifier") { +# Treat first specially to get its coordinates. 
+(state == 3 && ($3 == "identifier" || $3 == "keyword")) { id_lin=$1 id_col=$2 filename=$4 From e943aacb9af5559ecf0d3d5962697cc96809c732 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Thu, 7 Oct 2021 16:35:29 -0400 Subject: [PATCH 17/34] macro name can be keyword; fixed --- tools/tokenizer/filter6.awk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/tokenizer/filter6.awk b/tools/tokenizer/filter6.awk index 2e26d54..3105a63 100755 --- a/tools/tokenizer/filter6.awk +++ b/tools/tokenizer/filter6.awk @@ -194,7 +194,7 @@ BEGIN { } # Handle #define name. -(state == 7 && $3 == "identifier") { +(state == 7 && ($3 == "identifier" || $3 == "keyword")) { id_lin=$1 id_col=$2 macro_name=$4 From 8803f482b557ad61c39455b569ecaf5fc40c7b91 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Thu, 7 Oct 2021 17:30:11 -0400 Subject: [PATCH 18/34] changed internal token class to int instead of string --- tools/tokenizer/libtoken.c | 74 ++++++++++++++++++++++++++------------ tools/tokenizer/libtoken.h | 24 ++++++++++++- tools/tokenizer/tokenize.c | 43 +++++++++++----------- 3 files changed, 95 insertions(+), 46 deletions(-) diff --git a/tools/tokenizer/libtoken.c b/tools/tokenizer/libtoken.c index f6b188e..3191f7e 100644 --- a/tools/tokenizer/libtoken.c +++ b/tools/tokenizer/libtoken.c @@ -33,6 +33,24 @@ int continuation_token = 0; // when 1 output line continuation pseudo-token static int logical_lines = 0; // when 1 ignore line continuations in get) +// Must be synced with enum TokenClass! +const char *token_class[] = { + /* 0*/ "identifier", + /* 1*/ "keyword", + /* 2*/ "string", + /* 3*/ "character", + /* 4*/ "integer", + /* 5*/ "floating", + /* 6*/ "operator", + /* 7*/ "preprocessor", + /* 8*/ "line_comment", + /* 9*/ "block_comment", + /*10*/ "whitespace", + /*11*/ "newline", + /*12*/ "continuation", + /*13*/ "filename" +}; + /* No longer using perfect hash function but simple binary search. */ /* C11 n1570.pdf 6.4.1 (44) @@ -503,11 +521,12 @@ static void token_buf_close(void) Returns 0 upon EOF else the token length in bytes. (There are no 0-length tokens!) */ -unsigned C_tokenize(const char **token, const char **type, - unsigned *line, unsigned *col) + +unsigned C_tokenize_int(const char **token, enum TokenClass *type, + unsigned *line, unsigned *col) { int cc; - *type = ""; + *type = 0; do { // infinite loop; after token recognized breaks out. // Start collecting a token. @@ -537,7 +556,7 @@ unsigned C_tokenize(const char **token, const char **type, if (cc == '\n' && newline_token) { // end of a logical line // Here we assume the buf is empty. token_buf_push(cc); - *type = "newline"; + *type = NEWLINE; break; } @@ -545,7 +564,7 @@ unsigned C_tokenize(const char **token, const char **type, // Here we assume the buf is empty. 
token_buf_push('\\'); token_buf_push('\n'); - *type = "continuation"; + *type = CONTINUATION; break; } @@ -571,7 +590,7 @@ unsigned C_tokenize(const char **token, const char **type, if (whitespace_token) { // Undo lookahead (unget(EOF) has no effect!): unget(cc); // next token will be newline - *type = "whitespace"; + *type = WHITESPACE; token_buf_close(); *token = token_buf; return token_len; @@ -585,7 +604,7 @@ unsigned C_tokenize(const char **token, const char **type, if (whitespace_token) { // Undo lookahead (unget(EOF) has no effect!): unget(cc); // next token will be continuation - *type = "whitespace"; + *type = WHITESPACE; token_buf_close(); *token = token_buf; return token_len; @@ -599,7 +618,7 @@ unsigned C_tokenize(const char **token, const char **type, if (whitespace_token && token_len) { // Undo lookahead (unget(EOF) has no effect!): unget(cc); - *type = "whitespace"; + *type = WHITESPACE; break; } @@ -630,7 +649,7 @@ unsigned C_tokenize(const char **token, const char **type, if (comment_token) { // Undo lookahead (unget(EOF) has no effect!): unget(cc); - *type = "line_comment"; + *type = LINE_COMMENT; break; } *line = linenr-1; @@ -656,7 +675,7 @@ unsigned C_tokenize(const char **token, const char **type, if (comment_token) { // Undo lookahead (unget(EOF) has no effect!): unget(cc); - *type = "line_comment"; + *type = LINE_COMMENT; break; } *line = linenr-1; @@ -689,7 +708,7 @@ unsigned C_tokenize(const char **token, const char **type, // cc == '*' && nc == '/' // Don't consider char right after */ as part of comment. if (comment_token) { - *type = "block_comment"; + *type = BLOCK_COMMENT; break; } *line = linenr; @@ -736,7 +755,7 @@ unsigned C_tokenize(const char **token, const char **type, token_buf_push(cc); unget(cc); token_buf_close(); - *type = is_keyword(token_buf) ? "keyword" : "identifier"; + *type = is_keyword(token_buf) ? KEYWORD : IDENTIFIER; break; } @@ -847,7 +866,7 @@ unsigned C_tokenize(const char **token, const char **type, token_buf_push(cc); else unget(cc); - *type = "floating"; + *type = FLOATING; break; } } @@ -887,7 +906,7 @@ unsigned C_tokenize(const char **token, const char **type, } else unget(cc); - *type = "integer"; + *type = INTEGER; break; } @@ -920,7 +939,7 @@ unsigned C_tokenize(const char **token, const char **type, } // cc == '"' token_buf_push(cc); - *type = "string"; + *type = STRING; break; } @@ -938,7 +957,7 @@ unsigned C_tokenize(const char **token, const char **type, filename, linenr); // Output as token anyway, but count as illegal: token_buf_push(cc); - *type = "character"; + *type = CHARACTER; illegals++; break; } @@ -990,7 +1009,7 @@ unsigned C_tokenize(const char **token, const char **type, token_buf_push(cc); else unget(cc); - *type = "character"; + *type = CHARACTER; break; } @@ -1031,7 +1050,7 @@ unsigned C_tokenize(const char **token, const char **type, if (strstr("{}[]();?~,@", token_buf)) { // allow @ for Java // Single char operator/punctuator. - *type = "operator"; + *type = OPERATOR; break; } @@ -1058,7 +1077,7 @@ unsigned C_tokenize(const char **token, const char **type, else unget(c4); //token=[>,>,>,0];len=3 - *type = "operator"; + *type = OPERATOR; break; } //token=[cc,c2,c3,0];len=3 @@ -1069,7 +1088,7 @@ unsigned C_tokenize(const char **token, const char **type, !strcmp("<<=", token_buf) || !strcmp(">>=", token_buf)) { // Triple char operator/punctuator. 
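 /* From this patch on, *type assignments use enum TokenClass while the
    output side indexes the token_class[] string table, so the two must
    stay in lock-step. A cheap compile-time guard placed next to the
    table definition (a sketch, assuming C11 _Static_assert is
    available):

      _Static_assert(sizeof(token_class)/sizeof(token_class[0])
                     == FILENAME + 1, "token_class[] out of sync");
 */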
- *type = "operator"; + *type = OPERATOR; break; } @@ -1097,7 +1116,7 @@ unsigned C_tokenize(const char **token, const char **type, if (!strcmp(ops2[i], token_buf)) break; if (i < size) { - *type = "operator"; + *type = OPERATOR; break; } //token=[cc,c2,0];len=2 @@ -1111,7 +1130,7 @@ unsigned C_tokenize(const char **token, const char **type, // Must be single char. unget(c2); - *type = "operator"; + *type = OPERATOR; break; } //token=[cc,0];len=1 @@ -1124,7 +1143,7 @@ unsigned C_tokenize(const char **token, const char **type, unget(nc); else token_buf_push(nc); - *type = "preprocessor"; + *type = PREPROCESSOR; break; } @@ -1143,6 +1162,15 @@ unsigned C_tokenize(const char **token, const char **type, return token_len; } +unsigned C_tokenize(const char **token, const char **type, + unsigned *line, unsigned *col) +{ + enum TokenClass typeid; + unsigned result = C_tokenize_int(token, &typeid, line, col); + *type = token_class[typeid]; + return result; +} + // Escape hard newlines in a string. void RAW_escape(FILE *out, const char *token) { diff --git a/tools/tokenizer/libtoken.h b/tools/tokenizer/libtoken.h index 94f0195..a6ca3a8 100644 --- a/tools/tokenizer/libtoken.h +++ b/tools/tokenizer/libtoken.h @@ -52,6 +52,25 @@ extern int comment_token/*= 0*/; // when 1 output comments as tokens extern int whitespace_token/*= 0*/; // when 1 output adjacent white-space as a token extern int continuation_token/*= 0*/; // when 1 output line continuation pseudo-token +enum TokenClass { + /* 0*/ IDENTIFIER, + /* 1*/ KEYWORD, + /* 2*/ STRING, + /* 3*/ CHARACTER, + /* 4*/ INTEGER, + /* 5*/ FLOATING, + /* 6*/ OPERATOR, + /* 7*/ PREPROCESSOR, + /* 8*/ LINE_COMMENT, + /* 9*/ BLOCK_COMMENT, + /*10*/ WHITESPACE, + /*11*/ NEWLINE, + /*12*/ CONTINUATION, + /*13*/ FILENAME +}; + +extern const char *token_class[]; + // keyword lookup function (pointer variable): // (initialized by set_or_detect_lang()) extern const char *(*is_keyword)(const char *); @@ -62,7 +81,10 @@ extern Language set_or_detect_lang(const char *source); extern const char *lang_name(Language lang); extern int open_as_stdin(const char *file); -extern unsigned C_tokenize(const char **token, const char **type, unsigned *line, unsigned *col); +extern unsigned C_tokenize_int(const char **token, enum TokenClass *type, + unsigned *line, unsigned *col); +extern unsigned C_tokenize(const char **token, const char **type, + unsigned *line, unsigned *col); extern void RAW_escape(FILE *out, const char *token); extern void CSV_escape(FILE *out, const char *token); diff --git a/tools/tokenizer/tokenize.c b/tools/tokenizer/tokenize.c index 7822a30..d63a2fa 100644 --- a/tools/tokenizer/tokenize.c +++ b/tools/tokenizer/tokenize.c @@ -108,7 +108,7 @@ int main(int argc, char *argv[]) char usage_str[80]; const char *token; - const char *type; + enum TokenClass type; unsigned line; unsigned col; unsigned token_len; @@ -316,28 +316,27 @@ fputs( break; } - while ((token_len = C_tokenize(&token, &type, &line, &col))) { + while ((token_len = C_tokenize_int(&token, &type, &line, &col))) { switch (mode) { case RAW: fputs(token, stdout); fputc('\n', stdout); break; case PLAIN: - fprintf(stdout, "(%4u,%3u) %s: %s\n", line, col, type, token); + fprintf(stdout, "(%4u,%3u) %s: %s\n", + line, col, token_class[type], token); break; case CSV: // Escape , " in token // csvkit treats . as null fields even as ".". 
- fprintf(stdout, "%u,%u,%s,", line, col, type); - if (!strcmp(type, "string") || + fprintf(stdout, "%u,%u,%s,", line, col, token_class[type]); + if (type == STRING || // Do we need this too? Yes! - !strcmp(type, "character") && strchr(token, '"') || - !strcmp(type, "character") && strchr(token, ',') || - !strcmp(type, "whitespace") && strchr(token, '\n') || - !strcmp(type, "newline") || - !strcmp(type, "continuation") || - comment_token && (!strcmp(type, "line_comment") || - !strcmp(type, "block_comment"))) + type == CHARACTER && (strchr(token, '"') || strchr(token, ',')) || + type == WHITESPACE && strchr(token, '\n') || + type == NEWLINE || + type == CONTINUATION || + comment_token && (type == LINE_COMMENT || type == BLOCK_COMMENT)) CSV_escape(stdout, token); else if (!strcmp(token, ",")) fputs("\",\"", stdout); @@ -356,11 +355,11 @@ fputs( fprintf(stdout, "{ \"line\": %u, \"column\": %u, " "\"class\": \"%s\", \"length\": %u, \"token\": \"", - line, col, type, token_len); + line, col, token_class[type], token_len); // token value is always a JSON string. - if (!strcmp(type, "string") || !strcmp(type, "character") || - !strcmp(type, "newline") || !strcmp(type, "whitespace") || - !strcmp(type, "continuation")) + if (type == STRING || type == CHARACTER || + type == NEWLINE || type == WHITESPACE || + type == CONTINUATION) JSON_escape(stdout, token); else fputs(token, stdout); @@ -368,12 +367,12 @@ fputs( break; case XML: fprintf(stdout, "", - line, col, type, token_len); - if (!strcmp(type, "string") || - !strcmp(type, "character") || - !strcmp(type, "operator") || - comment_token && (!strcmp(type, "line_comment") || - !strcmp(type, "block_comment"))) + line, col, token_class[type], token_len); + if (type == STRING || + type == CHARACTER || + type == OPERATOR || + comment_token && (type == LINE_COMMENT || + type == BLOCK_COMMENT)) XML_escape(stdout, token); else fputs(token, stdout); From 291ca63d389753fa233752a1ffacb289d317819b Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Sat, 9 Oct 2021 11:39:14 -0400 Subject: [PATCH 19/34] using enum TokenClass instead of string --- tools/tokenizer/tokml.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/tokenizer/tokml.c b/tools/tokenizer/tokml.c index 7fb96d9..c63e618 100644 --- a/tools/tokenizer/tokml.c +++ b/tools/tokenizer/tokml.c @@ -54,7 +54,7 @@ int main(int argc, char *argv[]) char usage_str[80]; const char *token; - const char *type; + enum TokenClass type; unsigned line; unsigned col; unsigned token_len; @@ -171,22 +171,22 @@ fputs( lang_name(source), filename); } - while ((token_len = C_tokenize(&token, &type, &line, &col))) { - if (!strcmp(type, "whitespace")) { + while ((token_len = C_tokenize_int(&token, &type, &line, &col))) { + if (type == WHITESPACE) { fputs(token, stdout); continue; } fprintf(stdout, "<%s line='%u' col='%u' len='%u'>", - type, line, col, token_len); - if (!strcmp(type, "string") - || !strcmp(type, "character") - || !strcmp(type, "operator") - || !strcmp(type, "line_comment") - || !strcmp(type, "block_comment")) + token_class[type], line, col, token_len); + if (type == STRING || + type == CHARACTER || + type == OPERATOR || + type == LINE_COMMENT || + type == BLOCK_COMMENT) XML_escape(stdout, token); else fputs(token, stdout); - fprintf(stdout, "", type); + fprintf(stdout, "", token_class[type]); } if (!continuous_files) { From e8a4f4319c8a1cdde232e7a001a47a993571bd79 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Wed, 13 Oct 2021 13:40:40 -0400 Subject: [PATCH 
20/34] minor updates; more comments --- tools/tokenizer/ntokenize.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/tokenizer/ntokenize.c b/tools/tokenizer/ntokenize.c index 3d1e67e..7adac51 100644 --- a/tools/tokenizer/ntokenize.c +++ b/tools/tokenizer/ntokenize.c @@ -56,8 +56,8 @@ #define ws_RE "[ \t\v\f\n]*" // 96 chars (omitted are e.g.: @ $ `) -// 3 5 67 8 9 9 -// 1234 5 6 7 3 9 9012345678901234567890123 4 56 +// 33 56 67 8 9 9 +// 1234 5 6 7 8 34 90 9012345678901234567890123 4 56 #define basic_char0_RE "[][ \t\v\f\na-zA-Z0-9_{}#()<>%:;.?*+/^&|~!=,\\\"'-]" // all basic chars except \n and > @@ -247,10 +247,10 @@ int normalize_newline(void) char_count++; // counts the carriage return utf8_count++; // No use incrementing column. - return nc; // effectively skip the \r + return nc; // return \n; effectively skipping the \r } - // Mind nc not \n. - if (nc != EOF) ungetc(nc, stdin); + // Mind nc not \n. ungetc(EOF) is Okay. + ungetc(nc, stdin); // cc == '\r'; consider a newline as well, so turn into \n: cc = '\n'; } @@ -265,15 +265,15 @@ int get(void) int cc; restart: // Read a fresh char: - cc = normalize_newline(); + cc = normalize_newline(); // cc != '\r' if (cc == EOF) return EOF; char_count++; if (utf8_start(cc)) utf8_count++; - if (cc == '\n') { + if (cc == '\n') { // a normalized end-of-line (\r|\r?\n) linenr++; column = 0; - return cc; + return cc; // \n here signals a logical end-of-line } // Deal with \ line continuations! @@ -290,8 +290,8 @@ int get(void) // Could again start a line continuation! goto restart; } - // Mind nc not \n. - if (nc != EOF) ungetc(nc, stdin); + // Mind nc not \n. ungetc(EOF) is Okay. + ungetc(nc, stdin); // cc == '\\' a regular backslash } column++; From 29a753fb0f6621b6739c542914e3aaec1fbf999d Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Wed, 13 Oct 2021 22:49:47 -0400 Subject: [PATCH 21/34] add support to include by C++ --- tools/tokenizer/libtoken.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/tokenizer/libtoken.h b/tools/tokenizer/libtoken.h index a6ca3a8..f4fe4dd 100644 --- a/tools/tokenizer/libtoken.h +++ b/tools/tokenizer/libtoken.h @@ -13,6 +13,10 @@ #include #include +#ifdef __cplusplus +extern "C" { +#endif + #define MAX_BUF 8 // maximum lookahead in chars /* Let's assume UTF-8 encoding. @@ -91,4 +95,8 @@ extern void CSV_escape(FILE *out, const char *token); extern void JSON_escape(FILE *out, const char *token); extern void XML_escape(FILE *out, const char *token); +#ifdef __cplusplus +} +#endif + #endif /* LIBTOKEN_H */ From 47f50fcfbf77437c3f089efa740bae3dae1408d0 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Thu, 14 Oct 2021 13:12:59 -0400 Subject: [PATCH 22/34] allow EOF to be recognized as a pseudo token --- tools/tokenizer/libtoken.c | 51 +++++++++++++++++++++++++++----------- tools/tokenizer/libtoken.h | 3 ++- 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/tools/tokenizer/libtoken.c b/tools/tokenizer/libtoken.c index 3191f7e..1695726 100644 --- a/tools/tokenizer/libtoken.c +++ b/tools/tokenizer/libtoken.c @@ -48,7 +48,8 @@ const char *token_class[] = { /*10*/ "whitespace", /*11*/ "newline", /*12*/ "continuation", - /*13*/ "filename" + /*13*/ "filename", + /*14*/ "endoffile" }; /* No longer using perfect hash function but simple binary search. */ @@ -446,12 +447,6 @@ static char *token_buf = 0; static unsigned token_alloc = 0; static unsigned token_len = 0; -// Resets the token buffer cursor. 
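 /* token_buf/token_alloc/token_len implement a growable byte buffer:
    pushing grows the allocation when full (amortized, e.g. by
    doubling), and closing writes a NUL without advancing, so the token
    can keep growing afterwards. The pattern in miniature (sketch only,
    error handling omitted):

      static char *buf; static unsigned len, cap;
      static void push(int c) {
        if (len + 1 >= cap) {            // keep room for the NUL
          cap = cap ? 2*cap : 64;
          buf = realloc(buf, cap);       // assume success
        }
        buf[len++] = (char)c;
      }
 */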
-static void token_buf_reset(void) -{ - token_len = 0; -} - // Makes sure there is room in the token buffer. static void token_buf_room(void) { @@ -496,6 +491,12 @@ static void token_buf_close(void) token_buf[token_len] = '\0'; // Note: no advance } +// Resets the token buffer cursor. +static void token_buf_reset(void) +{ + token_len = 0; +} + /* Tokenization of C++ programming language source text. Recognizes: - identifier @@ -520,13 +521,19 @@ static void token_buf_close(void) Returns 0 upon EOF else the token length in bytes. (There are no 0-length tokens!) + EOF may be interpreted as a token. The function then returns: + token = "", type = endoffile, line and col correctly defined. + + An unexpected EOF is the middle of a token will cause an error message + and the partial token to be output first before a next call returns 0 + (to indicate the EOF condition). */ unsigned C_tokenize_int(const char **token, enum TokenClass *type, unsigned *line, unsigned *col) { int cc; - *type = 0; + *type = ENDOFFILE; do { // infinite loop; after token recognized breaks out. // Start collecting a token. @@ -622,8 +629,10 @@ unsigned C_tokenize_int(const char **token, enum TokenClass *type, break; } - if (cc == EOF) - return 0; + if (cc == EOF) { + token_buf_reset(); + break; + } // Rest of tokens treat line continuations as non-existent: logical_lines = 1; @@ -700,7 +709,14 @@ unsigned C_tokenize_int(const char **token, enum TokenClass *type, "(E): [%s:%u] Unexpected end-of-file in /* comment.\n", filename, *line); unexpect_eof++; - return 0; + if (comment_token) + // Better return partial comment as token and postpone EOF: + *type = BLOCK_COMMENT; + else + token_buf_reset(); + token_buf_close(); + *token = token_buf; + return token_len; } if (comment_token) token_buf_push(nc); @@ -923,7 +939,11 @@ unsigned C_tokenize_int(const char **token, enum TokenClass *type, "(E): [%s:%u] Unexpected end-of-file in string literal.\n", filename, *line); unexpect_eof++; - return 0; + // Better return partial string as token and postpone EOF: + *type = STRING; + token_buf_close(); + *token = token_buf; + return token_len; } token_buf_push(cc); int nc = get(); @@ -969,8 +989,11 @@ unsigned C_tokenize_int(const char **token, enum TokenClass *type, "(E): [%s:%u] Unexpected end-of-file in character literal.\n", filename, linenr); unexpect_eof++; - // Note: partial character literal is lost. - return 0; + // Better return partial character as token and postpone EOF: + *type = CHARACTER; + token_buf_close(); + *token = token_buf; + return token_len; } if (cc == '\n') { // Error! fprintf(stderr, diff --git a/tools/tokenizer/libtoken.h b/tools/tokenizer/libtoken.h index f4fe4dd..eb49c6a 100644 --- a/tools/tokenizer/libtoken.h +++ b/tools/tokenizer/libtoken.h @@ -70,7 +70,8 @@ enum TokenClass { /*10*/ WHITESPACE, /*11*/ NEWLINE, /*12*/ CONTINUATION, - /*13*/ FILENAME + /*13*/ FILENAME, + /*14*/ ENDOFFILE }; extern const char *token_class[]; From e120f937a1cbfb1b2115251619ba9fca291dedf0 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Thu, 14 Oct 2021 13:55:20 -0400 Subject: [PATCH 23/34] consolidated \n and \r cases --- tools/tokenizer/libtoken.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/tools/tokenizer/libtoken.c b/tools/tokenizer/libtoken.c index 1695726..16737c2 100644 --- a/tools/tokenizer/libtoken.c +++ b/tools/tokenizer/libtoken.c @@ -524,7 +524,7 @@ static void token_buf_reset(void) EOF may be interpreted as a token. 
The function then returns: token = "", type = endoffile, line and col correctly defined. - An unexpected EOF is the middle of a token will cause an error message + An unexpected EOF in the middle of a token will cause an error message and the partial token to be output first before a next call returns 0 (to indicate the EOF condition). */ @@ -592,31 +592,18 @@ unsigned C_tokenize_int(const char **token, enum TokenClass *type, // Here: whitespace_token implies token_len > 0 cc = get(); - if (cc == '\n' && newline_token) { + if (cc == '\n' && newline_token || + cc == '\r' && continuation_token) { // Must issue whitespace token if so requested. if (whitespace_token) { // Undo lookahead (unget(EOF) has no effect!): - unget(cc); // next token will be newline + unget(cc); // next token will be newline/continuation *type = WHITESPACE; token_buf_close(); *token = token_buf; return token_len; } - // Issue newline token right away: - goto restart; - } - - if (cc == '\r' && continuation_token) { - // Must issue whitespace token if so requested. - if (whitespace_token) { - // Undo lookahead (unget(EOF) has no effect!): - unget(cc); // next token will be continuation - *type = WHITESPACE; - token_buf_close(); - *token = token_buf; - return token_len; - } - // Issue continuation token right away: + // Issue newline/continuation token right away: goto restart; } } From 8c4ab731a9d0b65b57d8ae3a4ab8af4ae0291467 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Thu, 14 Oct 2021 14:35:16 -0400 Subject: [PATCH 24/34] minor clarifications --- tools/tokenizer/README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/tokenizer/README.md b/tools/tokenizer/README.md index 4c3a8f6..e82940f 100644 --- a/tools/tokenizer/README.md +++ b/tools/tokenizer/README.md @@ -20,6 +20,7 @@ The following lexeme classes are recognized: - single-quoted character literal - all single, double, and triple operator and punctuation symbols - the preprocessor tokens # and ## +- a number of pseudo tokens depending on selected options For each correctly recognized token, the program determines its class/type and the exact coordinates (line number and column) in the input text of @@ -30,8 +31,8 @@ A newline is defined as a single linefeed character `\n`, a carriage return `\r`, or the combination carriage return `\r` followed by linefeed `\n`. Line continuations, i.e., a backslash immediately followed by a newline, are handled at the character input level, so the token recognizers will only see logical -lines. Line and column coordinates reflect positions in the physical line -structure, not the logical one. When requested, logical line endings are +lines. Line and column coordinates however reflect positions in the physical line +structure, not the logical one. When so requested, logical line endings are output as `newline` pseudo tokens and will be represented by a linefeed character. Similarly, when requested, continuations are output as `continuation` pseudo tokens and will be represented by a backslash-escaped @@ -208,9 +209,10 @@ line,column,class,token ``` The operator token `,` is escaped with double quotes, like so `","`. -String tokens are escaped as well and any original double quote is doubled. +String tokens are always escaped and any original double quote is doubled. A newline on its own or as part of whitespace will appear escaped as `\n`. -A whitespace token text will appear inside double quotes. +A whitespace token text will appear inside double quotes. 
A continuation token +will appear as `"\\n"`. ### JSON output From 86f12a0db44cb6cc83dc0dfeaccd74816f40ece6 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Fri, 22 Oct 2021 10:11:41 -0400 Subject: [PATCH 25/34] fixed dataset download link markdown syntax --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f7f2418..02cf02e 100644 --- a/README.md +++ b/README.md @@ -67,8 +67,7 @@ https://dax-cdn.cdn.appdomain.cloud/dax-project-codenet/1.0.0/Project_CodeNet.ta ## Download the dataset -Download the full dataset in our [data repository] -(https://developer.ibm.com/technologies/artificial-intelligence/data/project-codenet/). +Download the full dataset in our [data repository](https://developer.ibm.com/technologies/artificial-intelligence/data/project-codenet/). `tar -zxf Project_CodeNet_full.tar.gz` to uncompress and untar. The directory structure and how the code samples are organized are explained [here](README.md#directory-structure-and-naming-convention). @@ -266,4 +265,4 @@ Whether and to what extent the above steps can successfully be applied to any gi ### Contributors -Ruchir Puri, David S. Kung, Geert Janssen, Wei Zhang, Giacomo Domeniconi, Vladimir Zolotov, Julian Dolby, Jie Chen, Mihir Choudhury, Lindsey Decker, Veronika Thost, Luca Buratti, Saurabh Pujar, Shyam Ramji, Ulrich Finkler, Susan Malaika, Frederick Reiss. \ No newline at end of file +Ruchir Puri, David S. Kung, Geert Janssen, Wei Zhang, Giacomo Domeniconi, Vladimir Zolotov, Julian Dolby, Jie Chen, Mihir Choudhury, Lindsey Decker, Veronika Thost, Luca Buratti, Saurabh Pujar, Shyam Ramji, Ulrich Finkler, Susan Malaika, Frederick Reiss. From 4cb2ca48b5fcc0eb5ad741dd28c6bc52bde4e06c Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Tue, 4 Jan 2022 13:38:26 -0500 Subject: [PATCH 26/34] no warn for backslash ending comment --- tools/tokenizer/pytokenize.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/tools/tokenizer/pytokenize.c b/tools/tokenizer/pytokenize.c index 96dd491..664fdcb 100644 --- a/tools/tokenizer/pytokenize.c +++ b/tools/tokenizer/pytokenize.c @@ -1,7 +1,7 @@ /* Copyright (c) 2021 International Business Machines Corporation Prepared by: Geert Janssen - Tokenizer for Python 3. 
+ Tokenizer for Python 3.x Token classes: - identifier @@ -33,7 +33,7 @@ // Program globals: static unsigned brackets_opened = 0; // unpaired nested ( [ { seen static int prev_was_newline = 1; // no previous token or was newline -static int first_time = 1; +static int first_time = 1; // control add , for JSON and JSONL // Program option settings: static int start_token = 0; // when 1 start filename pseudo-token @@ -51,7 +51,7 @@ static const char *keywords[] = { static const unsigned num_keywords = sizeof(keywords)/sizeof(keywords[0]); -static void emit(const char *s, unsigned line, unsigned col) +static void emit(const char *s, unsigned line, unsigned col) { if (output_layout) { switch (mode) { @@ -100,7 +100,7 @@ static void process_newline(unsigned indent) { emit("NEWLINE", linenr-1, saved_col); - unsigned last_indent = indents_top(); + unsigned last_indent = indents_top(); // maybe 0 if (indent > last_indent) { indents_push(indent); @@ -116,11 +116,12 @@ static void process_newline(unsigned indent) } while (indent < indents_top()); // Here: empty() || indent >= top() if (indent > indents_top() && !nowarn) - fprintf(stderr, "(W): incorrect indentation.\n"); + fprintf(stderr, "(W): Incorrect indentation.\n"); } // else: indent == last_indent: no action } +// cc in [ \t\f] static int process_ws(int cc) { // Collect white-space and compute possible indentation: @@ -288,8 +289,11 @@ static int tokenize(char *token, const char **type, ; // cc == '\n' || cc == '\r' || cc == EOF if (cc == '\r') { - if (!nowarn) - fprintf(stderr, "(W): Comment may not be continued with \\.\n"); + // presumably a \ may occur in a comment as last char before \n + /* + if (!nowarn) + fprintf(stderr, "(W): Comment may not be continued with \\.\n"); + */ // Effectively ignore any \ and terminate logical line: cc == '\n'; } @@ -885,7 +889,7 @@ fputs( case '?': default: - fputs("(F): unknown option. Stop.\n", stderr); + fputs("(F): Unknown option. 
Stop.\n", stderr); fprintf(stderr, usage_str, argv[0]); return 1; } @@ -893,7 +897,7 @@ fputs( if (outfile && outfile[0]) { if (!freopen(outfile, "w", stdout)) { - fprintf(stderr, "(F): cannot open %s for writing.\n", outfile); + fprintf(stderr, "(F): Cannot open %s for writing.\n", outfile); exit(3); } } @@ -905,7 +909,7 @@ fputs( filename = argv[optind]; if (!freopen(filename, "r", stdin)) { if (!nowarn) - fprintf(stderr, "(W): Cannot read file %s.\n", filename); + fprintf(stderr, "(W): Cannot read file %s; skipped.\n", filename); continue; } From 14abc40e0cc6087ac00eea8e187b7940251418d4 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Wed, 19 Jan 2022 12:40:31 -0500 Subject: [PATCH 27/34] fixed bug: did not recognize escaped TAB and NL correctly --- tools/json-graph/src/jsonml2jgf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/json-graph/src/jsonml2jgf.c b/tools/json-graph/src/jsonml2jgf.c index 73a0bf8..c1395ac 100644 --- a/tools/json-graph/src/jsonml2jgf.c +++ b/tools/json-graph/src/jsonml2jgf.c @@ -94,7 +94,7 @@ static void det_coords_adjust_labels(Graph g) for (p = start; p < end; ++p) if (isspace(*p)) n->label->start++; - else if (*p == '\\' && (*p == 'n' || *p == 't')) { + else if (*p == '\\' && (*(p+1) == 'n' || *(p+1) == 't')) { n->label->start += 2; ++p; } From a16474a1b0fd1935f507ff9e31d630098da285a4 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Thu, 20 Jan 2022 13:44:48 -0500 Subject: [PATCH 28/34] added -r option --- tools/tokenizer/tokenize.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/tokenizer/tokenize.c b/tools/tokenizer/tokenize.c index d63a2fa..5bcb4aa 100644 --- a/tools/tokenizer/tokenize.c +++ b/tools/tokenizer/tokenize.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 International Business Machines Corporation +/* Copyright (c) 2021, 2022 International Business Machines Corporation Prepared by: Geert Janssen Simple C/C++ and Java Tokenizer. 
@@ -122,6 +122,7 @@ int main(int argc, char *argv[]) Language source; int explicit_source = 0; int append = 0; + int suppress_newline = 0; sprintf(usage_str, "usage: %%s [ -%s ] [ FILES ]\n", opt_str); @@ -164,6 +165,7 @@ fputs( "-n : output newlines as a special pseudo token.\n" "-N : output line continuations as a special pseudo token.\n" "-o : write output to this file (instead of stdout).\n" +"-r : suppress newline after each token in raw mode.\n" "-s : enable a special start token specifying the filename.\n" "-1 : treat all filename arguments as a continuous single input.\n" "-v : print action summary to stderr.\n" @@ -218,6 +220,10 @@ fputs( outfile = optarg; break; + case 'r': + suppress_newline = 1; + break; + case 's': start_token = 1; break; @@ -320,7 +326,7 @@ fputs( switch (mode) { case RAW: fputs(token, stdout); - fputc('\n', stdout); + if (!suppress_newline) fputc('\n', stdout); break; case PLAIN: fprintf(stdout, "(%4u,%3u) %s: %s\n", From 010b0ef84dd19fb2f2ec5daa46495c299914eaf8 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Thu, 20 Jan 2022 13:45:10 -0500 Subject: [PATCH 29/34] removed unused first_time --- tools/tokenizer/tokml.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/tokenizer/tokml.c b/tools/tokenizer/tokml.c index c63e618..a9abe6d 100644 --- a/tools/tokenizer/tokml.c +++ b/tools/tokenizer/tokml.c @@ -62,7 +62,6 @@ int main(int argc, char *argv[]) int continuous_files = 0; // when 1 do not reset after each file char *outfile = 0; - int first_time = 1; Language source; int explicit_source = 0; int append = 0; @@ -204,7 +203,6 @@ fputs( column = 0; buffered = 0; saved_col = 0; - first_time = 1; } } while (++optind < argc); From f710d5f94db4e634ee0c6dbb014f6264f82836d8 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Sat, 22 Jan 2022 16:01:27 -0500 Subject: [PATCH 30/34] anticipate use of absolute file position --- tools/tokenizer/libtoken.c | 14 ++++++++++---- tools/tokenizer/libtoken.h | 6 +++--- tools/tokenizer/tokenize.c | 7 ++++--- tools/tokenizer/tokenize.py | 10 +++++++--- tools/tokenizer/tokml.c | 11 ++++++----- 5 files changed, 30 insertions(+), 18 deletions(-) diff --git a/tools/tokenizer/libtoken.c b/tools/tokenizer/libtoken.c index 16737c2..127af79 100644 --- a/tools/tokenizer/libtoken.c +++ b/tools/tokenizer/libtoken.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 International Business Machines Corporation +/* Copyright (c) 2021, 2022 International Business Machines Corporation Prepared by: Geert Janssen Code functionality shared by all tokenizers. @@ -530,7 +530,7 @@ static void token_buf_reset(void) */ unsigned C_tokenize_int(const char **token, enum TokenClass *type, - unsigned *line, unsigned *col) + unsigned *line, unsigned *col, unsigned *pos) { int cc; *type = ENDOFFILE; @@ -540,6 +540,7 @@ unsigned C_tokenize_int(const char **token, enum TokenClass *type, token_buf_reset(); *line = linenr; *col = column; + *pos = char_count; // white-space tokens see continuation lines: logical_lines = 0; cc = get(); @@ -627,6 +628,7 @@ unsigned C_tokenize_int(const char **token, enum TokenClass *type, // If white-space skipped must reset coordinates: *line = linenr; *col = column-1; + *pos = char_count-1; /*** OPTIONAL # LINE COMMENT (to ignore preprocessor statements) ***/ // Java: no preprocessor directives. 
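 /* Invariant intended for the new `pos` out-parameter: it is the
    absolute byte offset of the token's first character, snapshotted
    from char_count (the -1 variants compensate for one character of
    lookahead already consumed). A caller-side sketch:

      const char *token; enum TokenClass type;
      unsigned line, col, pos, len;
      while ((len = C_tokenize_int(&token, &type, &line, &col, &pos)))
        ;  // token occupies input bytes [pos, pos + len)
 */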
@@ -650,6 +652,7 @@ unsigned C_tokenize_int(const char **token, enum TokenClass *type, } *line = linenr-1; *col = saved_col; + *pos = char_count; goto restart; } @@ -676,6 +679,7 @@ unsigned C_tokenize_int(const char **token, enum TokenClass *type, } *line = linenr-1; *col = saved_col; + *pos = char_count; goto restart; } @@ -716,6 +720,7 @@ unsigned C_tokenize_int(const char **token, enum TokenClass *type, } *line = linenr; *col = column; + *pos = char_count; cc = get(); goto restart; } @@ -727,6 +732,7 @@ unsigned C_tokenize_int(const char **token, enum TokenClass *type, // If white-space and/or comments skipped must reset coordinates: *line = linenr; *col = column-1; + *pos = char_count-1; /*** CHAR and STRING PREFIX (C/C++) ***/ @@ -1173,10 +1179,10 @@ unsigned C_tokenize_int(const char **token, enum TokenClass *type, } unsigned C_tokenize(const char **token, const char **type, - unsigned *line, unsigned *col) + unsigned *line, unsigned *col, unsigned *pos) { enum TokenClass typeid; - unsigned result = C_tokenize_int(token, &typeid, line, col); + unsigned result = C_tokenize_int(token, &typeid, line, col, pos); *type = token_class[typeid]; return result; } diff --git a/tools/tokenizer/libtoken.h b/tools/tokenizer/libtoken.h index eb49c6a..0af2491 100644 --- a/tools/tokenizer/libtoken.h +++ b/tools/tokenizer/libtoken.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 International Business Machines Corporation +/* Copyright (c) 2021, 2022 International Business Machines Corporation Prepared by: Geert Janssen Code functionality shared by all tokenizers. @@ -87,9 +87,9 @@ extern const char *lang_name(Language lang); extern int open_as_stdin(const char *file); extern unsigned C_tokenize_int(const char **token, enum TokenClass *type, - unsigned *line, unsigned *col); + unsigned *line, unsigned *col, unsigned *pos); extern unsigned C_tokenize(const char **token, const char **type, - unsigned *line, unsigned *col); + unsigned *line, unsigned *col, unsigned *pos); extern void RAW_escape(FILE *out, const char *token); extern void CSV_escape(FILE *out, const char *token); diff --git a/tools/tokenizer/tokenize.c b/tools/tokenizer/tokenize.c index 5bcb4aa..559ed80 100644 --- a/tools/tokenizer/tokenize.c +++ b/tools/tokenizer/tokenize.c @@ -111,6 +111,7 @@ int main(int argc, char *argv[]) enum TokenClass type; unsigned line; unsigned col; + unsigned pos; unsigned token_len; unsigned num_files = 0; // number of files read int start_token = 0; // when 1 start filename pseudo-token @@ -322,15 +323,15 @@ fputs( break; } - while ((token_len = C_tokenize_int(&token, &type, &line, &col))) { + while ((token_len = C_tokenize_int(&token, &type, &line, &col, &pos))) { switch (mode) { case RAW: fputs(token, stdout); if (!suppress_newline) fputc('\n', stdout); break; case PLAIN: - fprintf(stdout, "(%4u,%3u) %s: %s\n", - line, col, token_class[type], token); + fprintf(stdout, "(%4u,%3u;%6u:%3u) %s: %s\n", + line, col, pos, token_len, token_class[type], token); break; case CSV: // Escape , " in token diff --git a/tools/tokenizer/tokenize.py b/tools/tokenizer/tokenize.py index e848a51..f5a162c 100755 --- a/tools/tokenizer/tokenize.py +++ b/tools/tokenizer/tokenize.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright IBM Corporation 2021 +# Copyright IBM Corporation 2021, 2022 # Written by Geert Janssen # Simple ctypes-based Python wrapper of libtoken.so @@ -17,6 +17,7 @@ libtoken.C_tokenize.argtypes = (POINTER(c_char_p), POINTER(c_char_p), POINTER(c_uint), + POINTER(c_uint), POINTER(c_uint)) libtoken.open_as_stdin.argtypes 
= (c_char_p,) @@ -25,16 +26,19 @@ _kind = c_char_p() _linenr = c_uint() _column = c_uint() +_pos = c_uint() # Token generator: def token(): - global _token, _kind, _linenr, _column + global _token, _kind, _linenr, _column, _pos # C_tokenize returns 0 upon end-of-file. - while int(libtoken.C_tokenize(byref(_token), byref(_kind), byref(_linenr), byref(_column))): + while int(libtoken.C_tokenize(byref(_token), byref(_kind), byref(_linenr), + byref(_column), byref(_pos))): # Turn ctypes into real Python values: lin = _linenr.value col = _column.value + pos = _pos.value # not used for now clas = _kind.value.decode() text = _token.value.decode() yield (lin,col,clas,text) diff --git a/tools/tokenizer/tokml.c b/tools/tokenizer/tokml.c index a9abe6d..3fdc6aa 100644 --- a/tools/tokenizer/tokml.c +++ b/tools/tokenizer/tokml.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 International Business Machines Corporation +/* Copyright (c) 2021, 2022 International Business Machines Corporation Prepared by: Geert Janssen Tokenizer for C, C++ and Java with output as annotated XML, @@ -19,10 +19,10 @@ The characters <, >, and & will be replaced by the special XML entities <, > and & respectively. - To undo the XML annotation use either: + To undo the XML annotation in .xml use either: (this will also correctly revert the XML entities) - xmlstarlet sel -T -t -v 'source' libtoken.xml - xidel -s -e 'source' + xmlstarlet sel -T -t -v 'source' .xml, or + xidel -s -e 'source' .xml Useful xpath queries: (the results show all occurrences and these are not necessarily unique) @@ -57,6 +57,7 @@ int main(int argc, char *argv[]) enum TokenClass type; unsigned line; unsigned col; + unsigned pos; unsigned token_len; unsigned num_files = 0; // number of files read int continuous_files = 0; // when 1 do not reset after each file @@ -170,7 +171,7 @@ fputs( lang_name(source), filename); } - while ((token_len = C_tokenize_int(&token, &type, &line, &col))) { + while ((token_len = C_tokenize_int(&token, &type, &line, &col, &pos))) { if (type == WHITESPACE) { fputs(token, stdout); continue; From dd86a1e739df24ad92424a2065cd172a184ade00 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Sat, 22 Jan 2022 16:44:11 -0500 Subject: [PATCH 31/34] must adjust char_count when unget --- tools/tokenizer/libtoken.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/tokenizer/libtoken.c b/tools/tokenizer/libtoken.c index 127af79..9a0eb19 100644 --- a/tools/tokenizer/libtoken.c +++ b/tools/tokenizer/libtoken.c @@ -327,6 +327,7 @@ int get(void) // Get the next character: if (buffered) { // chars available in lookahead buffer cc = buffer[--buffered]; // never EOF + char_count++; // cc maybe '\r' (line continuation); act like '\n': if (cc == '\n' || cc == '\r') { linenr++; @@ -395,6 +396,7 @@ void unget(int cc) } else column--; + char_count--; buffer[buffered++] = cc; } else { From eb8440d8d3a3e4833289b8a8f36cbf69413cf6c6 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Wed, 26 Jan 2022 11:44:05 -0500 Subject: [PATCH 32/34] for queries no need for conversion to XML --- doc/srcml.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/srcml.md b/doc/srcml.md index cf462ec..3002501 100644 --- a/doc/srcml.md +++ b/doc/srcml.md @@ -179,23 +179,25 @@ TRANSFORMATIONS: For instance, the XML markup can be enhanced with line and column coordinates. Notice also that srcML has built-in capabilities to query and manipulate the XML. Queries can be done with XPath expressions. General transformations can -be executed with XSLT. 
+be executed with XSLT. By the way, there is no need to first convert the +source to XML; the built-in queries capabilities work also directly off the +source code. Here are a few examples of useful operations: - Get all function and method definition names: ```console -$ srcml --xpath="//src:function/src:name" program.xml +$ srcml --xpath="//src:function/src:name" program.java ``` - Count the number of conditions: ```console -$ srcml --xpath='count(//src:condition)' program.xml +$ srcml --xpath='count(//src:condition)' program.c ``` - Output all line comments: ```console -$ srcml --xpath='//src:comment[@type="line"]' program.xml +$ srcml --xpath='//src:comment[@type="line"]' program.cpp ``` Much more versatile and powerful tools to process any XML are From 79660f34e6bd8d594161d336df5bf60387d3d60f Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Wed, 23 Feb 2022 18:56:54 -0500 Subject: [PATCH 33/34] did not detect private as keyword; fixed --- tools/tokenizer/libtoken.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/tokenizer/libtoken.c b/tools/tokenizer/libtoken.c index 9a0eb19..3ff942f 100644 --- a/tools/tokenizer/libtoken.c +++ b/tools/tokenizer/libtoken.c @@ -84,7 +84,7 @@ static const char *CPP_keywords[] = { "friend", "goto", "if", "inline", "int", "long", "mutable", "namespace", "new", "noexcept", "not", "not_eq", "nullptr", "operator", "or", - "or_eq" "private", "protected", "public", "register", + "or_eq", "private", "protected", "public", "register", "reinterpret_cast", "return", "short", "signed", "sizeof", "static", "static_assert", "static_cast", "struct", "switch", "template", "this", "thread_local", "throw", "true", @@ -107,7 +107,7 @@ static const char *CPP_keywords[] = { "float", "for", "friend", "goto", "if", "inline", "int", "long", "mutable", "namespace", "new", "noexcept", "not", "not_eq", "nullptr", - "operator", "or", "or_eq" "private", "protected", + "operator", "or", "or_eq", "private", "protected", "public", "register", "reinterpret_cast", "requires","return", "short", "signed", "sizeof", "static", "static_assert", "static_cast", "struct", "switch", "template", "this", From d0959cc7b7951aabb656a7b02bd3670fbe8f23c9 Mon Sep 17 00:00:00 2001 From: Geert Janssen Date: Tue, 8 Mar 2022 14:08:30 -0500 Subject: [PATCH 34/34] added escape of real TAB for JSON mode --- tools/tokenizer/libtoken.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/tokenizer/libtoken.c b/tools/tokenizer/libtoken.c index 3ff942f..2fa4563 100644 --- a/tools/tokenizer/libtoken.c +++ b/tools/tokenizer/libtoken.c @@ -1232,6 +1232,11 @@ void JSON_escape(FILE *out, const char *token) fputs("\\n", out); continue; } + if (*p == '\t') { // escape embedded real TABs + fputs("\\t", out); + continue; + } + // FIXME: control characters from U+0000 through U+001F must be escaped if (*p == '\\' || *p == '"') fputc('\\', out); fputc(*p, out);