tools/generate_uri_parser_tables.dart

// Copyright (c) 2024, the Dart project authors.  Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.

// ----------------------------------------------------------------------
// Code to create the URI scanner table used by `uri.dart`.
//
// This file exists in case someone, some day, will want to change the
// representation of the tables, maybe if Dart gets `Uint8List` literals.
// It should not otherwise be necessary to re-generate the tables.
//
// The table is stored in the `uri.dart` file as a 1-byte string literal.
// This script generates the string literal and prints it on stdout.
// If passed the `-u filename` flag, it instead updates the file directly.
// The file should be the `sdk/lib/core/uri.dart` file, which contains markers
// showing where to insert the generated code.

import "dart:convert" show LineSplitter;
import "dart:io";
import "dart:typed_data";

// Indices in the position array, where transitions write
// their current position.
//
// Index 0 is deliberately not assigned here; the remaining indices must fit
// in the bits of a transition entry left over above `_stateBits`.

/// Index of the position of the `:` after a scheme.
const int _schemeEndIndex = 1;

/// Index of the position of the character just before the host name.
const int _hostStartIndex = 2;

/// Index of the position of the `:` before a port value.
const int _portStartIndex = 3;

/// Index of the position of the first character of a path.
const int _pathStartIndex = 4;

/// Index of the position of the `?` before a query.
const int _queryStartIndex = 5;

/// Index of the position of the `#` before a fragment.
const int _fragmentStartIndex = 6;

/// Index of a position where the URI was determined to be "non-simple".
const int _notSimpleIndex = 7;

// Significant states and state related numbers.

/// Initial state for scanner.
const int _uriStart = 0;

/// If scanning of a URI terminates in this state or above,
/// consider the URI non-simple.
const int _nonSimpleEndStates = 14;

/// Initial state for scheme validation.
const int _schemeStart = 20;

/// Number of states total.
const int _stateCount = 22;

/// Number of bits used to store a state.
///
/// Satisfies `1 << _stateBits >= _stateCount`.
/// Also used as shift for extra information in the transition table.
const int _stateBits = 5;

/// Mask of low `_stateBits` bits, to extract state from transition table entry.
const int _stateMask = (1 << _stateBits) - 1;

// Table structure constants.
//
// The table contains entries only for characters in the range U+0020 to U+007F.
// The input characters are permuted to make the lookup easy.

/// Input characters are xor'ed with this value.
///
/// That puts the range 0x20-0x7f into the range 0x00-0x5F
/// (0x60-0x7f becomes 0x00-0x1f, 0x40-0x5f becomes 0x20-0x3f and
/// 0x20-0x3f becomes 0x40-0x5f),
/// which is easily usable as an index into a table of length 0x60,
/// and checking if the value was originally in the range 0x20-0x7f can
/// be done by a single `<= 0x5f` (since the value is a string character unit,
/// which is known to be non-negative).
const int _charXor = 0x60;

/// Limit of valid characters after xor'ing with the above value.
///
/// This is an inclusive limit: 0x5f is itself a valid table index.
const int _xorCharLimit = 0x5f;

/// Generates the URI parser and character set tables.
///
/// Without a leading `-u` argument, both generated texts are printed to
/// stdout.
///
/// With `-u`, the marked sections of a file are replaced by the generated
/// texts instead. The file path is taken from the rest of the flag
/// (`-upath`), else from the following argument (`-u path`), else it
/// defaults to `sdk/lib/core/uri.dart`, which is correct when run from the
/// root of the SDK.
///
/// Exits with code 1 if the file does not exist or if either marked section
/// cannot be found. The file is only written if its contents would change.
void main(List<String> args) {
  var parserTableText = _createParserTableText();
  var charsetTableText = _createCharacterSetText();
  if (args.isEmpty || !args.first.startsWith("-u")) {
    print(parserTableText);
    print(charsetTableText);
    return;
  }
  var arg = args.first;
  // Default file location, if run from root of SDK.
  var filePath = "sdk/lib/core/uri.dart";
  if (arg.length > 2) {
    filePath = arg.substring(2);
  } else if (args.length > 1) {
    filePath = args[1];
  }
  var file = File(filePath);
  if (!file.existsSync()) {
    stderr.writeln("Cannot find file: $filePath");
    exit(1);
  }
  var contents = file.readAsStringSync();

  // Replaces the text between the `start` and `end` marker lines for
  // [marker] in [text] with [replacement].
  //
  // The markers are always located in the text that is being edited, so the
  // offsets stay valid even after an earlier replacement changed the length
  // of the text.
  String replaceMarkedSection(String text, String marker, String replacement) {
    // Each match covers one marker line up to the start of the next line.
    // The marker names contain no RegExp meta-characters.
    var pattern = RegExp(
      "^// --- $marker --- (start|end) --- [^]*?^",
      multiLine: true,
    );
    var matches = pattern.allMatches(text).toList();
    if (matches.length != 2) {
      stderr.writeln("Cannot find marked section in file $filePath");
      exit(1);
    }
    return text.replaceRange(
      matches.first.end,
      matches.last.start,
      replacement,
    );
  }

  // Replace marked range for parser tables.
  var newContents =
      replaceMarkedSection(contents, "URI PARSER TABLE", parserTableText);
  // Replace marked range for character sets.
  newContents =
      replaceMarkedSection(newContents, "URI CHARSET TABLE", charsetTableText);

  if (newContents != contents) {
    file.writeAsStringSync(newContents);
    print("$filePath updated.");
  } else {
    stderr.writeln("No update needed.");
  }
}

/// Creates the generated source text for the URI scanner.
///
/// The text consists of [generatedHeader], declarations repeating the index
/// and state constants of this file, the `_scannerTables` string literal
/// holding the tables from [_createTables] concatenated in state order, and
/// the source of a `_scan` function which reads that table.
///
/// Everything inside the template below is emitted verbatim (after
/// interpolation), so its comments are part of the generated output.
String _createParserTableText() {
  var tables = _createTables();
  var literalBuilder = StringLiteralBuilder("_scannerTables");
  // Every table entry is written as a `\xHH` escape, whether or not the
  // byte is a printable character.
  for (var table in tables) {
    literalBuilder.writeBytes(table, hexAll: true);
  }
  var tableString = literalBuilder.close();

  var result = """
$generatedHeader

// --------------------------------------------------------------------
// Constants used to read the scanner result.
// The indices points into the table filled by [_scan] which contains
// recognized positions in the scanned URI.
// The `0` index is only used internally.

/// Index of the position of that `:` after a scheme.
const int _schemeEndIndex = $_schemeEndIndex;

/// Index of the position of the character just before the host name.
const int _hostStartIndex = $_hostStartIndex;

/// Index of the position of the `:` before a port value.
const int _portStartIndex = $_portStartIndex;

/// Index of the position of the first character of a path.
const int _pathStartIndex = $_pathStartIndex;

/// Index of the position of the `?` before a query.
const int _queryStartIndex = $_queryStartIndex;

/// Index of the position of the `#` before a fragment.
const int _fragmentStartIndex = $_fragmentStartIndex;

/// Index of a position where the URI was determined to be "non-simple".
const int _notSimpleIndex = $_notSimpleIndex;

/// Initial state for scanner.
const int _uriStart = $_uriStart;

/// If scanning of a URI terminates in this state or above,
/// consider the URI non-simple
const int _nonSimpleEndStates = $_nonSimpleEndStates;

/// Initial state for scheme validation.
const int _schemeStart = $_schemeStart;

// --------------------------------------------------------------------
/// Transition tables are used to scan a URI to determine its structure.
///
/// The tables represent a state machine with output.
///
/// To scan the URI, start in the [_uriStart] state, then read each character
/// of the URI in order, from start to end, and for each character perform a
/// transition to a new state while writing the current position
/// into the output buffer at a designated index.
///
/// Each state, represented by an integer which is an index into
/// [_scannerTables], has a set of transitions, one for each character.
/// The transitions are encoded as a 5-bit integer representing the next state
/// and a 3-bit index into the output table.
///
/// For URI scanning, only characters in the range U+0020 through U+007E are
/// interesting; all characters outside that range are treated the same.
/// The tables only contain 96 entries, representing the 95 characters in the
/// interesting range, and one entry for all values outside the range.
/// The character entries are stored in one `String` of 96 characters per state,
/// with the transition for a character at position `character ^ 0x60`,
/// which maps the range U+0020 .. U+007F into positions 0 .. 95.
/// All remaining characters are mapped to position 0x1f (`0x7f ^ 0x60`), which
/// represents the transition for all remaining characters.
$tableString
// --------------------------------------------------------------------
/// Scan a string using the [_scannerTables] state machine.
///
/// Scans [uri] from [start] to [end], starting in state [state] and
/// writing output into [indices].
///
/// Returns the final state. If that state is greater than or equal to
/// [_nonSimpleEndStates], the general URI scan should consider the
/// result non-simple, even if no position has been written to
/// [_notSimpleIndex] of [indices].
int _scan(String uri, int start, int end, int state, List<int> indices) {
  // Number of characters in table for each state (range 0x20..0x60).
  const int stateTableSize = 0x60;
  // Value to xor input character with to make valid range start at zero.
  const int _charXor = $_charXor;
  // Limit on valid values after doing xor.
  const int _xorCharLimit = $_xorCharLimit;
  // Entry used for invalid input characters (not in the range 0x20-0x7f).
  const int _invalidChar = 0x7F ^ _charXor;
  // Shift to extract write position from transition table entry.
  const int _writeIndexShift = $_stateBits;
  // Mask for state part of transition table entry.
  const int _stateMask = $_stateMask;

  assert(end <= uri.length);
  for (int i = start; i < end; i++) {
    int char = uri.codeUnitAt(i) ^ _charXor;
    if (char > _xorCharLimit) char = _invalidChar;
    int transition = _scannerTables.codeUnitAt(state * stateTableSize + char);
    state = transition & _stateMask;
    indices[transition >> _writeIndexShift] = i;
  }
  return state;
}
""";
  return result;
}

/// Creates the generated source text for the URI character set tables.
///
/// The text consists of [generatedHeader], one documented `const` mask
/// declaration per character set, and a `_charTables` string literal with
/// one entry for each of the 128 ASCII code points. The entry for a code
/// point has the bit of a mask set if the character is in that set.
///
/// Character sets with identical specifications share a bit: the later
/// declaration is emitted as an alias of the earlier mask.
///
/// Throws a [FormatException] if a character set specification is
/// malformed, and a [StateError] if there are more distinct character sets
/// than bits in a table entry.
String _createCharacterSetText() {
  var bits = Uint16List(128);
  var nextBit = 1;
  // Maps each character specification already emitted to the name it was
  // first emitted under.
  var seen = <String, String>{};
  var buffer = StringBuffer(generatedHeader);
  buffer.writeln();

  // Generates a documented entry for `${name}Mask` and adds the `chars`
  // to the `bits` table.
  // The chars can use `-` for a range of characters, and `\` for
  // the next character being verbatim (to escape `-` and `\`).
  // The characters must be ASCII and in strictly increasing order.
  void tableEntry(String name, String chars, String doc) {
    buffer.writeln();
    for (var line in LineSplitter.split(doc)) {
      if (line.isEmpty) {
        buffer.writeln("//");
      } else {
        buffer
          ..write('// ')
          ..writeln(line);
      }
    }
    buffer
      ..write('const ')
      ..write(name)
      ..write('Mask = ');
    if (seen[chars] case var existingName?) {
      buffer
        ..write(existingName)
        ..write('Mask');
    } else {
      seen[chars] = name;
      var bit = nextBit;
      // The table entries are 16-bit values. A larger bit would be silently
      // truncated when stored, so fail loudly instead.
      if (bit > 0xFFFF) {
        throw StateError("Too many distinct character sets for 16-bit table");
      }
      nextBit *= 2;
      // Previous char emitted. Used to test that strings are ordered,
      // and as start for writing ranges.
      var prevChar = -1;
      for (var i = 0; i < chars.length; i++) {
        var char = chars.codeUnitAt(i);
        int? rangeStart;
        const charDash = 0x2D; // `-` character.
        const charBackslash = 0x5C; // `\` character.
        if (char == charDash) {
          // An unescaped `-` continues a range from the previous character,
          // so it needs both a preceding start and a following end.
          if (prevChar < 0 || i + 1 >= chars.length) {
            throw FormatException("Incomplete range", chars, i);
          }
          char = chars.codeUnitAt(++i);
          rangeStart = prevChar + 1;
        }
        if (char == charBackslash) {
          // An escape needs a following character to escape.
          if (i + 1 >= chars.length) {
            throw FormatException("Dangling escape", chars, i);
          }
          char = chars.codeUnitAt(++i);
        }
        if (char >= bits.length) {
          throw FormatException("Not an ASCII character", chars, i);
        }
        if (char <= prevChar) throw FormatException("Not sorted", chars, i);
        for (var c = rangeStart ?? char; c <= char; c++) {
          bits[c] |= bit;
        }
        prevChar = char;
      }
      // Pad the mask to four hex digits.
      var hexDigits = bit.toRadixString(16);
      const zeroPrefix = ['0x', '0x0', '0x00', '0x000'];
      buffer
        ..write(zeroPrefix[4 - hexDigits.length])
        ..write(hexDigits);
    }
    buffer.writeln(';');
  }

  tableEntry("_unreserved", r"\-.0-9A-Z_a-z~", r"""
The unreserved characters of RFC 3986.
[A-Za-z0-9\-._~]
""");
  tableEntry("_unreserved2396", r"!'()*\-.0-9A-Z_a-z~", r"""
The unreserved characters of RFC 2396.
[A-Za-z0-9!'()*\-._~]
""");
  tableEntry("_encodeFull", r"!#$&'()*+,\-./0-9:;=?@A-Z_a-z~", r"""
Table of reserved characters specified by ECMAScript 5.
[A-Za-z0-9!#$&'()*+,\-./:;=?_~]
""");
  tableEntry("_scheme", r"+\-.0-9A-Za-z", r"""
Characters allowed in the scheme.
[A-Za-z0-9+\-.]
""");
  tableEntry("_userinfo", r"!$&'()*+,\-.0-9:;=A-Z_a-z~", r"""
Characters allowed in the userinfo as of RFC 3986.
RFC 3986 Appendix A
userinfo = *( unreserved / pct-encoded / sub-delims / ':')
[A-Za-z0-9!$&'()*+,\-.:;=_~] (without '%')
""");
  tableEntry("_regName", r"!$%&'()*+,\-.0-9;=A-Z_a-z~", r"""
Characters allowed in the reg-name as of RFC 3986.
RFC 3986 Appendix A
reg-name = *( unreserved / pct-encoded / sub-delims )
Same as `_userinfoMask` without the `:`.
// [A-Za-z0-9!$%&'()*+,\-.;=_~] (including '%')
""");
  tableEntry("_pathChar", r"!$&'()*+,\-.0-9:;=@A-Z_a-z~", r"""
Characters allowed in the path as of RFC 3986.
RFC 3986 section 3.3.
pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
[A-Za-z0-9!$&'()*+,\-.:;=@_~] (without '%')
""");
  tableEntry("_pathCharOrSlash", r"!$&'()*+,\-./0-9:;=@A-Z_a-z~", r"""
Characters allowed in the path as of RFC 3986.
RFC 3986 section 3.3 *and* slash.
[A-Za-z0-9!$&'()*+,\-./:;=@_~] (without '%')
""");
  tableEntry("_queryChar", r"!$&'()*+,\-./0-9:;=?@A-Z_a-z~", r"""
Characters allowed in the query as of RFC 3986.
RFC 3986 section 3.4.
query = *( pchar / "/" / "?" )
[A-Za-z0-9!$&'()*+,\-./:;=?@_~] (without '%')
""");
  tableEntry("_zoneID", r"\-.0-9A-Z_a-z~", r"""
Characters allowed in the ZoneID as of RFC 6874.
ZoneID = 1*( unreserved / pct-encoded )
[A-Za-z0-9\-._~] + '%'
""");
  tableEntry("_tokenChar", r"!$&'*+\-.0-9A-Z^_`a-z{|}~", r"""
Table of the `token` characters of RFC 2045 in a `data:` URI.

A token is any US-ASCII character except SPACE, control characters and
`tspecial` characters. The `tspecial` category is:
'(', ')', '<', '>', '@', ',', ';', ':', '\', '"', '/', '[, ']', '?', '='.

In a data URI, we also need to escape '%' and '#' characters.
""");
  tableEntry("_uric", r"!$&'()*+,\-./0-9:;=?@A-Z_a-z~", r"""
All non-escape RFC-2396 "uric" characters.

The "uric" character set is defined by:
```
 uric        =  reserved | unreserved | escaped
 reserved    =  ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ","
 unreserved  =  alphanum | mark
 mark        =  "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
```
This is the same characters as in a URI query (which is URI pchar plus '?')
""");
  tableEntry("_genDelimiters", r"#/:?@[]", r"""
General delimiter characters, RFC 3986 section 2.2.
gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
[:/?#[]@]
""");
  tableEntry("_ipvFutureAddressChars", r"!$&'()*+,\-.0-9:;=A-Z_a-z~", r"""
Characters valid in an IPvFuture address, RFC 3986 section 3.2.2.
1*( unreserved / sub-delims / ":" )
[A-Za-z0-9\-._~]|[!$&'()*+,;=]|:
""");

  // Every table entry is written as an escape, never as a plain character.
  var table = (StringLiteralBuilder('_charTables')
        ..writeChars(bits, hexAll: true))
      .close();
  buffer
    ..writeln()
    ..write(table);

  return buffer.toString();
}

/// Comment placed at the start of each generated block of code.
///
/// Tells readers of the generated code which tool produced it.
/// The text has no trailing newline.
const String generatedHeader = """
// Use tools/generate_uri_parser_tables.dart to generate this code
// if necessary.""";

/// Creates a literal of the form
/// ```dart
/// const String someName = "ab\x82azx......"
///     "more bytes and escapes \xff        "
///     "....";
/// ```
/// while escaping non-printable characters, `"`, `$` and `\`,
/// and trying to fit as many characters on each line as possible.
///
/// Not optimized for speed or memory consumption. Assumed to be run
/// rarely and offline.
class StringLiteralBuilder {
  /// Maximal number of characters on a line before its closing quote.
  static const int _maxLineContent = 79;

  /// The source text written so far.
  final buffer = StringBuffer();

  /// Indentation of continuation lines, four more than the declaration.
  String indent;

  /// Number of characters written so far on the current line.
  var lineLength = 0;

  /// Starts a declaration of a `const String` named [name].
  ///
  /// The declaration is indented by [indent] spaces.
  StringLiteralBuilder(String name, {int indent = 0})
      : indent = " " * (indent + 4) {
    // Inside the body, `indent` is the integer parameter, not the field.
    if (indent > 0) buffer.write(" " * indent);
    buffer
      ..write("const String ")
      ..write(name)
      ..write(" = \"");
    lineLength = buffer.length;
  }

  /// Appends each of [bytes] to the literal.
  ///
  /// If [hexAll] is true, every byte is written as an escape,
  /// otherwise only the ones that need it, see [charString].
  void writeBytes(Uint8List bytes, {bool hexAll = false}) {
    _writeCodeUnits(bytes, hexAll);
  }

  /// Appends each of [chars] to the literal.
  ///
  /// If [hexAll] is true, every character is written as an escape,
  /// otherwise only the ones that need it, see [charString].
  void writeChars(Uint16List chars, {bool hexAll = false}) {
    _writeCodeUnits(chars, hexAll);
  }

  /// Shared implementation of [writeBytes] and [writeChars].
  ///
  /// Starts a new adjacent string literal on a continuation line whenever
  /// the next character would not fit on the current line. An escape
  /// is never split across lines.
  void _writeCodeUnits(List<int> codeUnits, bool hexAll) {
    for (var codeUnit in codeUnits) {
      var string = hexAll ? hex(codeUnit) : charString(codeUnit);
      lineLength += string.length;
      if (lineLength > _maxLineContent) {
        buffer
          ..write('"\n')
          ..write(indent)
          ..write('"');
        lineLength = indent.length + 1 + string.length;
      }
      buffer.write(string);
    }
  }

  /// Terminates the string literal.
  ///
  /// Do not use the builder after calling close.
  String close() {
    if (lineLength < 78) {
      buffer.write("\";\n");
    } else {
      // Put the semicolon on a line of its own rather than risk a long line.
      buffer
        ..write("\"\n")
        ..write(indent)
        ..write(";\n");
    }
    return buffer.toString();
  }

  /// The source representation of [char] inside a double-quoted string.
  ///
  /// Uses a short escape where one exists, a [hex] escape for remaining
  /// control characters and for characters above U+00FF, and the character
  /// itself otherwise.
  static String charString(int char) {
    // Recognized characters that need escaping, or has a short escape.
    switch (char) {
      case 0x08:
        return r"\b";
      case 0x09:
        return r"\t";
      case 0x0a:
        return r"\n";
      case 0x0b:
        return r"\v";
      case 0x0c:
        return r"\f";
      case 0x0d:
        return r"\r";
      case 0x22:
        return r'\"';
      case 0x5c:
        return r"\\";
      case 0x24:
        return r"\$";
    }
    // All control characters, all non-one-byte-string chars.
    if (char > 0xFF || char & 0x60 == 0 || char == 0x7F) {
      // `char & 0x60 == 0` holds for 0x00 - 0x1F and 0x80 - 0x9F.
      // The remaining cases are 0x7F and everything above 0xFF.
      return hex(char);
    }
    return String.fromCharCode(char);
  }

  /// An escape sequence denoting [char].
  ///
  /// Uses `\xHH` up to 0xFF, `\uHHHH` up to 0xFFFF and `\u{H...}` above.
  static String hex(int char) {
    const digits = "0123456789ABCDEF";
    if (char <= 0xFF) {
      return "\\x${digits[char >> 4]}${digits[char & 0xf]}";
    }
    // Don't try to be clever.
    if (char <= 0xFFFF) {
      return "\\u${char.toRadixString(16).padLeft(4, "0")}";
    }
    // A plain `\u` escape reads exactly four digits, so longer values
    // must use the braced form to not be misread.
    return "\\u{${char.toRadixString(16)}}";
  }
}

/// Creates the tables for `_scannerTables` used by [Uri.parse].
///
/// See `_scannerTables` in `sdk/lib/core/uri.dart` for the generated format.
///
/// The concrete tables are chosen as a trade-off between the number of states
/// needed and the precision of the result.
/// This allows definitely recognizing the general structure of the URI
/// (presence and location of scheme, user-info, host, port, path, query and
/// fragment) while at the same time detecting that some components are not
/// in canonical form (anything containing a `%`, a host-name containing a
/// capital letter). Since the scanner doesn't know whether something is a
/// scheme or a path until it sees `:`, or user-info or host until it sees
/// a `@`, a second pass is needed to validate the scheme and any user-info
/// is considered non-canonical by default.
///
/// The states (starting from [_uriStart]) write positions while scanning
/// a string from `start` to `end` as follows:
///
/// - [_schemeEndIndex]: Should be initialized to `start-1`.
///   If the URI has a scheme, it is set to the position of the `:` after
///   the scheme.
/// - [_hostStartIndex]: Should be initialized to `start - 1`.
///   If the URI has an authority, it is set to the character before the
///   host name - either the second `/` in the `//` leading the authority,
///   or the `@` after a user-info. Comparing this value to the scheme end
///   position can be used to detect that there is a user-info component.
/// - [_portStartIndex]: Should be initialized to `start`.
///   Set to the position of the last `:` in an authority, and unchanged
///   if there is no authority or no `:` in an authority.
///   If this position is after the host start, there is a port, otherwise it
///   is just marking a colon in the user-info component.
/// - [_pathStartIndex]: Should be initialized to `start`.
///   Is set to the first path character unless the path is empty.
///   If the path is empty, the position is either unchanged (`start`) or
///   the first slash of an authority. So, if the path start is before a
///   host start or scheme end, the path is empty.
/// - [_queryStartIndex]: Should be initialized to `end`.
///   The position of the `?` leading a query if the URI contains a query.
/// - [_fragmentStartIndex]: Should be initialized to `end`.
///   The position of the `#` leading a fragment if the URI contains a fragment.
/// - [_notSimpleIndex]: Should be initialized to `start - 1`.
///   Set to another value if the URI is considered "not simple".
///   This is elaborated below.
///
/// # Simple URIs
/// A URI is considered "simple" if it is in a normalized form containing no
/// escapes. This allows us to skip normalization and checking whether escapes
/// are valid, and to extract components without worrying about unescaping.
///
/// The scanner computes a conservative approximation of being "simple".
/// It rejects any URI with an escape, with a user-info component (mainly
/// because they are rare and would increase the number of states in the
/// scanner significantly), with an IPV6 host or with a capital letter in
/// the scheme or host name (the scheme is handled in a second scan using
/// a separate two-state table).
/// Further, paths containing `..` or `.` path segments are considered
/// non-simple except for pure relative paths (no scheme or authority) starting
/// with a sequence of "../" segments.
///
/// The transition tables cannot detect a trailing ".." in the path,
/// followed by a query or fragment, because the segment is not known to be
/// complete until we are past it, and we then need to store the query/fragment
/// start instead. This case is checked manually post-scanning (such a path
/// needs to be normalized to end in "../", so the URI shouldn't be considered
/// simple).
List<Uint8List> _createTables() {
  // States used to scan a URI from scratch.
  assert(_uriStart == 0);
  const int uriStart = _uriStart;
  const int schemeOrPath = uriStart + 1;
  const int authOrPath = schemeOrPath + 1;
  const int authOrPathSlash = authOrPath + 1;
  const int userInfoOrHost0 = authOrPathSlash + 1;
  const int userInfoOrHost = userInfoOrHost0 + 1;
  const int userInfoOrPort0 = userInfoOrHost + 1;
  const int userInfoOrPort = userInfoOrPort0 + 1;
  const int ipv6Host = userInfoOrPort + 1;
  const int relPathSeg = ipv6Host + 1;
  const int pathSeg = relPathSeg + 1;
  const int path = pathSeg + 1;
  const int query = path + 1;
  const int fragment = query + 1;
  const int schemeOrPathDot = fragment + 1; // Path ends in `.`.
  const int schemeOrPathDot2 = schemeOrPathDot + 1; // Path ends in `..`.
  const int relPathSegDot = schemeOrPathDot2 + 1; // Path ends in `.`.
  const int relPathSegDot2 = relPathSegDot + 1; // Path ends in `..`.
  const int pathSegDot = relPathSegDot2 + 1; // Path ends in `.`.
  const int pathSegDot2 = pathSegDot + 1; // Path ends in `..`.
  // The "path ends in dot(s)" states are the first of the states in which
  // ending the scan makes the URI non-simple.
  assert(_nonSimpleEndStates == schemeOrPathDot);

  // States used to validate a scheme after its end position has been found.
  // A separate state machine in the same table.
  const int scheme0 = pathSegDot2 + 1;
  const int scheme = scheme0 + 1;
  assert(scheme0 == _schemeStart);

  // Total number of states for the scanner.
  const int stateCount = scheme + 1;
  assert(stateCount == _stateCount);
  assert(1 << _stateBits >= stateCount);

  // Constants encoding the write-index for the state transition into the
  // bits of a byte above the state bits.
  // A transition with none of these writes to index zero.
  const int schemeEnd = _schemeEndIndex << _stateBits;
  const int hostStart = _hostStartIndex << _stateBits;
  const int portStart = _portStartIndex << _stateBits;
  const int pathStart = _pathStartIndex << _stateBits;
  const int queryStart = _queryStartIndex << _stateBits;
  const int fragmentStart = _fragmentStartIndex << _stateBits;
  const int notSimple = _notSimpleIndex << _stateBits;

  /// The `unreserved` characters of RFC 3986.
  const unreserved =
      "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-._~";

  /// The `sub-delim` characters of RFC 3986.
  const subDelimiters = r"!$&'()*+,;=";
  // The `pchar` characters of RFC 3986: characters that may occur in a path,
  // excluding escapes.
  const pchar = "$unreserved$subDelimiters";

  // Number of entries in the table of each state, one for each value of
  // `char ^ _charXor` which is accepted as a valid table index.
  const int tableSize = _xorCharLimit + 1;

  var tables =
      List<Uint8List>.generate(stateCount, (_) => Uint8List(tableSize));

  // Helper function which initialize the table for [state] with a default
  // transition and returns the table.
  Uint8List build(int state, int defaultTransition) =>
      tables[state]..fillRange(0, tableSize, defaultTransition);

  // Helper function which sets the transition for each character in [chars]
  // to [transition] in the [target] table.
  // The [chars] string must contain only characters in the U+0020 .. U+007E
  // range.
  void setChars(Uint8List target, String chars, int transition) {
    for (int i = 0; i < chars.length; i++) {
      var char = chars.codeUnitAt(i);
      target[char ^ _charXor] = transition;
    }
  }

  // Helper function which sets the transition for all characters in the
  // range from `range[0]` to `range[1]` to [transition] in the [target] table.
  //
  // The [range] must be a two-character string where both characters are in
  // the U+0020 .. U+007E range and the former character must have a lower
  // code point than the latter.
  void setRange(Uint8List target, String range, int transition) {
    for (int i = range.codeUnitAt(0), n = range.codeUnitAt(1); i <= n; i++) {
      target[i ^ _charXor] = transition;
    }
  }

  // Create the transitions for each state.
  // Later calls overwrite earlier ones for the same character, so each state
  // starts with its default and most general transitions.
  Uint8List b;

  // Entry point of URI-scanner state machine.
  // Validate as path. If it is a scheme, we recognize that
  // and validate it later.
  b = build(uriStart, schemeOrPath | notSimple);
  setChars(b, pchar, schemeOrPath);
  setChars(b, ".", schemeOrPathDot);
  setChars(b, ":", authOrPath | schemeEnd); // Handle later.
  setChars(b, "/", authOrPathSlash);
  setChars(b, r"\", authOrPathSlash | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(schemeOrPathDot, schemeOrPath | notSimple);
  setChars(b, pchar, schemeOrPath);
  setChars(b, ".", schemeOrPathDot2);
  setChars(b, ':', authOrPath | schemeEnd);
  setChars(b, r"/\", pathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(schemeOrPathDot2, schemeOrPath | notSimple);
  setChars(b, pchar, schemeOrPath);
  setChars(b, "%", schemeOrPath | notSimple);
  setChars(b, ':', authOrPath | schemeEnd);
  setChars(b, "/", relPathSeg);
  setChars(b, r"\", relPathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(schemeOrPath, schemeOrPath | notSimple);
  setChars(b, pchar, schemeOrPath);
  setChars(b, ':', authOrPath | schemeEnd);
  setChars(b, "/", pathSeg);
  setChars(b, r"\", pathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(authOrPath, path | notSimple);
  setChars(b, pchar, path | pathStart);
  setChars(b, "/", authOrPathSlash | pathStart);
  setChars(b, r"\", authOrPathSlash | pathStart); // This should be non-simple.
  setChars(b, ".", pathSegDot | pathStart);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(authOrPathSlash, path | notSimple);
  setChars(b, pchar, path);
  setChars(b, "/", userInfoOrHost0 | hostStart);
  setChars(b, r"\", userInfoOrHost0 | hostStart); // This should be non-simple.
  setChars(b, ".", pathSegDot);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(userInfoOrHost0, userInfoOrHost | notSimple);
  setChars(b, pchar, userInfoOrHost);
  setRange(b, "AZ", userInfoOrHost | notSimple);
  setChars(b, ":", userInfoOrPort0 | portStart);
  setChars(b, "@", userInfoOrHost0 | hostStart);
  setChars(b, "[", ipv6Host | notSimple);
  setChars(b, "/", pathSeg | pathStart);
  setChars(b, r"\", pathSeg | pathStart); // This should be non-simple.
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(userInfoOrHost, userInfoOrHost | notSimple);
  setChars(b, pchar, userInfoOrHost);
  setRange(b, "AZ", userInfoOrHost | notSimple);
  setChars(b, ":", userInfoOrPort0 | portStart);
  setChars(b, "@", userInfoOrHost0 | hostStart);
  setChars(b, "/", pathSeg | pathStart);
  setChars(b, r"\", pathSeg | pathStart); // This should be non-simple.
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  // First character after a `:` in the authority.
  // Only the digits 1 through 9 keep the URI simple here.
  b = build(userInfoOrPort0, userInfoOrPort | notSimple);
  setRange(b, "19", userInfoOrPort);
  setChars(b, "@", userInfoOrHost0 | hostStart);
  setChars(b, "/", pathSeg | pathStart);
  setChars(b, r"\", pathSeg | pathStart); // This should be non-simple.
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(userInfoOrPort, userInfoOrPort | notSimple);
  setRange(b, "09", userInfoOrPort);
  setChars(b, "@", userInfoOrHost0 | hostStart);
  setChars(b, "/", pathSeg | pathStart);
  setChars(b, r"\", pathSeg | pathStart); // This should be non-simple.
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  // Stays in this state until the closing `]`. The transition into this
  // state has already marked the URI as non-simple.
  b = build(ipv6Host, ipv6Host);
  setChars(b, "]", userInfoOrHost);

  b = build(relPathSeg, path | notSimple);
  setChars(b, pchar, path);
  setChars(b, ".", relPathSegDot);
  setChars(b, r"/\", pathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(relPathSegDot, path | notSimple);
  setChars(b, pchar, path);
  setChars(b, ".", relPathSegDot2);
  setChars(b, r"/\", pathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(relPathSegDot2, path | notSimple);
  setChars(b, pchar, path);
  setChars(b, "/", relPathSeg);
  setChars(b, r"\", relPathSeg | notSimple);
  setChars(b, "?", query | queryStart); // This should be non-simple.
  setChars(b, "#", fragment | fragmentStart); // This should be non-simple.

  b = build(pathSeg, path | notSimple);
  setChars(b, pchar, path);
  setChars(b, ".", pathSegDot);
  setChars(b, "/", pathSeg);
  setChars(b, r"\", pathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(pathSegDot, path | notSimple);
  setChars(b, pchar, path);
  setChars(b, ".", pathSegDot2);
  setChars(b, r"/\", pathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(pathSegDot2, path | notSimple);
  setChars(b, pchar, path);
  setChars(b, r"/\", pathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(path, path | notSimple);
  setChars(b, pchar, path);
  setChars(b, "/", pathSeg);
  setChars(b, r"\", pathSeg | notSimple);
  setChars(b, "?", query | queryStart);
  setChars(b, "#", fragment | fragmentStart);

  b = build(query, query | notSimple);
  setChars(b, pchar, query);
  setChars(b, "?", query);
  setChars(b, "#", fragment | fragmentStart);

  b = build(fragment, fragment | notSimple);
  setChars(b, pchar, fragment);
  setChars(b, "?", fragment);

  // A separate two-state validator for lower-case scheme names.
  // Any non-scheme character or upper-case letter is marked as non-simple.
  b = build(scheme0, scheme | notSimple);
  setRange(b, "az", scheme);

  b = build(scheme, scheme | notSimple);
  setRange(b, "az", scheme);
  setRange(b, "09", scheme);
  setChars(b, "+-.", scheme);

  return tables;
}