tools/dom/src/Validators.dart

// Copyright (c) 2013, the Dart project authors.  Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.

part of dart.dom.html;

/**
 * Interface used to validate that only accepted elements and attributes are
 * allowed while parsing HTML strings into DOM nodes.
 *
 * In general, customization of validation behavior should be done via the
 * [NodeValidatorBuilder] class to mitigate the chances of incorrectly
 * implementing validation rules.
 */
abstract class NodeValidator {
  /**
   * Construct a default NodeValidator which only accepts whitelisted HTML5
   * elements and attributes.
   *
   * If a uriPolicy is not specified then the default uriPolicy will be used.
   */
  factory NodeValidator({UriPolicy? uriPolicy}) =>
      new _Html5NodeValidator(uriPolicy: uriPolicy);

  factory NodeValidator.throws(NodeValidator base) =>
      new _ThrowsNodeValidator(base);

  /**
   * Returns true if the tagName is an accepted type.
   */
  bool allowsElement(Element element);

  /**
   * Returns true if the attribute is allowed.
   *
   * The attributeName parameter will always be in lowercase.
   *
   * See [allowsElement] for format of tagName.
   */
  bool allowsAttribute(Element element, String attributeName, String value);
}

/**
 * Performs sanitization of a node tree after construction to ensure that it
 * does not contain any disallowed elements or attributes.
 *
 * In general custom implementations of this class should not be necessary and
 * all validation customization should be done in custom NodeValidators, but
 * custom implementations of this class can be created to perform more complex
 * tree sanitization.
 */
abstract class NodeTreeSanitizer {
  /**
   * Constructs a default tree sanitizer which will remove all elements and
   * attributes which are not allowed by the provided validator.
   */
  factory NodeTreeSanitizer(NodeValidator validator) =>
      new _ValidatingTreeSanitizer(validator);

  /**
   * Called with the root of the tree which is to be sanitized.
   *
   * This method needs to walk the entire tree and either remove elements and
   * attributes which are not recognized as safe or throw an exception which
   * will mark the entire tree as unsafe.
   */
  void sanitizeTree(Node node);

  /**
   * A sanitizer for trees that we trust. It does no validation and allows
   * any elements. It is also more efficient, since it can pass the text
   * directly through to the underlying APIs without creating a document
   * fragment to be sanitized.
   */
  static const trusted = const _TrustedHtmlTreeSanitizer();
}

/**
 * A sanitizer for trees that we trust. It does no validation and allows
 * any elements.
 */
class _TrustedHtmlTreeSanitizer implements NodeTreeSanitizer {
  const _TrustedHtmlTreeSanitizer();

  sanitizeTree(Node node) {}
}

/**
 * Defines the policy for what types of uris are allowed for particular
 * attribute values.
 *
 * This can be used to provide custom rules such as allowing all http:// URIs
 * for image attributes but only same-origin URIs for anchor tags.
 */
abstract class UriPolicy {
  /**
   * Constructs the default UriPolicy which is to only allow Uris to the same
   * origin as the application was launched from.
   *
   * This will block all ftp: mailto: URIs. It will also block accessing
   * https://example.com if the app is running from http://example.com.
   */
  factory UriPolicy() => new _SameOriginUriPolicy();

  /**
   * Checks if the uri is allowed on the specified attribute.
   *
   * The uri provided may or may not be a relative path.
   */
  bool allowsUri(String uri);
}

/**
 * Allows URIs to the same origin as the current application was loaded from
 * (such as https://example.com:80).
 */
class _SameOriginUriPolicy implements UriPolicy {
  final AnchorElement _hiddenAnchor = new AnchorElement();
  final Location _loc = window.location;

  bool allowsUri(String uri) {
    _hiddenAnchor.href = uri;
    // IE leaves an empty hostname for same-origin URIs.
    return (_hiddenAnchor.hostname == _loc.hostname &&
            _hiddenAnchor.port == _loc.port &&
            _hiddenAnchor.protocol == _loc.protocol) ||
        (_hiddenAnchor.hostname == '' &&
            _hiddenAnchor.port == '' &&
            (_hiddenAnchor.protocol == ':' || _hiddenAnchor.protocol == ''));
  }
}

class _ThrowsNodeValidator implements NodeValidator {
  final NodeValidator validator;

  _ThrowsNodeValidator(this.validator) {}

  bool allowsElement(Element element) {
    if (!validator.allowsElement(element)) {
      throw new ArgumentError(Element._safeTagName(element));
    }
    return true;
  }

  bool allowsAttribute(Element element, String attributeName, String value) {
    if (!validator.allowsAttribute(element, attributeName, value)) {
      throw new ArgumentError(
          '${Element._safeTagName(element)}[$attributeName="$value"]');
    }
    return true;
  }
}

/**
 * Standard tree sanitizer which validates a node tree against the provided
 * validator and removes any nodes or attributes which are not allowed.
 */
class _ValidatingTreeSanitizer implements NodeTreeSanitizer {
  NodeValidator validator;

  /// Number of tree modifications this instance has made.
  int numTreeModifications = 0;
  _ValidatingTreeSanitizer(this.validator) {}

  void sanitizeTree(Node node) {
    void walk(Node node, Node? parent) {
      sanitizeNode(node, parent);

      var child = node.lastChild;
      while (null != child) {
        Node? nextChild;
        try {
          // Child may be removed during the walk, and we may not even be able
          // to get its previousNode. But it's also possible that previousNode
          // (i.e. previousSibling) is being spoofed, so double-check it.
          nextChild = child.previousNode;
          if (nextChild != null && nextChild.nextNode != child) {
            throw StateError("Corrupt HTML");
          }
        } catch (e) {
          // Child appears bad, remove it. We want to check the rest of the
          // children of node and, but we have no way of getting to the next
          // child, so start again from the last child.
          _removeNode(child, node);
          child = null;
          nextChild = node.lastChild;
        }
        if (child != null) walk(child, node);
        child = nextChild;
      }
    }

    // Walk the tree until no new modifications are added to the tree.
    var previousTreeModifications;
    do {
      previousTreeModifications = numTreeModifications;
      walk(node, null);
    } while (previousTreeModifications != numTreeModifications);
  }

  /// Aggressively try to remove node.
  void _removeNode(Node node, Node? parent) {
    // If we have the parent, it's presumably already passed more sanitization
    // or is the fragment, so ask it to remove the child. And if that fails
    // try to set the outer html.
    numTreeModifications++;
    if (parent == null || parent != node.parentNode) {
      node.remove();
    } else {
      parent._removeChild(node);
    }
  }

  /// Sanitize the element, assuming we can't trust anything about it.
  void _sanitizeUntrustedElement(/* Element */ element, Node? parent) {
    // If the _hasCorruptedAttributes does not successfully return false,
    // then we consider it corrupted and remove.
    // TODO(alanknight): This is a workaround because on Firefox
    // embed/object
    // tags typeof is "function", not "object". We don't recognize them, and
    // can't call methods. This does mean that you can't explicitly allow an
    // embed tag. The only thing that will let it through is a null
    // sanitizer that doesn't traverse the tree at all. But sanitizing while
    // allowing embeds seems quite unlikely. This is also the reason that we
    // can't declare the type of element, as an embed won't pass any type
    // check in dart2js.
    var corrupted = true;
    var attrs;
    var isAttr;
    try {
      // If getting/indexing attributes throws, count that as corrupt.
      // Don't remove dynamic calls in sanitizing code.
      // ignore: avoid_dynamic_calls
      attrs = element.attributes;
      // Don't remove dynamic calls in sanitizing code.
      // ignore: avoid_dynamic_calls
      isAttr = attrs['is'];
      var corruptedTest1 = Element._hasCorruptedAttributes(element);

      // On IE, erratically, the hasCorruptedAttributes test can return false,
      // even though it clearly is corrupted. A separate copy of the test
      // inlining just the basic check seems to help.
      corrupted = corruptedTest1
          ? true
          : Element._hasCorruptedAttributesAdditionalCheck(element);
    } catch (e) {}
    var elementText = 'element unprintable';
    try {
      elementText = element.toString();
    } catch (e) {}
    try {
      var elementTagName = Element._safeTagName(element);
      _sanitizeElement(element, parent, corrupted, elementText, elementTagName,
          attrs, isAttr);
    } on ArgumentError {
      // Thrown by _ThrowsNodeValidator
      rethrow;
    } catch (e) {
      // Unexpected exception sanitizing -> remove
      _removeNode(element, parent);
      window.console.warn('Removing corrupted element $elementText');
    }
  }

  /// Having done basic sanity checking on the element, and computed the
  /// important attributes we want to check, remove it if it's not valid
  /// or not allowed, either as a whole or particular attributes.
  void _sanitizeElement(Element element, Node? parent, bool corrupted,
      String text, String tag, Map attrs, String? isAttr) {
    if (false != corrupted) {
      _removeNode(element, parent);
      window.console
          .warn('Removing element due to corrupted attributes on <$text>');
      return;
    }
    if (!validator.allowsElement(element)) {
      _removeNode(element, parent);
      window.console.warn('Removing disallowed element <$tag> from $parent');
      return;
    }

    if (isAttr != null) {
      if (!validator.allowsAttribute(element, 'is', isAttr)) {
        _removeNode(element, parent);
        window.console.warn('Removing disallowed type extension '
            '<$tag is="$isAttr">');
        return;
      }
    }

    // TODO(blois): Need to be able to get all attributes, irrespective of
    // XMLNS.
    var keys = attrs.keys.toList();
    for (var i = attrs.length - 1; i >= 0; --i) {
      var name = keys[i];
      if (!validator.allowsAttribute(
          element,
          // Don't remove dynamic calls in sanitizing code.
          // ignore: avoid_dynamic_calls
          name.toLowerCase(),
          attrs[name])) {
        window.console.warn('Removing disallowed attribute '
            '<$tag $name="${attrs[name]}">');
        attrs.remove(name);
      }
    }

    if (element is TemplateElement) {
      TemplateElement template = element;
      sanitizeTree(template.content!);
    }
  }

  /// Sanitize the node and its children recursively.
  void sanitizeNode(Node node, Node? parent) {
    switch (node.nodeType) {
      case Node.ELEMENT_NODE:
        _sanitizeUntrustedElement(node, parent);
        break;
      case Node.COMMENT_NODE:
      case Node.DOCUMENT_FRAGMENT_NODE:
      case Node.TEXT_NODE:
      case Node.CDATA_SECTION_NODE:
        break;
      default:
        _removeNode(node, parent);
    }
  }
}