Skip to content

Make parsing of text be non-quadratic. #579

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Make parsing of text be non-quadratic.
In Python, appending strings is not guaranteed to be constant-time,
since they are documented to be immutable.  In some corner cases,
CPython is able to make these operations constant-time, but reaching
into ETree objects is not such a case.

This leads to parse times being quadratic in the size of the text in
the input in pathological cases where parsing outputs a large number
of adjacent text nodes which must be combined (e.g. HTML-escaped
values).  Specifically, we expect doubling the size of the input to
result in approximately doubling the time to parse; instead, we
observe quadratic behavior:

```
In [1]: import html5lib

In [2]: %timeit -n1 -r5 html5lib.parse("<" * 200000)
2.99 s ± 269 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)

In [3]: %timeit -n1 -r5 html5lib.parse("<" * 400000)
6.7 s ± 242 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)

In [4]: %timeit -n1 -r5 html5lib.parse("<" * 800000)
19.5 s ± 1.48 s per loop (mean ± std. dev. of 5 runs, 1 loop each)
```

Switch from appending to the internal `str`, to appending text to an
array of text chunks, as appends can be done in constant time.  Using
`bytearray` is a similar solution, but benchmarks slightly worse
because the strings must be encoded before being appended.

This improves parsing of text documents noticeably:

```
In [1]: import html5lib

In [2]: %timeit -n1 -r5 html5lib.parse("<" * 200000)
2.3 s ± 373 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)

In [3]: %timeit -n1 -r5 html5lib.parse("<" * 400000)
3.85 s ± 29.7 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)

In [4]: %timeit -n1 -r5 html5lib.parse("<" * 800000)
8.04 s ± 317 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)
```
  • Loading branch information
alexmv committed Feb 27, 2024
commit 075cb7cf7a2e332ba41e60f812878885434af89b
81 changes: 52 additions & 29 deletions html5lib/treebuilders/etree.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,23 @@
tag_regexp = re.compile("{([^}]*)}(.*)")


class TextBuffer:
def __init__(self, initial=""):
self.chunks = [initial]

def __str__(self):
return "".join(self.chunks)

def getvalue(self):
return "".join(self.chunks)

def append(self, other):
self.chunks.append(other)

def __eq__(self, other):
return self.getvalue() == other


def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
Expand Down Expand Up @@ -110,25 +127,25 @@ def removeChild(self, node):
def insertText(self, data, insertBefore=None):
if not len(self._element):
if not self._element.text:
self._element.text = ""
self._element.text += data
self._element.text = TextBuffer("")
self._element.text.append(data)
elif insertBefore is None:
# Insert the text as the tail of the last child element
if not self._element[-1].tail:
self._element[-1].tail = ""
self._element[-1].tail += data
self._element[-1].tail = TextBuffer("")
self._element[-1].tail.append(data)
else:
# Insert the text before the specified node
children = list(self._element)
index = children.index(insertBefore._element)
if index > 0:
if not self._element[index - 1].tail:
self._element[index - 1].tail = ""
self._element[index - 1].tail += data
self._element[index - 1].tail = TextBuffer("")
self._element[index - 1].tail.append(data)
else:
if not self._element.text:
self._element.text = ""
self._element.text += data
self._element.text = TextBuffer("")
self._element.text.append(data)

def cloneNode(self):
element = type(self)(self.name, self.namespace)
Expand All @@ -138,36 +155,39 @@ def cloneNode(self):

def reparentChildren(self, newParent):
if newParent.childNodes:
newParent.childNodes[-1]._element.tail += self._element.text
newParent.childNodes[-1]._element.tail.append(
self._element.text.getvalue()
)
else:
if not newParent._element.text:
newParent._element.text = ""
newParent._element.text = TextBuffer("")
if self._element.text is not None:
newParent._element.text += self._element.text
self._element.text = ""
newParent._element.text.append(self._element.text.getvalue())
self._element.text = TextBuffer("")
base.Node.reparentChildren(self, newParent)

class Comment(Element):
def __init__(self, data):
# Use the superclass constructor to set all properties on the
# wrapper element
self._element = ElementTree.Comment(data)
self._element.text = TextBuffer(data)
self.parent = None
self._childNodes = []
self._flags = []

def _getData(self):
return self._element.text
return self._element.text.getvalue()

def _setData(self, value):
self._element.text = value
self._element.text = TextBuffer(value)

data = property(_getData, _setData)

class DocumentType(Element):
def __init__(self, name, publicId, systemId):
Element.__init__(self, "<!DOCTYPE>")
self._element.text = name
self._element.text = TextBuffer(name)
self.publicId = publicId
self.systemId = systemId

Expand Down Expand Up @@ -208,19 +228,19 @@ def serializeElement(element, indent=0):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append("""<!DOCTYPE %s "%s" "%s">""" %
(element.text, publicId, systemId))
(element.text.getvalue(), publicId, systemId))
else:
rv.append("<!DOCTYPE %s>" % (element.text,))
rv.append("<!DOCTYPE %s>" % (element.text.getvalue(),))
elif element.tag == "DOCUMENT_ROOT":
rv.append("#document")
if element.text is not None:
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text.getvalue()))
if element.tail is not None:
raise TypeError("Document node cannot have tail")
if hasattr(element, "attrib") and len(element.attrib):
raise TypeError("Document node cannot have attributes")
elif element.tag == ElementTreeCommentType:
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text.getvalue()))
else:
assert isinstance(element.tag, text_type), \
"Expected unicode, got %s, %s" % (type(element.tag), element.tag)
Expand Down Expand Up @@ -248,13 +268,14 @@ def serializeElement(element, indent=0):

for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
if element.text:
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
if element.text and element.text.getvalue():
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text.getvalue()))
indent += 2
for child in element:
serializeElement(child, indent)
if element.tail:
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail.getvalue()))

serializeElement(element, 0)

return "\n".join(rv)
Expand All @@ -272,13 +293,15 @@ def serializeElement(element):
if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
(element.text, publicId, systemId))
rv.append(
"""<!DOCTYPE %s PUBLIC "%s" "%s">"""
% (element.text.getvalue(), publicId, systemId)
)
else:
rv.append("<!DOCTYPE %s>" % (element.text,))
rv.append("<!DOCTYPE %s>" % (element.text.getvalue(),))
elif element.tag == "DOCUMENT_ROOT":
if element.text is not None:
rv.append(element.text)
rv.append(element.text.getvalue())
if element.tail is not None:
raise TypeError("Document node cannot have tail")
if hasattr(element, "attrib") and len(element.attrib):
Expand All @@ -288,7 +311,7 @@ def serializeElement(element):
serializeElement(child)

elif element.tag == ElementTreeCommentType:
rv.append("<!--%s-->" % (element.text,))
rv.append("<!--%s-->" % (element.text.getvalue(),))
else:
# This is assumed to be an ordinary element
if not element.attrib:
Expand All @@ -299,15 +322,15 @@ def serializeElement(element):
for name, value in element.attrib.items()])
rv.append("<%s %s>" % (element.tag, attr))
if element.text:
rv.append(element.text)
rv.append(element.text.getvalue())

for child in element:
serializeElement(child)

rv.append("</%s>" % (element.tag,))

if element.tail:
rv.append(element.tail)
rv.append(element.tail.getvalue())

serializeElement(element)

Expand Down
12 changes: 8 additions & 4 deletions html5lib/treewalkers/etree.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element
elt, _, _, flag = node
if flag in ("text", "tail"):
return base.TEXT, getattr(elt, flag)
return base.TEXT, getattr(elt, flag).getvalue()
else:
node = elt

Expand All @@ -44,11 +44,15 @@ def getNodeDetails(self, node):
return (base.DOCUMENT,)

elif node.tag == "<!DOCTYPE>":
return (base.DOCTYPE, node.text,
node.get("publicId"), node.get("systemId"))
return (
base.DOCTYPE,
node.text.getvalue(),
node.get("publicId"),
node.get("systemId"),
)

elif node.tag == ElementTreeCommentType:
return base.COMMENT, node.text
return base.COMMENT, node.text.getvalue()

else:
assert isinstance(node.tag, string_types), type(node.tag)
Expand Down