
Tokenizer: use Python objects to represent tokens #521


Closed · wants to merge 19 commits
Changes from 1 commit
Commits (19)
183d8a0
Consistency: consume a single character at a time during attribute na…
jayaddison Dec 29, 2020
2e86373
Refactor: pretranslate lowercase element and attribute names
jayaddison Dec 29, 2020
8f96b17
Restore self.currentToken safety check
jayaddison Dec 29, 2020
a912842
Alternate approach: do not pretranslate temporary buffered data
jayaddison Dec 30, 2020
f9f370e
Consistency: character consumption within double-escaped state
jayaddison Dec 30, 2020
bcee8bd
Refactor: use Python objects for tokens within tokenizer
jayaddison Dec 29, 2020
67262f8
Introduce type hierarchy for tag-related tokens
jayaddison Dec 29, 2020
900bdaf
Simplify tag token construction
jayaddison Dec 29, 2020
1f6cae9
Refactor token attribution name/value accumulation
jayaddison Dec 29, 2020
695ac1c
Cleanup: remove leavingThisState / emitToken logic
jayaddison Dec 29, 2020
b1a444b
Remove EmptyTag tokenizer token class
jayaddison Dec 29, 2020
bb7fabc
Refactor: pre-translate strings that are only used in lowercase context
jayaddison Dec 29, 2020
5f4ace9
Cleanup: remove getattr anti-pattern
jayaddison Dec 29, 2020
d744c86
Consistency: use camel-casing to correspond with existing codebase style
jayaddison Dec 29, 2020
1d62e69
Consistency: consume a single character at a time during attribute na…
jayaddison Dec 29, 2020
8772408
Merge branch 'tokenizer/pretranslate-lowercase-names' into tokenizer/…
jayaddison Dec 30, 2020
192cce0
Linting cleanup
jayaddison Dec 30, 2020
e76e0dd
Clarify method name: clearAttribute -> flushAttribute
jayaddison Jan 4, 2021
da37332
Merge branch 'master' into tokenizer/object-tokens
jayaddison Sep 20, 2021
Refactor token attribution name/value accumulation
jayaddison committed Dec 30, 2020
commit 1f6cae93c773266a90940a3228b6b89b7288d11d
97 changes: 52 additions & 45 deletions html5lib/_tokenizer.py
@@ -43,10 +43,24 @@ class SpaceCharacters(Token):


class Tag(Token):
def __init__(self, name, data):
def __init__(self, name, attributes):
self.name = name
self.data = data or []
self.attributes = attributeMap(attributes or {})
self.self_closing = False
self.attribute_name = ""
self.attribute_value = ""

def clearAttribute(self):
if self.attribute_name and self.attribute_name not in self.attributes:
self.attributes[self.attribute_name] = self.attribute_value
self.attribute_name = ""
self.attribute_value = ""

def accumulateAttributeName(self, text):
self.attribute_name += text.translate(asciiUpper2Lower)

def accumulateAttributeValue(self, text):
self.attribute_value += text

class StartTag(Tag):
def __init__(self, name, data=None):
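To make the new accumulation API concrete, here is a minimal, self-contained sketch that drives a Tag-like object by hand, the way the state-machine methods below do. The attributeMap and asciiUpper2Lower stand-ins are assumptions for illustration only; in html5lib they come from the library's constants, and attributeMap is an order-preserving mapping.

from collections import OrderedDict

attributeMap = OrderedDict  # stand-in: html5lib chooses an ordered mapping
asciiUpper2Lower = {ord(c): ord(c.lower())
                    for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"}  # stand-in

class Tag:
    def __init__(self, name, attributes=None):
        self.name = name
        self.attributes = attributeMap(attributes or {})
        self.self_closing = False
        self.attribute_name = ""
        self.attribute_value = ""

    def clearAttribute(self):
        # Flush the pending name/value pair; the first occurrence of a
        # name wins, so a later duplicate is silently dropped here.
        if self.attribute_name and self.attribute_name not in self.attributes:
            self.attributes[self.attribute_name] = self.attribute_value
        self.attribute_name = ""
        self.attribute_value = ""

    def accumulateAttributeName(self, text):
        self.attribute_name += text.translate(asciiUpper2Lower)

    def accumulateAttributeValue(self, text):
        self.attribute_value += text

# Simulate tokenizing: <input TYPE=text type=hidden>
tag = Tag("input")
tag.accumulateAttributeName("TYPE")    # lowercased on the way in
tag.accumulateAttributeValue("text")
tag.clearAttribute()                   # flushed into tag.attributes
tag.accumulateAttributeName("type")    # duplicate name
tag.accumulateAttributeValue("hidden")
tag.clearAttribute()                   # dropped: "type" already present
assert dict(tag.attributes) == {"type": "text"}

Accumulating into plain strings and flushing on clearAttribute() is what lets the states below stop indexing into token.data[-1].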
@@ -248,7 +262,7 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False):
output = "&" + "".join(charStack)

if fromAttribute:
self.currentToken.data[-1][1] += output
self.currentToken.accumulateAttributeValue(output)
else:
if output in spaceCharacters:
token = SpaceCharacters(output)
@@ -270,17 +284,9 @@ def emitCurrentToken(self):
# Add token to the queue to be yielded
if isinstance(token, Tag):
token.name = token.name.translate(asciiUpper2Lower)
if isinstance(token, StartTag):
raw = token.data
data = attributeMap(raw)
if len(raw) > len(data):
# we had some duplicated attribute, fix so first wins
was = dict(data)
data.update(raw[::-1])
token.data = data

token.clearAttribute()
if isinstance(token, EndTag):
if token.data:
if token.attributes:
self.tokenQueue.append(ParseError("attributes-in-end-tag"))
if token.self_closing:
self.tokenQueue.append(ParseError("self-closing-flag-on-end-tag"))
@@ -820,25 +826,29 @@ def beforeAttributeNameState(self):
if data in spaceCharacters:
self.stream.charsUntil(spaceCharacters, True)
elif data in asciiLetters:
self.currentToken.data.append([data, ""])
self.currentToken.clearAttribute()
self.currentToken.accumulateAttributeName(data)
self.state = self.attributeNameState
elif data == ">":
self.emitCurrentToken()
elif data == "/":
self.state = self.selfClosingStartTagState
elif data in ("'", '"', "=", "<"):
self.tokenQueue.append(ParseError("invalid-character-in-attribute-name"))
self.currentToken.data.append([data, ""])
self.currentToken.clearAttribute()
self.currentToken.accumulateAttributeName(data)
self.state = self.attributeNameState
elif data == "\u0000":
self.tokenQueue.append(ParseError("invalid-codepoint"))
self.currentToken.data.append(["\uFFFD", ""])
self.currentToken.clearAttribute()
self.currentToken.accumulateAttributeName("\uFFFD")
self.state = self.attributeNameState
elif data is EOF:
self.tokenQueue.append(ParseError("expected-attribute-name-but-got-eof"))
self.state = self.dataState
else:
self.currentToken.data.append([data, ""])
self.currentToken.clearAttribute()
self.currentToken.accumulateAttributeName(data)
self.state = self.attributeNameState
return True

@@ -849,8 +859,7 @@ def attributeNameState(self):
if data == "=":
self.state = self.beforeAttributeValueState
elif data in asciiLetters:
self.currentToken.data[-1][0] += data +\
self.stream.charsUntil(asciiLetters, True)
self.currentToken.accumulateAttributeName(data + self.stream.charsUntil(asciiLetters, True))
leavingThisState = False
elif data == ">":
# XXX If we emit here the attributes are converted to a dict
@@ -863,29 +872,25 @@
self.state = self.selfClosingStartTagState
elif data == "\u0000":
self.tokenQueue.append(ParseError("invalid-codepoint"))
self.currentToken.data[-1][0] += "\uFFFD"
self.currentToken.accumulateAttributeName("\uFFFD")
leavingThisState = False
elif data in ("'", '"', "<"):
self.tokenQueue.append(ParseError("invalid-character-in-attribute-name"))
self.currentToken.data[-1][0] += data
self.currentToken.accumulateAttributeName(data)
leavingThisState = False
elif data is EOF:
self.tokenQueue.append(ParseError("eof-in-attribute-name"))
self.state = self.dataState
else:
self.currentToken.data[-1][0] += data
self.currentToken.accumulateAttributeName(data)
leavingThisState = False

if leavingThisState:
# Attributes are not dropped at this stage. That happens when the
# start tag token is emitted so values can still be safely appended
# to attributes, but we do want to report the parse error in time.
self.currentToken.data[-1][0] = (
self.currentToken.data[-1][0].translate(asciiUpper2Lower))
for name, _ in self.currentToken.data[:-1]:
if self.currentToken.data[-1][0] == name:
self.tokenQueue.append(ParseError("duplicate-attribute"))
break
if self.currentToken.attribute_name in self.currentToken.attributes:
self.tokenQueue.append(ParseError("duplicate-attribute"))
# XXX Fix for above XXX
if emitToken:
self.emitCurrentToken()
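With the pending name held on the token, the duplicate check above reduces to a single membership test against the already-flushed attributes. A hedged end-to-end sketch of the observable behaviour, assuming ParseError and StartTag are importable from html5lib._tokenizer as they are elsewhere on this branch:

from html5lib._tokenizer import HTMLTokenizer, ParseError, StartTag

tokens = list(HTMLTokenizer("<span id=first id=second>"))
# The duplicate is reported as a parse error when the second name completes...
assert any(isinstance(t, ParseError) for t in tokens)
# ...while the emitted tag keeps the first occurrence (first wins).
start = next(t for t in tokens if isinstance(t, StartTag))
assert dict(start.attributes) == {"id": "first"}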
@@ -900,23 +905,27 @@ def afterAttributeNameState(self):
elif data == ">":
self.emitCurrentToken()
elif data in asciiLetters:
self.currentToken.data.append([data, ""])
self.currentToken.clearAttribute()
self.currentToken.accumulateAttributeName(data)
self.state = self.attributeNameState
elif data == "/":
self.state = self.selfClosingStartTagState
elif data == "\u0000":
self.tokenQueue.append(ParseError("invalid-codepoint"))
self.currentToken.data.append(["\uFFFD", ""])
self.currentToken.clearAttribute()
self.currentToken.accumulateAttributeName("\uFFFD")
self.state = self.attributeNameState
elif data in ("'", '"', "<"):
self.tokenQueue.append(ParseError("invalid-character-after-attribute-name"))
self.currentToken.data.append([data, ""])
self.currentToken.clearAttribute()
self.currentToken.accumulateAttributeName(data)
self.state = self.attributeNameState
elif data is EOF:
self.tokenQueue.append(ParseError("expected-end-of-tag-but-got-eof"))
self.state = self.dataState
else:
self.currentToken.data.append([data, ""])
self.currentToken.clearAttribute()
self.currentToken.accumulateAttributeName(data)
self.state = self.attributeNameState
return True

@@ -936,17 +945,17 @@ def beforeAttributeValueState(self):
self.emitCurrentToken()
elif data == "\u0000":
self.tokenQueue.append(ParseError("invalid-codepoint"))
self.currentToken.data[-1][1] += "\uFFFD"
self.currentToken.accumulateAttributeValue("\uFFFD")
self.state = self.attributeValueUnQuotedState
elif data in ("=", "<", "`"):
self.tokenQueue.append(ParseError("equals-in-unquoted-attribute-value"))
self.currentToken.data[-1][1] += data
self.currentToken.accumulateAttributeValue(data)
self.state = self.attributeValueUnQuotedState
elif data is EOF:
self.tokenQueue.append(ParseError("expected-attribute-value-but-got-eof"))
self.state = self.dataState
else:
self.currentToken.data[-1][1] += data
self.currentToken.accumulateAttributeValue(data)
self.state = self.attributeValueUnQuotedState
return True

@@ -958,13 +967,12 @@ def attributeValueDoubleQuotedState(self):
self.processEntityInAttribute('"')
elif data == "\u0000":
self.tokenQueue.append(ParseError("invalid-codepoint"))
self.currentToken.data[-1][1] += "\uFFFD"
self.currentToken.accumulateAttributeValue("\uFFFD")
elif data is EOF:
self.tokenQueue.append(ParseError("eof-in-attribute-value-double-quote"))
self.state = self.dataState
else:
self.currentToken.data[-1][1] += data +\
self.stream.charsUntil(("\"", "&", "\u0000"))
self.currentToken.accumulateAttributeValue(data + self.stream.charsUntil(("\"", "&", "\u0000")))
return True

def attributeValueSingleQuotedState(self):
@@ -975,13 +983,12 @@ def attributeValueSingleQuotedState(self):
self.processEntityInAttribute("'")
elif data == "\u0000":
self.tokenQueue.append(ParseError("invalid-codepoint"))
self.currentToken.data[-1][1] += "\uFFFD"
self.currentToken.accumulateAttributeValue("\uFFFD")
elif data is EOF:
self.tokenQueue.append(ParseError("eof-in-attribute-value-single-quote"))
self.state = self.dataState
else:
self.currentToken.data[-1][1] += data +\
self.stream.charsUntil(("'", "&", "\u0000"))
self.currentToken.accumulateAttributeValue(data + self.stream.charsUntil(("'", "&", "\u0000")))
return True

def attributeValueUnQuotedState(self):
@@ -994,16 +1001,16 @@ def attributeValueUnQuotedState(self):
self.emitCurrentToken()
elif data in ('"', "'", "=", "<", "`"):
self.tokenQueue.append(ParseError("unexpected-character-in-unquoted-attribute-value"))
self.currentToken.data[-1][1] += data
self.currentToken.accumulateAttributeValue(data)
elif data == "\u0000":
self.tokenQueue.append(ParseError("invalid-codepoint"))
self.currentToken.data[-1][1] += "\uFFFD"
self.currentToken.accumulateAttributeValue("\uFFFD")
elif data is EOF:
self.tokenQueue.append(ParseError("eof-in-attribute-value-no-quotes"))
self.state = self.dataState
else:
self.currentToken.data[-1][1] += data + self.stream.charsUntil(
frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
self.currentToken.accumulateAttributeValue(data + self.stream.charsUntil(
frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters))
return True

def afterAttributeValueState(self):
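A recurring pattern in these value states is consuming one character via the state dispatch and then grabbing the rest of the run in bulk with charsUntil. A toy sketch of that batching idea; MiniStream is an assumption standing in for html5lib's HTMLInputStream, not its real API surface:

class MiniStream:
    # Toy stand-in for HTMLInputStream (illustration only).
    def __init__(self, text):
        self.text, self.pos = text, 0

    def char(self):
        ch = self.text[self.pos]
        self.pos += 1
        return ch

    def charsUntil(self, stop):
        start = self.pos
        while self.pos < len(self.text) and self.text[self.pos] not in stop:
            self.pos += 1
        return self.text[start:self.pos]

stream = MiniStream('hidden" class=x>')
data = stream.char()  # the character the state machine dispatched on
value = data + stream.charsUntil(frozenset('"&\u0000'))
assert value == "hidden"  # the closing quote is left for the state to see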
38 changes: 16 additions & 22 deletions html5lib/html5parser.py
@@ -5,6 +5,7 @@

from . import _inputstream
from ._tokenizer import (
attributeMap,
HTMLTokenizer,
Characters,
SpaceCharacters,
@@ -471,7 +472,7 @@ def startTagHtml(self, token):
self.parser.parseError("non-html-root")
# XXX Need a check here to see if the first start tag token emitted is
# this token... If it's not, invoke self.parser.parseError().
for attr, value in token.data.items():
for attr, value in token.attributes.items():
if attr not in self.tree.openElements[0].attributes:
self.tree.openElements[0].attributes[attr] = value
self.parser.firstStartTag = False
@@ -733,7 +734,7 @@ def startTagMeta(self, token):
self.tree.openElements.pop()
token.self_closing_acknowledged = True

attributes = token.data
attributes = token.attributes
if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
if "charset" in attributes:
self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
@@ -1018,7 +1019,7 @@ def startTagBody(self, token):
assert self.parser.innerHTML
else:
self.parser.framesetOK = False
for attr, value in token.data.items():
for attr, value in token.attributes.items():
if attr not in self.tree.openElements[1].attributes:
self.tree.openElements[1].attributes[attr] = value

@@ -1162,8 +1163,8 @@ def startTagVoidFormatting(self, token):
def startTagInput(self, token):
framesetOK = self.parser.framesetOK
self.startTagVoidFormatting(token)
if ("type" in token.data and
token.data["type"].translate(asciiUpper2Lower) == "hidden"):
token_type = token.attributes.get('type', '')
if token_type.translate(asciiUpper2Lower) == "hidden":
# input type=hidden doesn't change framesetOK
self.parser.framesetOK = framesetOK
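The .get-with-default rewrite above collapses the old two-step membership test into a single lookup. A small stand-alone sketch of the pattern, with asciiUpper2Lower stubbed as an assumption since it lives in html5lib's constants:

asciiUpper2Lower = {ord(c): ord(c.lower())
                    for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"}  # stand-in

def is_hidden_input(attributes):
    # A missing "type" yields "", which can never equal "hidden",
    # so no separate "in" check is needed.
    return attributes.get("type", "").translate(asciiUpper2Lower) == "hidden"

assert is_hidden_input({"type": "HIDDEN"})
assert not is_hidden_input({"type": "text"})
assert not is_hidden_input({})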

@@ -1184,28 +1185,23 @@ def startTagImage(self, token):
# No really...
self.parser.parseError("unexpected-start-tag-treated-as",
{"originalName": "image", "newName": "img"})
self.processStartTag(impliedTagToken("img", StartTag,
attributes=token.data,
selfClosing=token.self_closing))
self.processStartTag(impliedTagToken("img", StartTag, attributes=token.attributes))

def startTagIsIndex(self, token):
self.parser.parseError("deprecated-tag", {"name": "isindex"})
if self.tree.formPointer:
return
form_attrs = {}
if "action" in token.data:
form_attrs["action"] = token.data["action"]
if "action" in token.attributes:
form_attrs["action"] = token.attributes["action"]
self.processStartTag(impliedTagToken("form", StartTag,
attributes=form_attrs))
self.processStartTag(impliedTagToken("hr", StartTag))
self.processStartTag(impliedTagToken("label", StartTag))
# XXX Localization ...
if "prompt" in token.data:
prompt = token.data["prompt"]
else:
prompt = "This is a searchable index. Enter search keywords: "
prompt = token.attributes.get("prompt", "This is a searchable index. Enter search keywords: ")
self.processCharacters(Characters(prompt))
attributes = token.data.copy()
attributes = token.attributes.copy()
if "action" in attributes:
del attributes["action"]
if "prompt" in attributes:
@@ -1767,8 +1763,8 @@ def startTagStyleScript(self, token):
return self.parser.phases["inHead"].processStartTag(token)

def startTagInput(self, token):
if ("type" in token.data and
token.data["type"].translate(asciiUpper2Lower) == "hidden"):
token_type = token.attributes.get('type', '')
if token_type.translate(asciiUpper2Lower) == "hidden":
self.parser.parseError("unexpected-hidden-input-in-table")
self.tree.insertElement(token)
# XXX associate with form
@@ -2483,7 +2479,7 @@ def processStartTag(self, token):
currentNode = self.tree.openElements[-1]
if (token.name in self.breakoutElements or
(token.name == "font" and
set(token.data.keys()) & {"color", "face", "size"})):
set(token.attributes.keys()) & {"color", "face", "size"})):
self.parser.parseError("unexpected-html-element-in-foreign-content",
{"name": token.name})
while (self.tree.openElements[-1].namespace !=
@@ -2773,10 +2769,8 @@ def processEndTag(self, token):


def adjust_attributes(token, replacements):
needs_adjustment = viewkeys(token.data) & viewkeys(replacements)
if needs_adjustment:
token.data = type(token.data)((replacements.get(k, k), v)
for k, v in token.data.items())
if viewkeys(token.attributes) & viewkeys(replacements):
token.attributes = attributeMap((replacements.get(k, k), v) for k, v in token.attributes.items())
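For context, a simplified sketch of what adjust_attributes does with the foreign-content tables. The real code uses viewkeys for Python 2 compatibility and attributeMap rather than OrderedDict; the definitionurl -> definitionURL renaming is taken from html5lib's MathML adjustment table:

from collections import OrderedDict

def adjust_attributes_sketch(attributes, replacements):
    # Rebuild the mapping only when some key actually needs renaming,
    # preserving the original attribute order.
    if attributes.keys() & replacements.keys():
        return OrderedDict(
            (replacements.get(k, k), v) for k, v in attributes.items())
    return attributes

attrs = OrderedDict([("definitionurl", "x"), ("id", "y")])
adjusted = adjust_attributes_sketch(attrs, {"definitionurl": "definitionURL"})
assert list(adjusted) == ["definitionURL", "id"]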


def impliedTagToken(name, type=EndTag, attributes=None,
6 changes: 3 additions & 3 deletions html5lib/tests/test_tokenizer2.py
@@ -24,7 +24,7 @@ def test_maintain_attribute_order():
assert len(out) == 1
assert isinstance(out[0], StartTag)

attrs_tok = out[0].data
attrs_tok = out[0].attributes
assert len(attrs_tok) == len(attrs)

for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):
@@ -41,7 +41,7 @@ def test_duplicate_attribute():
assert len(out) == 1
assert isinstance(out[0], StartTag)

attrs_tok = out[0].data
attrs_tok = out[0].attributes
assert len(attrs_tok) == 1
assert list(attrs_tok.items()) == [('a', '1')]

@@ -57,7 +57,7 @@ def test_maintain_duplicate_attribute_order():
assert len(out) == 1
assert isinstance(out[0], StartTag)

attrs_tok = out[0].data
attrs_tok = out[0].attributes
assert len(attrs_tok) == len(attrs)

for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):
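Read together, these tests double as usage documentation for the new token objects. A condensed sketch in the same spirit, assuming a helper that filters ParseError tokens the way the test module's helpers do:

from html5lib._tokenizer import HTMLTokenizer, ParseError, StartTag

def tokenize(data):
    # Drop parse errors, mirroring the filtering step the tests rely on.
    return [t for t in HTMLTokenizer(data) if not isinstance(t, ParseError)]

out = tokenize('<span a="1" b="2" c="3">')
assert isinstance(out[0], StartTag)
# attributeMap preserves the source order of the attributes.
assert list(out[0].attributes.items()) == [("a", "1"), ("b", "2"), ("c", "3")]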