Tokenizer: use Python objects to represent tokens #521

Closed · jayaddison wants to merge 19 commits

Changes from 1 commit

Commits (19):
183d8a0  Consistency: consume a single character at a time during attribute na… (jayaddison, Dec 29, 2020)
2e86373  Refactor: pretranslate lowercase element and attribute names (jayaddison, Dec 29, 2020)
8f96b17  Restore self.currentToken safety check (jayaddison, Dec 29, 2020)
a912842  Alternate approach: do not pretranslate temporary buffered data (jayaddison, Dec 30, 2020)
f9f370e  Consistency: character consumption within double-escaped state (jayaddison, Dec 30, 2020)
bcee8bd  Refactor: use Python objects for tokens within tokenizer (jayaddison, Dec 29, 2020)
67262f8  Introduce type hierarchy for tag-related tokens (jayaddison, Dec 29, 2020)
900bdaf  Simplify tag token construction (jayaddison, Dec 29, 2020)
1f6cae9  Refactor token attribute name/value accumulation (jayaddison, Dec 29, 2020)
695ac1c  Cleanup: remove leavingThisState / emitToken logic (jayaddison, Dec 29, 2020)
b1a444b  Remove EmptyTag tokenizer token class (jayaddison, Dec 29, 2020)
bb7fabc  Refactor: pre-translate strings that are only used in lowercase context (jayaddison, Dec 29, 2020)
5f4ace9  Cleanup: remove getattr anti-pattern (jayaddison, Dec 29, 2020)
d744c86  Consistency: use camel-casing to correspond with existing codebase style (jayaddison, Dec 29, 2020)
1d62e69  Consistency: consume a single character at a time during attribute na… (jayaddison, Dec 29, 2020)
8772408  Merge branch 'tokenizer/pretranslate-lowercase-names' into tokenizer/… (jayaddison, Dec 30, 2020)
192cce0  Linting cleanup (jayaddison, Dec 30, 2020)
e76e0dd  Clarify method name: clearAttribute -> flushAttribute (jayaddison, Jan 4, 2021)
da37332  Merge branch 'master' into tokenizer/object-tokens (jayaddison, Sep 20, 2021)
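
For orientation, here is the rough shape of the token hierarchy these commits introduce, sketched from the class names and fields visible in the diff below (Token, Doctype, Characters, SpaceCharacters, Tag, EndTag, ParseError). This is a reconstruction, not the PR's exact code; in particular, the real Tag uses an internal attributeMap type, and StartTag is assumed rather than shown in this commit's diff:

```python
import string

# ASCII-only case-folding table; html5lib.constants provides the real
# asciiUpper2Lower. Redefined here so the sketch stays self-contained.
asciiUpper2Lower = {ord(c): ord(c.lower()) for c in string.ascii_uppercase}


class Token(object):
    def __init__(self, data=None):
        self.data = data


class Characters(Token):
    pass


class SpaceCharacters(Token):
    pass


class ParseError(Token):
    pass


class Doctype(Token):
    def __init__(self, name, public_id, system_id, correct):
        self.name = name.translate(asciiUpper2Lower)  # pre-lowercased as of bb7fabc
        self.public_id = public_id
        self.system_id = system_id
        self.correct = correct


class Tag(Token):
    def __init__(self, name, attributes=None):
        self.name = name.translate(asciiUpper2Lower)  # pre-lowercased as of bb7fabc
        self.attributes = dict(attributes or {})  # the PR uses an attributeMap type here
        self.self_closing = False
        self.attribute_name = ""  # accumulator for the attribute under construction


class StartTag(Tag):  # assumed counterpart to EndTag; not visible in this commit's diff
    pass


class EndTag(Tag):
    pass
```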
Refactor: pre-translate strings that are only used in lowercase context
jayaddison committed Dec 30, 2020
commit bb7fabc2eae796de21760b04d2b059009225848e
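
In outline: earlier commits lowercased tag names once, at emit time, inside emitCurrentToken (token.name.translate(asciiUpper2Lower), removed below); this commit moves the translation to the point where each character is consumed, so buffered names and the temporaryBuffer are always already lowercase and the .lower() calls in the "appropriate end tag" checks can be dropped. A minimal before/after sketch of the idea (asciiUpper2Lower is redefined locally to keep the sketch self-contained; html5lib's own table lives in html5lib.constants):

```python
import string

# ASCII-only case-folding table, equivalent to html5lib's asciiUpper2Lower.
asciiUpper2Lower = {ord(c): ord(c.lower()) for c in string.ascii_uppercase}

# Before this commit: accumulate raw characters, lowercase once at emit time.
name = ""
for ch in "DiV":
    name += ch
assert name.translate(asciiUpper2Lower) == "div"

# After this commit: translate each character as it is consumed, so buffered
# names (and temporaryBuffer) are always lowercase, and "appropriate end tag"
# checks can compare directly, without .lower() on either side.
name = ""
for ch in "DiV":
    name += ch.translate(asciiUpper2Lower)
assert name == "div"
```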
html5lib/_tokenizer.py (46 changes: 21 additions & 25 deletions)
@@ -30,7 +30,7 @@ def __init__(self, data=None):
 
 class Doctype(Token):
     def __init__(self, name, public_id, system_id, correct):
-        self.name = name
+        self.name = name.translate(asciiUpper2Lower)
         self.public_id = public_id
         self.system_id = system_id
         self.correct = correct
@@ -44,7 +44,7 @@ class SpaceCharacters(Token):
 
 class Tag(Token):
     def __init__(self, name, attributes):
-        self.name = name
+        self.name = name.translate(asciiUpper2Lower)
         self.attributes = attributeMap(attributes or {})
         self.self_closing = False
         self.attribute_name = ""
@@ -278,7 +278,6 @@ def emitCurrentToken(self):
         token = self.currentToken
         # Add token to the queue to be yielded
         if isinstance(token, Tag):
-            token.name = token.name.translate(asciiUpper2Lower)
             if self.currentToken.attribute_name in self.currentToken.attributes:
                 self.tokenQueue.append(ParseError("duplicate-attribute"))
             token.clearAttribute()
@@ -456,7 +455,7 @@ def tagNameState(self):
             self.tokenQueue.append(ParseError("invalid-codepoint"))
             self.currentToken.name += "\uFFFD"
         else:
-            self.currentToken.name += data
+            self.currentToken.name += data.translate(asciiUpper2Lower)
             # (Don't use charsUntil here, because tag names are
             # very short and it's faster to not do anything fancy)
         return True
@@ -475,7 +474,7 @@ def rcdataLessThanSignState(self):
     def rcdataEndTagOpenState(self):
         data = self.stream.char()
         if data in asciiLetters:
-            self.temporaryBuffer += data
+            self.temporaryBuffer += data.translate(asciiUpper2Lower)
             self.state = self.rcdataEndTagNameState
         else:
             self.tokenQueue.append(Characters("</"))
@@ -484,7 +483,7 @@ def rcdataEndTagOpenState(self):
         return True
 
     def rcdataEndTagNameState(self):
-        appropriate = self.currentToken and self.currentToken.name.lower() == self.temporaryBuffer.lower()
+        appropriate = self.currentToken.name == self.temporaryBuffer
         data = self.stream.char()
         if data in spaceCharacters and appropriate:
             self.currentToken = EndTag(name=self.temporaryBuffer)
@@ -497,7 +496,7 @@ def rcdataEndTagNameState(self):
             self.emitCurrentToken()
             self.state = self.dataState
         elif data in asciiLetters:
-            self.temporaryBuffer += data
+            self.temporaryBuffer += data.translate(asciiUpper2Lower)
         else:
             self.tokenQueue.append(Characters("</" + self.temporaryBuffer))
             self.stream.unget(data)
@@ -518,7 +517,7 @@ def rawtextLessThanSignState(self):
     def rawtextEndTagOpenState(self):
         data = self.stream.char()
         if data in asciiLetters:
-            self.temporaryBuffer += data
+            self.temporaryBuffer += data.translate(asciiUpper2Lower)
             self.state = self.rawtextEndTagNameState
         else:
             self.tokenQueue.append(Characters("</"))
@@ -527,7 +526,7 @@ def rawtextEndTagOpenState(self):
         return True
 
     def rawtextEndTagNameState(self):
-        appropriate = self.currentToken and self.currentToken.name.lower() == self.temporaryBuffer.lower()
+        appropriate = self.currentToken.name == self.temporaryBuffer
         data = self.stream.char()
         if data in spaceCharacters and appropriate:
             self.currentToken = EndTag(name=self.temporaryBuffer)
@@ -540,7 +539,7 @@ def rawtextEndTagNameState(self):
             self.emitCurrentToken()
             self.state = self.dataState
         elif data in asciiLetters:
-            self.temporaryBuffer += data
+            self.temporaryBuffer += data.translate(asciiUpper2Lower)
         else:
             self.tokenQueue.append(Characters("</" + self.temporaryBuffer))
             self.stream.unget(data)
@@ -564,7 +563,7 @@ def scriptDataLessThanSignState(self):
     def scriptDataEndTagOpenState(self):
         data = self.stream.char()
         if data in asciiLetters:
-            self.temporaryBuffer += data
+            self.temporaryBuffer += data.translate(asciiUpper2Lower)
             self.state = self.scriptDataEndTagNameState
         else:
             self.tokenQueue.append(Characters("</"))
@@ -573,7 +572,7 @@ def scriptDataEndTagOpenState(self):
         return True
 
     def scriptDataEndTagNameState(self):
-        appropriate = self.currentToken and self.currentToken.name.lower() == self.temporaryBuffer.lower()
+        appropriate = self.currentToken.name == self.temporaryBuffer
         data = self.stream.char()
         if data in spaceCharacters and appropriate:
             self.currentToken = EndTag(name=self.temporaryBuffer)
@@ -586,7 +585,7 @@ def scriptDataEndTagNameState(self):
             self.emitCurrentToken()
             self.state = self.dataState
         elif data in asciiLetters:
-            self.temporaryBuffer += data
+            self.temporaryBuffer += data.translate(asciiUpper2Lower)
         else:
             self.tokenQueue.append(Characters("</" + self.temporaryBuffer))
             self.stream.unget(data)
@@ -675,7 +674,7 @@ def scriptDataEscapedLessThanSignState(self):
             self.state = self.scriptDataEscapedEndTagOpenState
         elif data in asciiLetters:
             self.tokenQueue.append(Characters("<" + data))
-            self.temporaryBuffer = data
+            self.temporaryBuffer = data.translate(asciiUpper2Lower)
             self.state = self.scriptDataDoubleEscapeStartState
         else:
             self.tokenQueue.append(Characters("<"))
@@ -686,7 +685,7 @@ def scriptDataEscapedLessThanSignState(self):
     def scriptDataEscapedEndTagOpenState(self):
         data = self.stream.char()
         if data in asciiLetters:
-            self.temporaryBuffer = data
+            self.temporaryBuffer = data.translate(asciiUpper2Lower)
             self.state = self.scriptDataEscapedEndTagNameState
         else:
             self.tokenQueue.append(Characters("</"))
@@ -695,7 +694,7 @@ def scriptDataEscapedEndTagOpenState(self):
         return True
 
     def scriptDataEscapedEndTagNameState(self):
-        appropriate = self.currentToken and self.currentToken.name.lower() == self.temporaryBuffer.lower()
+        appropriate = self.currentToken.name == self.temporaryBuffer
         data = self.stream.char()
         if data in spaceCharacters and appropriate:
             self.currentToken = EndTag(name=self.temporaryBuffer)
@@ -708,7 +707,7 @@ def scriptDataEscapedEndTagNameState(self):
             self.emitCurrentToken()
             self.state = self.dataState
         elif data in asciiLetters:
-            self.temporaryBuffer += data
+            self.temporaryBuffer += data.translate(asciiUpper2Lower)
         else:
             self.tokenQueue.append(Characters("</" + self.temporaryBuffer))
             self.stream.unget(data)
@@ -719,13 +718,13 @@ def scriptDataDoubleEscapeStartState(self):
         data = self.stream.char()
         if data in (spaceCharacters | frozenset(("/", ">"))):
             self.tokenQueue.append(Characters(data))
-            if self.temporaryBuffer.lower() == "script":
+            if self.temporaryBuffer == "script":
                 self.state = self.scriptDataDoubleEscapedState
             else:
                 self.state = self.scriptDataEscapedState
         elif data in asciiLetters:
             self.tokenQueue.append(Characters(data))
-            self.temporaryBuffer += data
+            self.temporaryBuffer += data.translate(asciiUpper2Lower)
         else:
             self.stream.unget(data)
             self.state = self.scriptDataEscapedState
@@ -806,13 +805,13 @@ def scriptDataDoubleEscapeEndState(self):
         data = self.stream.char()
         if data in (spaceCharacters | frozenset(("/", ">"))):
             self.tokenQueue.append(Characters(data))
-            if self.temporaryBuffer.lower() == "script":
+            if self.temporaryBuffer == "script":
                 self.state = self.scriptDataEscapedState
             else:
                 self.state = self.scriptDataDoubleEscapedState
         elif data in asciiLetters:
             self.tokenQueue.append(Characters(data))
-            self.temporaryBuffer += data
+            self.temporaryBuffer += data.translate(asciiUpper2Lower)
         else:
             self.stream.unget(data)
             self.state = self.scriptDataDoubleEscapedState
@@ -1240,10 +1239,8 @@ def beforeDoctypeNameState(self):
     def doctypeNameState(self):
         data = self.stream.char()
         if data in spaceCharacters:
-            self.currentToken.name = self.currentToken.name.translate(asciiUpper2Lower)
             self.state = self.afterDoctypeNameState
         elif data == ">":
-            self.currentToken.name = self.currentToken.name.translate(asciiUpper2Lower)
             self.tokenQueue.append(self.currentToken)
             self.state = self.dataState
         elif data == "\u0000":
@@ -1253,11 +1250,10 @@ def doctypeNameState(self):
         elif data is EOF:
             self.tokenQueue.append(ParseError("eof-in-doctype-name"))
             self.currentToken.correct = False
-            self.currentToken.name = self.currentToken.name.translate(asciiUpper2Lower)
             self.tokenQueue.append(self.currentToken)
             self.state = self.dataState
         else:
-            self.currentToken.name += data
+            self.currentToken.name += data.translate(asciiUpper2Lower)
         return True
 
     def afterDoctypeNameState(self):
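
A likely reason the tokenizer folds case through asciiUpper2Lower rather than calling str.lower() (the PR itself does not state this): HTML tag and attribute names are matched ASCII case-insensitively only, while Python's lower() applies full Unicode case mapping and can rewrite non-ASCII characters. A small demonstration, again with a locally defined table:

```python
import string

asciiUpper2Lower = {ord(c): ord(c.lower()) for c in string.ascii_uppercase}

print("SCRIPT".translate(asciiUpper2Lower))  # script

# Full Unicode lowercasing can rewrite non-ASCII input: U+0130 ('İ') lowers
# to 'i' followed by a combining dot above, i.e. two code points...
print(len("İ".lower()))                 # 2
# ...while the ASCII-only table leaves it untouched, matching HTML's
# ASCII case-insensitive rules for tag and attribute names.
print("İ".translate(asciiUpper2Lower))  # İ
```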