
Tokenizer: use Python objects to represent tokens #521


Closed · wants to merge 19 commits
Changes from 1 commit
Commits (19)
183d8a0
Consistency: consume a single character at a time during attribute na…
jayaddison Dec 29, 2020
2e86373
Refactor: pretranslate lowercase element and attribute names
jayaddison Dec 29, 2020
8f96b17
Restore self.currentToken safety check
jayaddison Dec 29, 2020
a912842
Alternate approach: do not pretranslate temporary buffered data
jayaddison Dec 30, 2020
f9f370e
Consistency: character consumption within double-escaped state
jayaddison Dec 30, 2020
bcee8bd
Refactor: use Python objects for tokens within tokenizer
jayaddison Dec 29, 2020
67262f8
Introduce type hierarchy for tag-related tokens
jayaddison Dec 29, 2020
900bdaf
Simplify tag token construction
jayaddison Dec 29, 2020
1f6cae9
Refactor token attribution name/value accumulation
jayaddison Dec 29, 2020
695ac1c
Cleanup: remove leavingThisState / emitToken logic
jayaddison Dec 29, 2020
b1a444b
Remove EmptyTag tokenizer token class
jayaddison Dec 29, 2020
bb7fabc
Refactor: pre-translate strings that are only used in lowercase context
jayaddison Dec 29, 2020
5f4ace9
Cleanup: remove getattr anti-pattern
jayaddison Dec 29, 2020
d744c86
Consistency: use camel-casing to correspond with existing codebase style
jayaddison Dec 29, 2020
1d62e69
Consistency: consume a single character at a time during attribute na…
jayaddison Dec 29, 2020
8772408
Merge branch 'tokenizer/pretranslate-lowercase-names' into tokenizer/…
jayaddison Dec 30, 2020
192cce0
Linting cleanup
jayaddison Dec 30, 2020
e76e0dd
Clarify method name: clearAttribute -> flushAttribute
jayaddison Jan 4, 2021
da37332
Merge branch 'master' into tokenizer/object-tokens
jayaddison Sep 20, 2021
Refactor token attribution name/value accumulation
jayaddison committed Dec 30, 2020
commit 1f6cae93c773266a90940a3228b6b89b7288d11d
97 changes: 52 additions & 45 deletions html5lib/_tokenizer.py
@@ -43,10 +43,24 @@ class SpaceCharacters(Token):


class Tag(Token):
def __init__(self, name, data):
def __init__(self, name, attributes):
self.name = name
self.data = data or []
self.attributes = attributeMap(attributes or {})
self.self_closing = False
self.attribute_name = ""
self.attribute_value = ""

def clearAttribute(self):
if self.attribute_name and self.attribute_name not in self.attributes:
self.attributes[self.attribute_name] = self.attribute_value
self.attribute_name = ""
self.attribute_value = ""

def accumulateAttributeName(self, text):
self.attribute_name += text.translate(asciiUpper2Lower)

def accumulateAttributeValue(self, text):
self.attribute_value += text

class StartTag(Tag):
def __init__(self, name, data=None):
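To make the new accumulation API concrete, here is a minimal, self-contained sketch that drives a Tag-like object by hand, the way the state-machine methods below do. The attributeMap and asciiUpper2Lower stand-ins are assumptions for illustration only; in html5lib they come from the library's constants, and attributeMap is an order-preserving mapping.

from collections import OrderedDict

attributeMap = OrderedDict  # stand-in: html5lib chooses an ordered mapping
asciiUpper2Lower = {ord(c): ord(c.lower())
                    for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"}  # stand-in

class Tag:
    def __init__(self, name, attributes=None):
        self.name = name
        self.attributes = attributeMap(attributes or {})
        self.self_closing = False
        self.attribute_name = ""
        self.attribute_value = ""

    def clearAttribute(self):
        # Flush the pending name/value pair; the first occurrence of a
        # name wins, so a later duplicate is silently dropped here.
        if self.attribute_name and self.attribute_name not in self.attributes:
            self.attributes[self.attribute_name] = self.attribute_value
        self.attribute_name = ""
        self.attribute_value = ""

    def accumulateAttributeName(self, text):
        self.attribute_name += text.translate(asciiUpper2Lower)

    def accumulateAttributeValue(self, text):
        self.attribute_value += text

# Simulate tokenizing: <input TYPE=text type=hidden>
tag = Tag("input")
tag.accumulateAttributeName("TYPE")    # lowercased on the way in
tag.accumulateAttributeValue("text")
tag.clearAttribute()                   # flushed into tag.attributes
tag.accumulateAttributeName("type")    # duplicate name
tag.accumulateAttributeValue("hidden")
tag.clearAttribute()                   # dropped: "type" already present
assert dict(tag.attributes) == {"type": "text"}

Accumulating into plain strings and flushing on clearAttribute() is what lets the states below stop indexing into token.data[-1].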
@@ -248,7 +262,7 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False):
output = "&" + "".join(charStack)

if fromAttribute:
self.currentToken.data[-1][1] += output
self.currentToken.accumulateAttributeValue(output)
else:
if output in spaceCharacters:
token = SpaceCharacters(output)
@@ -270,17 +284,9 @@ def emitCurrentToken(self):
# Add token to the queue to be yielded
if isinstance(token, Tag):
token.name = token.name.translate(asciiUpper2Lower)
if isinstance(token, StartTag):
raw = token.data
data = attributeMap(raw)
if len(raw) > len(data):
# we had some duplicated attribute, fix so first wins
was = dict(data)
data.update(raw[::-1])
token.data = data

token.clearAttribute()
if isinstance(token, EndTag):
if token.data:
if token.attributes:
self.tokenQueue.append(ParseError("attributes-in-end-tag"))
if token.self_closing:
self.tokenQueue.append(ParseError("self-closing-flag-on-end-tag"))
@@ -820,25 +826,29 @@ def beforeAttributeNameState(self):
if data in spaceCharacters:
self.stream.charsUntil(spaceCharacters, True)
elif data in asciiLetters:
self.currentToken.data.append([data, ""])
self.currentToken.clearAttribute()
self.currentToken.accumulateAttributeName(data)
self.state = self.attributeNameState
elif data == ">":
self.emitCurrentToken()
elif data == "/":
self.state = self.selfClosingStartTagState
elif data in ("'", '"', "=", "<"):
self.tokenQueue.append(ParseError("invalid-character-in-attribute-name"))
self.currentToken.data.append([data, ""])
self.currentToken.clearAttribute()
self.currentToken.accumulateAttributeName(data)
self.state = self.attributeNameState
elif data == "\u0000":
self.tokenQueue.append(ParseError("invalid-codepoint"))
self.currentToken.data.append(["\uFFFD", ""])
self.currentToken.clearAttribute()
self.currentToken.accumulateAttributeName("\uFFFD")
self.state = self.attributeNameState
elif data is EOF:
self.tokenQueue.append(ParseError("expected-attribute-name-but-got-eof"))
self.state = self.dataState
else:
self.currentToken.data.append([data, ""])
self.currentToken.clearAttribute()
self.currentToken.accumulateAttributeName(data)
self.state = self.attributeNameState
return True

@@ -849,8 +859,7 @@ def attributeNameState(self):
if data == "=":
self.state = self.beforeAttributeValueState
elif data in asciiLetters:
self.currentToken.data[-1][0] += data +\
self.stream.charsUntil(asciiLetters, True)
self.currentToken.accumulateAttributeName(data + self.stream.charsUntil(asciiLetters, True))
leavingThisState = False
elif data == ">":
# XXX If we emit here the attributes are converted to a dict
@@ -863,29 +872,25 @@
self.state = self.selfClosingStartTagState
elif data == "\u0000":
self.tokenQueue.append(ParseError("invalid-codepoint"))
self.currentToken.data[-1][0] += "\uFFFD"
self.currentToken.accumulateAttributeName("\uFFFD")
leavingThisState = False
elif data in ("'", '"', "<"):
self.tokenQueue.append(ParseError("invalid-character-in-attribute-name"))
self.currentToken.data[-1][0] += data
self.currentToken.accumulateAttributeName(data)
leavingThisState = False
elif data is EOF:
self.tokenQueue.append(ParseError("eof-in-attribute-name"))
self.state = self.dataState
else:
self.currentToken.data[-1][0] += data
self.currentToken.accumulateAttributeName(data)
leavingThisState = False

if leavingThisState:
# Attributes are not dropped at this stage. That happens when the
# start tag token is emitted so values can still be safely appended
# to attributes, but we do want to report the parse error in time.
self.currentToken.data[-1][0] = (
self.currentToken.data[-1][0].translate(asciiUpper2Lower))
for name, _ in self.currentToken.data[:-1]:
if self.currentToken.data[-1][0] == name:
self.tokenQueue.append(ParseError("duplicate-attribute"))
break
if self.currentToken.attribute_name in self.currentToken.attributes:
self.tokenQueue.append(ParseError("duplicate-attribute"))
# XXX Fix for above XXX
if emitToken:
self.emitCurrentToken()
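With the pending name held on the token, the duplicate check above reduces to a single membership test against the already-flushed attributes. A hedged end-to-end sketch of the observable behaviour, assuming ParseError and StartTag are importable from html5lib._tokenizer as they are elsewhere on this branch:

from html5lib._tokenizer import HTMLTokenizer, ParseError, StartTag

tokens = list(HTMLTokenizer("<span id=first id=second>"))
# The duplicate is reported as a parse error when the second name completes...
assert any(isinstance(t, ParseError) for t in tokens)
# ...while the emitted tag keeps the first occurrence (first wins).
start = next(t for t in tokens if isinstance(t, StartTag))
assert dict(start.attributes) == {"id": "first"}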
@@ -900,23 +905,27 @@ def afterAttributeNameState(self):
elif data == ">":
self.emitCurrentToken()
elif data in asciiLetters:
self.currentToken.data.append([data, ""])
self.currentToken.clearAttribute()
self.currentToken.accumulateAttributeName(data)
self.state = self.attributeNameState
elif data == "/":
self.state = self.selfClosingStartTagState
elif data == "\u0000":
self.tokenQueue.append(ParseError("invalid-codepoint"))
self.currentToken.data.append(["\uFFFD", ""])
self.currentToken.clearAttribute()
self.currentToken.accumulateAttributeName("\uFFFD")
self.state = self.attributeNameState
elif data in ("'", '"', "<"):
self.tokenQueue.append(ParseError("invalid-character-after-attribute-name"))
self.currentToken.data.append([data, ""])
self.currentToken.clearAttribute()
self.currentToken.accumulateAttributeName(data)
self.state = self.attributeNameState
elif data is EOF:
self.tokenQueue.append(ParseError("expected-end-of-tag-but-got-eof"))
self.state = self.dataState
else:
self.currentToken.data.append([data, ""])
self.currentToken.clearAttribute()
self.currentToken.accumulateAttributeName(data)
self.state = self.attributeNameState
return True

@@ -936,17 +945,17 @@ def beforeAttributeValueState(self):
self.emitCurrentToken()
elif data == "\u0000":
self.tokenQueue.append(ParseError("invalid-codepoint"))
self.currentToken.data[-1][1] += "\uFFFD"
self.currentToken.accumulateAttributeValue("\uFFFD")
self.state = self.attributeValueUnQuotedState
elif data in ("=", "<", "`"):
self.tokenQueue.append(ParseError("equals-in-unquoted-attribute-value"))
self.currentToken.data[-1][1] += data
self.currentToken.accumulateAttributeValue(data)
self.state = self.attributeValueUnQuotedState
elif data is EOF:
self.tokenQueue.append(ParseError("expected-attribute-value-but-got-eof"))
self.state = self.dataState
else:
self.currentToken.data[-1][1] += data
self.currentToken.accumulateAttributeValue(data)
self.state = self.attributeValueUnQuotedState
return True

@@ -958,13 +967,12 @@ def attributeValueDoubleQuotedState(self):
self.processEntityInAttribute('"')
elif data == "\u0000":
self.tokenQueue.append(ParseError("invalid-codepoint"))
self.currentToken.data[-1][1] += "\uFFFD"
self.currentToken.accumulateAttributeValue("\uFFFD")
elif data is EOF:
self.tokenQueue.append(ParseError("eof-in-attribute-value-double-quote"))
self.state = self.dataState
else:
self.currentToken.data[-1][1] += data +\
self.stream.charsUntil(("\"", "&", "\u0000"))
self.currentToken.accumulateAttributeValue(data + self.stream.charsUntil(("\"", "&", "\u0000")))
return True

def attributeValueSingleQuotedState(self):
@@ -975,13 +983,12 @@ def attributeValueSingleQuotedState(self):
self.processEntityInAttribute("'")
elif data == "\u0000":
self.tokenQueue.append(ParseError("invalid-codepoint"))
self.currentToken.data[-1][1] += "\uFFFD"
self.currentToken.accumulateAttributeValue("\uFFFD")
elif data is EOF:
self.tokenQueue.append(ParseError("eof-in-attribute-value-single-quote"))
self.state = self.dataState
else:
self.currentToken.data[-1][1] += data +\
self.stream.charsUntil(("'", "&", "\u0000"))
self.currentToken.accumulateAttributeValue(data + self.stream.charsUntil(("'", "&", "\u0000")))
return True

def attributeValueUnQuotedState(self):
@@ -994,16 +1001,16 @@ def attributeValueUnQuotedState(self):
self.emitCurrentToken()
elif data in ('"', "'", "=", "<", "`"):
self.tokenQueue.append(ParseError("unexpected-character-in-unquoted-attribute-value"))
self.currentToken.data[-1][1] += data
self.currentToken.accumulateAttributeValue(data)
elif data == "\u0000":
self.tokenQueue.append(ParseError("invalid-codepoint"))
self.currentToken.data[-1][1] += "\uFFFD"
self.currentToken.accumulateAttributeValue("\uFFFD")
elif data is EOF:
self.tokenQueue.append(ParseError("eof-in-attribute-value-no-quotes"))
self.state = self.dataState
else:
self.currentToken.data[-1][1] += data + self.stream.charsUntil(
frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
self.currentToken.accumulateAttributeValue(data + self.stream.charsUntil(
frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters))
return True

def afterAttributeValueState(self):
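A recurring pattern in these value states is consuming one character via the state dispatch and then grabbing the rest of the run in bulk with charsUntil. A toy sketch of that batching idea; MiniStream is an assumption standing in for html5lib's HTMLInputStream, not its real API surface:

class MiniStream:
    # Toy stand-in for HTMLInputStream (illustration only).
    def __init__(self, text):
        self.text, self.pos = text, 0

    def char(self):
        ch = self.text[self.pos]
        self.pos += 1
        return ch

    def charsUntil(self, stop):
        start = self.pos
        while self.pos < len(self.text) and self.text[self.pos] not in stop:
            self.pos += 1
        return self.text[start:self.pos]

stream = MiniStream('hidden" class=x>')
data = stream.char()  # the character the state machine dispatched on
value = data + stream.charsUntil(frozenset('"&\u0000'))
assert value == "hidden"  # the closing quote is left for the state to see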
38 changes: 16 additions & 22 deletions html5lib/html5parser.py
@@ -5,6 +5,7 @@

from . import _inputstream
from ._tokenizer import (
attributeMap,
HTMLTokenizer,
Characters,
SpaceCharacters,
@@ -471,7 +472,7 @@ def startTagHtml(self, token):
self.parser.parseError("non-html-root")
# XXX Need a check here to see if the first start tag token emitted is
# this token... If it's not, invoke self.parser.parseError().
for attr, value in token.data.items():
for attr, value in token.attributes.items():
if attr not in self.tree.openElements[0].attributes:
self.tree.openElements[0].attributes[attr] = value
self.parser.firstStartTag = False
@@ -733,7 +734,7 @@ def startTagMeta(self, token):
self.tree.openElements.pop()
token.self_closing_acknowledged = True

attributes = token.data
attributes = token.attributes
if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
if "charset" in attributes:
self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
@@ -1018,7 +1019,7 @@ def startTagBody(self, token):
assert self.parser.innerHTML
else:
self.parser.framesetOK = False
for attr, value in token.data.items():
for attr, value in token.attributes.items():
if attr not in self.tree.openElements[1].attributes:
self.tree.openElements[1].attributes[attr] = value

@@ -1162,8 +1163,8 @@ def startTagVoidFormatting(self, token):
def startTagInput(self, token):
framesetOK = self.parser.framesetOK
self.startTagVoidFormatting(token)
if ("type" in token.data and
token.data["type"].translate(asciiUpper2Lower) == "hidden"):
token_type = token.attributes.get('type', '')
if token_type.translate(asciiUpper2Lower) == "hidden":
# input type=hidden doesn't change framesetOK
self.parser.framesetOK = framesetOK
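The .get-with-default rewrite above collapses the old two-step membership test into a single lookup. A small stand-alone sketch of the pattern, with asciiUpper2Lower stubbed as an assumption since it lives in html5lib's constants:

asciiUpper2Lower = {ord(c): ord(c.lower())
                    for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"}  # stand-in

def is_hidden_input(attributes):
    # A missing "type" yields "", which can never equal "hidden",
    # so no separate "in" check is needed.
    return attributes.get("type", "").translate(asciiUpper2Lower) == "hidden"

assert is_hidden_input({"type": "HIDDEN"})
assert not is_hidden_input({"type": "text"})
assert not is_hidden_input({})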

@@ -1184,28 +1185,23 @@ def startTagImage(self, token):
# No really...
self.parser.parseError("unexpected-start-tag-treated-as",
{"originalName": "image", "newName": "img"})
self.processStartTag(impliedTagToken("img", StartTag,
attributes=token.data,
selfClosing=token.self_closing))
self.processStartTag(impliedTagToken("img", StartTag, attributes=token.attributes))

def startTagIsIndex(self, token):
self.parser.parseError("deprecated-tag", {"name": "isindex"})
if self.tree.formPointer:
return
form_attrs = {}
if "action" in token.data:
form_attrs["action"] = token.data["action"]
if "action" in token.attributes:
form_attrs["action"] = token.attributes["action"]
self.processStartTag(impliedTagToken("form", StartTag,
attributes=form_attrs))
self.processStartTag(impliedTagToken("hr", StartTag))
self.processStartTag(impliedTagToken("label", StartTag))
# XXX Localization ...
if "prompt" in token.data:
prompt = token.data["prompt"]
else:
prompt = "This is a searchable index. Enter search keywords: "
prompt = token.attributes.get("prompt", "This is a searchable index. Enter search keywords: ")
self.processCharacters(Characters(prompt))
attributes = token.data.copy()
attributes = token.attributes.copy()
if "action" in attributes:
del attributes["action"]
if "prompt" in attributes:
@@ -1767,8 +1763,8 @@ def startTagStyleScript(self, token):
return self.parser.phases["inHead"].processStartTag(token)

def startTagInput(self, token):
if ("type" in token.data and
token.data["type"].translate(asciiUpper2Lower) == "hidden"):
token_type = token.attributes.get('type', '')
if token_type.translate(asciiUpper2Lower) == "hidden":
self.parser.parseError("unexpected-hidden-input-in-table")
self.tree.insertElement(token)
# XXX associate with form
@@ -2483,7 +2479,7 @@ def processStartTag(self, token):
currentNode = self.tree.openElements[-1]
if (token.name in self.breakoutElements or
(token.name == "font" and
set(token.data.keys()) & {"color", "face", "size"})):
set(token.attributes.keys()) & {"color", "face", "size"})):
self.parser.parseError("unexpected-html-element-in-foreign-content",
{"name": token.name})
while (self.tree.openElements[-1].namespace !=
@@ -2773,10 +2769,8 @@ def processEndTag(self, token):


def adjust_attributes(token, replacements):
needs_adjustment = viewkeys(token.data) & viewkeys(replacements)
if needs_adjustment:
token.data = type(token.data)((replacements.get(k, k), v)
for k, v in token.data.items())
if viewkeys(token.attributes) & viewkeys(replacements):
token.attributes = attributeMap((replacements.get(k, k), v) for k, v in token.attributes.items())
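For context, a simplified sketch of what adjust_attributes does with the foreign-content tables. The real code uses viewkeys for Python 2 compatibility and attributeMap rather than OrderedDict; the definitionurl -> definitionURL renaming is taken from html5lib's MathML adjustment table:

from collections import OrderedDict

def adjust_attributes_sketch(attributes, replacements):
    # Rebuild the mapping only when some key actually needs renaming,
    # preserving the original attribute order.
    if attributes.keys() & replacements.keys():
        return OrderedDict(
            (replacements.get(k, k), v) for k, v in attributes.items())
    return attributes

attrs = OrderedDict([("definitionurl", "x"), ("id", "y")])
adjusted = adjust_attributes_sketch(attrs, {"definitionurl": "definitionURL"})
assert list(adjusted) == ["definitionURL", "id"]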


def impliedTagToken(name, type=EndTag, attributes=None,
6 changes: 3 additions & 3 deletions html5lib/tests/test_tokenizer2.py
@@ -24,7 +24,7 @@ def test_maintain_attribute_order():
assert len(out) == 1
assert isinstance(out[0], StartTag)

attrs_tok = out[0].data
attrs_tok = out[0].attributes
assert len(attrs_tok) == len(attrs)

for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):
@@ -41,7 +41,7 @@ def test_duplicate_attribute():
assert len(out) == 1
assert isinstance(out[0], StartTag)

attrs_tok = out[0].data
attrs_tok = out[0].attributes
assert len(attrs_tok) == 1
assert list(attrs_tok.items()) == [('a', '1')]

@@ -57,7 +57,7 @@ def test_maintain_duplicate_attribute_order():
assert len(out) == 1
assert isinstance(out[0], StartTag)

attrs_tok = out[0].data
attrs_tok = out[0].attributes
assert len(attrs_tok) == len(attrs)

for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):
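Read together, these tests double as usage documentation for the new token objects. A condensed sketch in the same spirit, assuming a helper that filters ParseError tokens the way the test module's helpers do:

from html5lib._tokenizer import HTMLTokenizer, ParseError, StartTag

def tokenize(data):
    # Drop parse errors, mirroring the filtering step the tests rely on.
    return [t for t in HTMLTokenizer(data) if not isinstance(t, ParseError)]

out = tokenize('<span a="1" b="2" c="3">')
assert isinstance(out[0], StartTag)
# attributeMap preserves the source order of the attributes.
assert list(out[0].attributes.items()) == [("a", "1"), ("b", "2"), ("c", "3")]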