Skip to content

Commit e3d6945

Browse files
committed
Find form nodes in invalid html5 documents
lxml fails to parse invalid HTML5 documents. This error was reported in scrapy/loginform#3. Closes scrapy#243. Conflicts: scrapy/http/request/form.py
1 parent a274276 commit e3d6945

File tree

2 files changed

+12
-3
lines changed

2 files changed

+12
-3
lines changed

scrapy/http/request/form.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ def _get_form(response, formname, formnumber):
4949
"""Find the form element """
5050
from scrapy.selector.lxmldocument import LxmlDocument
5151
root = LxmlDocument(response, lxml.html.HTMLParser)
52-
if not root.forms:
52+
forms = root.xpath('//form')
53+
if not forms:
5354
raise ValueError("No <form> element found in %s" % response)
5455

5556
if formname is not None:
@@ -61,7 +62,7 @@ def _get_form(response, formname, formnumber):
6162
# or invalid
6263
if formnumber is not None:
6364
try:
64-
form = root.forms[formnumber]
65+
form = forms[formnumber]
6566
except IndexError:
6667
raise IndexError("Form number %d not found in %s" %
6768
(formnumber, response))

scrapy/tests/test_http_request.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def test_ajax_url(self):
116116

117117
def test_copy(self):
118118
"""Test Request copy"""
119-
119+
120120
def somecallback():
121121
pass
122122

@@ -400,6 +400,14 @@ def test_from_response_errors_noform(self):
400400
response = _buildresponse("""<html></html>""")
401401
self.assertRaises(ValueError, self.request_class.from_response, response)
402402

403+
def test_from_response_invalid_html5(self):
404+
response = _buildresponse("""<!DOCTYPE html><body></html><form>"""
405+
"""<input type="text" name="foo" value="xxx">"""
406+
"""</form></body></html>""")
407+
req = self.request_class.from_response(response, formdata={'bar': 'buz'})
408+
fs = _qs(req)
409+
self.assertEqual(fs, {'foo': ['xxx'], 'bar': ['buz']})
410+
403411
def test_from_response_errors_formnumber(self):
404412
response = _buildresponse(
405413
"""<form action="get.php" method="GET">

0 commit comments

Comments
 (0)