Skip to content

Commit ce21884

Browse files
committed
migrating scrapy Selector to use Parsel
1 parent f67a808 commit ce21884

File tree

4 files changed: 15 additions and 156 deletions.

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,3 +7,4 @@ queuelib
77
six>=1.5.2
88
PyDispatcher>=2.0.5
99
service_identity
10+
parsel>=0.9.0

scrapy/selector/unified.py

Lines changed: 12 additions & 155 deletions
Original file line number | Diff line number | Diff line change
@@ -2,36 +2,18 @@
22
XPath selectors based on lxml
33
"""
44

5-
from lxml import etree
6-
import six
7-
8-
from scrapy.utils.misc import extract_regex
95
from scrapy.utils.trackref import object_ref
10-
from scrapy.utils.python import to_bytes, flatten, iflatten
11-
from scrapy.utils.decorators import deprecated
6+
from scrapy.utils.python import to_bytes
127
from scrapy.http import HtmlResponse, XmlResponse
8+
from scrapy.utils.decorators import deprecated
9+
from parsel import Selector as ParselSelector, SelectorList
10+
from parsel.unified import _ctgroup
1311
from .lxmldocument import LxmlDocument
14-
from .csstranslator import ScrapyHTMLTranslator, ScrapyGenericTranslator
1512

1613

1714
__all__ = ['Selector', 'SelectorList']
1815

1916

20-
class SafeXMLParser(etree.XMLParser):
21-
def __init__(self, *args, **kwargs):
22-
kwargs.setdefault('resolve_entities', False)
23-
super(SafeXMLParser, self).__init__(*args, **kwargs)
24-
25-
_ctgroup = {
26-
'html': {'_parser': etree.HTMLParser,
27-
'_csstranslator': ScrapyHTMLTranslator(),
28-
'_tostring_method': 'html'},
29-
'xml': {'_parser': SafeXMLParser,
30-
'_csstranslator': ScrapyGenericTranslator(),
31-
'_tostring_method': 'xml'},
32-
}
33-
34-
3517
def _st(response, st):
3618
if st is None:
3719
return 'xml' if isinstance(response, XmlResponse) else 'html'
@@ -47,111 +29,25 @@ def _response_from_text(text, st):
4729
body=to_bytes(text, 'utf-8'))
4830

4931

50-
class Selector(object_ref):
32+
class Selector(ParselSelector, object_ref):
5133

52-
__slots__ = ['response', 'text', 'namespaces', 'type', '_expr', '_root',
53-
'__weakref__', '_parser', '_csstranslator', '_tostring_method']
34+
__slots__ = ['response']
5435

55-
_default_type = None
56-
_default_namespaces = {
57-
"re": "http://exslt.org/regular-expressions",
36+
def __init__(self, response=None, text=None, type=None, root=None, **kwargs):
37+
st = _st(response, type or self._default_type)
38+
root = kwargs.get('root', root)
5839

59-
# supported in libxslt:
60-
# set:difference
61-
# set:has-same-node
62-
# set:intersection
63-
# set:leading
64-
# set:trailing
65-
"set": "http://exslt.org/sets"
66-
}
67-
_lxml_smart_strings = False
68-
69-
def __init__(self, response=None, text=None, type=None, namespaces=None,
70-
_root=None, _expr=None):
71-
self.type = st = _st(response, type or self._default_type)
7240
self._parser = _ctgroup[st]['_parser']
73-
self._csstranslator = _ctgroup[st]['_csstranslator']
74-
self._tostring_method = _ctgroup[st]['_tostring_method']
7541

7642
if text is not None:
7743
response = _response_from_text(text, st)
7844

7945
if response is not None:
80-
_root = LxmlDocument(response, self._parser)
46+
root = LxmlDocument(response, self._parser)
8147

8248
self.response = response
83-
self.namespaces = dict(self._default_namespaces)
84-
if namespaces is not None:
85-
self.namespaces.update(namespaces)
86-
self._root = _root
87-
self._expr = _expr
88-
89-
def xpath(self, query):
90-
try:
91-
xpathev = self._root.xpath
92-
except AttributeError:
93-
return SelectorList([])
94-
95-
try:
96-
result = xpathev(query, namespaces=self.namespaces,
97-
smart_strings=self._lxml_smart_strings)
98-
except etree.XPathError:
99-
msg = u"Invalid XPath: %s" % query
100-
raise ValueError(msg if six.PY3 else msg.encode("unicode_escape"))
101-
102-
if type(result) is not list:
103-
result = [result]
104-
105-
result = [self.__class__(_root=x, _expr=query,
106-
namespaces=self.namespaces,
107-
type=self.type)
108-
for x in result]
109-
return SelectorList(result)
110-
111-
def css(self, query):
112-
return self.xpath(self._css2xpath(query))
113-
114-
def _css2xpath(self, query):
115-
return self._csstranslator.css_to_xpath(query)
116-
117-
def re(self, regex):
118-
return extract_regex(regex, self.extract())
119-
120-
def extract(self):
121-
try:
122-
return etree.tostring(self._root,
123-
method=self._tostring_method,
124-
encoding="unicode",
125-
with_tail=False)
126-
except (AttributeError, TypeError):
127-
if self._root is True:
128-
return u'1'
129-
elif self._root is False:
130-
return u'0'
131-
else:
132-
return six.text_type(self._root)
133-
134-
def register_namespace(self, prefix, uri):
135-
if self.namespaces is None:
136-
self.namespaces = {}
137-
self.namespaces[prefix] = uri
138-
139-
def remove_namespaces(self):
140-
for el in self._root.iter('*'):
141-
if el.tag.startswith('{'):
142-
el.tag = el.tag.split('}', 1)[1]
143-
# loop on element attributes also
144-
for an in el.attrib.keys():
145-
if an.startswith('{'):
146-
el.attrib[an.split('}', 1)[1]] = el.attrib.pop(an)
147-
148-
def __nonzero__(self):
149-
return bool(self.extract())
150-
151-
def __str__(self):
152-
data = repr(self.extract()[:40])
153-
return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data)
154-
__repr__ = __str__
49+
text = response.body_as_unicode() if response else None
50+
super(Selector, self).__init__(text=text, type=st, root=root, **kwargs)
15551

15652
# Deprecated api
15753
@deprecated(use_instead='.xpath()')
@@ -162,42 +58,3 @@ def select(self, xpath):
16258
def extract_unquoted(self):
16359
return self.extract()
16460

165-
166-
class SelectorList(list):
167-
168-
def __getslice__(self, i, j):
169-
return self.__class__(list.__getslice__(self, i, j))
170-
171-
def xpath(self, xpath):
172-
return self.__class__(flatten([x.xpath(xpath) for x in self]))
173-
174-
def css(self, xpath):
175-
return self.__class__(flatten([x.css(xpath) for x in self]))
176-
177-
def re(self, regex):
178-
return flatten([x.re(regex) for x in self])
179-
180-
def re_first(self, regex):
181-
for el in iflatten(x.re(regex) for x in self):
182-
return el
183-
184-
def extract(self):
185-
return [x.extract() for x in self]
186-
187-
def extract_first(self, default=None):
188-
for x in self:
189-
return x.extract()
190-
else:
191-
return default
192-
193-
@deprecated(use_instead='.extract()')
194-
def extract_unquoted(self):
195-
return [x.extract_unquoted() for x in self]
196-
197-
@deprecated(use_instead='.xpath()')
198-
def x(self, xpath):
199-
return self.select(xpath)
200-
201-
@deprecated(use_instead='.xpath()')
202-
def select(self, xpath):
203-
return self.xpath(xpath)

setup.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,7 @@
4444
'pyOpenSSL',
4545
'cssselect>=0.9',
4646
'six>=1.5.2',
47+
'parsel>=0.9.0',
4748
'PyDispatcher>=2.0.5',
4849
'service_identity',
4950
],

tests/test_selector.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -332,7 +332,7 @@ def test_nested_select_on_text_nodes(self):
332332

333333
def test_weakref_slots(self):
334334
"""Check that classes are using slots and are weak-referenceable"""
335-
x = self.sscls()
335+
x = self.sscls(text='')
336336
weakref.ref(x)
337337
assert not hasattr(x, '__dict__'), "%s does not use __slots__" % \
338338
x.__class__.__name__

0 commit comments

Comments
 (0)