2
2
XPath selectors based on lxml
3
3
"""
4
4
5
- from lxml import etree
6
- import six
7
-
8
- from scrapy .utils .misc import extract_regex
9
5
from scrapy .utils .trackref import object_ref
10
- from scrapy .utils .python import to_bytes , flatten , iflatten
11
- from scrapy .utils .decorators import deprecated
6
+ from scrapy .utils .python import to_bytes
12
7
from scrapy .http import HtmlResponse , XmlResponse
8
+ from scrapy .utils .decorators import deprecated
9
+ from parsel import Selector as ParselSelector , SelectorList
10
+ from parsel .unified import _ctgroup
13
11
from .lxmldocument import LxmlDocument
14
- from .csstranslator import ScrapyHTMLTranslator , ScrapyGenericTranslator
15
12
16
13
17
14
__all__ = ['Selector' , 'SelectorList' ]
18
15
19
16
20
class SafeXMLParser(etree.XMLParser):
    """XML parser whose ``resolve_entities`` option defaults to False."""

    def __init__(self, *args, **kwargs):
        # Only supply the default: an explicit ``resolve_entities`` value
        # passed by the caller is left untouched.
        if 'resolve_entities' not in kwargs:
            kwargs['resolve_entities'] = False
        super(SafeXMLParser, self).__init__(*args, **kwargs)
24
-
25
# Per selector type ('html' / 'xml'): the parser class to use, the
# CSS-to-XPath translator instance, and the serialization method name.
_ctgroup = {
    'html': dict(
        _parser=etree.HTMLParser,
        _csstranslator=ScrapyHTMLTranslator(),
        _tostring_method='html',
    ),
    'xml': dict(
        _parser=SafeXMLParser,
        _csstranslator=ScrapyGenericTranslator(),
        _tostring_method='xml',
    ),
}
33
-
34
-
35
17
def _st (response , st ):
36
18
if st is None :
37
19
return 'xml' if isinstance (response , XmlResponse ) else 'html'
@@ -47,111 +29,25 @@ def _response_from_text(text, st):
47
29
body = to_bytes (text , 'utf-8' ))
48
30
49
31
50
- class Selector (object_ref ):
32
+ class Selector (ParselSelector , object_ref ):
51
33
52
- __slots__ = ['response' , 'text' , 'namespaces' , 'type' , '_expr' , '_root' ,
53
- '__weakref__' , '_parser' , '_csstranslator' , '_tostring_method' ]
34
+ __slots__ = ['response' ]
54
35
55
- _default_type = None
56
- _default_namespaces = {
57
- "re" : "http://exslt.org/regular-expressions" ,
36
def __init__(self, response=None, text=None, type=None, root=None, **kwargs):
    """Build a selector from a response, from raw text, or from a parsed root.

    :param response: response object to select over; stored on
        ``self.response``.
    :param text: markup text; when given it is turned into a response with
        ``_response_from_text`` and that response replaces ``response``.
    :param type: selector type passed to ``_st`` together with the response;
        falls back to ``self._default_type`` when falsy.
    :param root: already-parsed root; only used when neither ``text`` nor
        ``response`` is given, because a response always produces a new root.
    :param kwargs: forwarded unchanged to the parent initializer, except
        for the legacy ``_root`` keyword (see below).
    """
    st = _st(response, type or self._default_type)

    # ``root`` is a named parameter, so it can never appear inside
    # ``**kwargs``; looking it up there was a no-op. The keyword that callers
    # of the previous signature may still pass is ``_root``: accept it as a
    # fallback for ``root`` instead of forwarding it to the parent class.
    legacy_root = kwargs.pop('_root', None)
    if root is None:
        root = legacy_root

    self._parser = _ctgroup[st]['_parser']

    if text is not None:
        response = _response_from_text(text, st)

    if response is not None:
        root = LxmlDocument(response, self._parser)

    self.response = response
    # Same ``is not None`` test as the guard above, so both checks agree on
    # what counts as "a response was provided".
    text = response.body_as_unicode() if response is not None else None
    super(Selector, self).__init__(text=text, type=st, root=root, **kwargs)
155
51
156
52
# Deprecated api
157
53
@deprecated (use_instead = '.xpath()' )
@@ -162,42 +58,3 @@ def select(self, xpath):
162
58
def extract_unquoted(self):
    """Part of the deprecated API: return exactly what ``extract()`` returns."""
    extracted = self.extract()
    return extracted
164
60
165
-
166
class SelectorList(list):
    """List of selectors that applies selector calls to each of its elements."""

    def __getitem__(self, pos):
        # Python 3 has no __getslice__ protocol: slicing reaches __getitem__
        # with a slice object (as do extended slices on Python 2). Re-wrap the
        # result so slices keep this class; integer indexing still returns
        # the element itself.
        item = super(SelectorList, self).__getitem__(pos)
        return self.__class__(item) if isinstance(pos, slice) else item

    def __getslice__(self, i, j):
        # Python 2 simple slicing (``obj[i:j]``) still goes through this hook.
        return self.__class__(list.__getslice__(self, i, j))

    def xpath(self, xpath):
        """Call ``.xpath(xpath)`` on every element and return the flattened
        results wrapped in a new list of this class."""
        return self.__class__(flatten([x.xpath(xpath) for x in self]))

    def css(self, xpath):
        """Call ``.css(xpath)`` on every element and return the flattened
        results wrapped in a new list of this class."""
        return self.__class__(flatten([x.css(xpath) for x in self]))

    def re(self, regex):
        """Call ``.re(regex)`` on every element and return the flattened
        results."""
        return flatten([x.re(regex) for x in self])

    def re_first(self, regex):
        """Return the first item produced by ``.re(regex)`` across the
        elements, or None when there is none."""
        for el in iflatten(x.re(regex) for x in self):
            return el
        return None

    def extract(self):
        """Return a list with the ``.extract()`` result of every element."""
        return [x.extract() for x in self]

    def extract_first(self, default=None):
        """Return ``.extract()`` of the first element, or ``default`` when the
        list is empty."""
        for x in self:
            return x.extract()
        return default

    @deprecated(use_instead='.extract()')
    def extract_unquoted(self):
        return [x.extract_unquoted() for x in self]

    @deprecated(use_instead='.xpath()')
    def x(self, xpath):
        return self.select(xpath)

    @deprecated(use_instead='.xpath()')
    def select(self, xpath):
        return self.xpath(xpath)
0 commit comments