3
3
"""
4
4
5
5
import csv
6
+ import io
6
7
import sys
7
8
import pprint
8
9
import marshal
11
12
from xml .sax .saxutils import XMLGenerator
12
13
13
14
from scrapy .utils .serialize import ScrapyJSONEncoder
15
+ from scrapy .utils .python import to_bytes , to_unicode , to_native_str , is_listlike
14
16
from scrapy .item import BaseItem
17
+ from scrapy .exceptions import ScrapyDeprecationWarning
18
+ import warnings
19
+
15
20
16
21
__all__ = ['BaseItemExporter' , 'PprintItemExporter' , 'PickleItemExporter' ,
17
22
'CsvItemExporter' , 'XmlItemExporter' , 'JsonLinesItemExporter' ,
@@ -38,7 +43,7 @@ def export_item(self, item):
38
43
raise NotImplementedError
39
44
40
45
def serialize_field (self , field , name , value ):
41
- serializer = field .get ('serializer' , self . _to_str_if_unicode )
46
+ serializer = field .get ('serializer' , lambda x : x )
42
47
return serializer (value )
43
48
44
49
def start_exporting (self ):
@@ -47,9 +52,6 @@ def start_exporting(self):
47
52
def finish_exporting (self ):
48
53
pass
49
54
50
- def _to_str_if_unicode (self , value ):
51
- return value .encode (self .encoding ) if isinstance (value , unicode ) else value
52
-
53
55
def _get_serialized_fields (self , item , default_value = None , include_empty = None ):
54
56
"""Return the fields to export as an iterable of tuples
55
57
(name, serialized_value)
@@ -86,10 +88,10 @@ def __init__(self, file, **kwargs):
86
88
87
89
def export_item (self , item ):
88
90
itemdict = dict (self ._get_serialized_fields (item ))
89
- self .file .write (self .encoder .encode (itemdict ) + '\n ' )
91
+ self .file .write (to_bytes ( self .encoder .encode (itemdict ) + '\n ' ) )
90
92
91
93
92
- class JsonItemExporter (JsonLinesItemExporter ):
94
+ class JsonItemExporter (BaseItemExporter ):
93
95
94
96
def __init__ (self , file , ** kwargs ):
95
97
self ._configure (kwargs , dont_fail = True )
@@ -98,18 +100,18 @@ def __init__(self, file, **kwargs):
98
100
self .first_item = True
99
101
100
102
def start_exporting (self ):
101
- self .file .write ("[" )
103
+ self .file .write (b "[" )
102
104
103
105
def finish_exporting (self ):
104
- self .file .write ("]" )
106
+ self .file .write (b "]" )
105
107
106
108
def export_item (self , item ):
107
109
if self .first_item :
108
110
self .first_item = False
109
111
else :
110
- self .file .write (',\n ' )
112
+ self .file .write (b ',\n ' )
111
113
itemdict = dict (self ._get_serialized_fields (item ))
112
- self .file .write (self .encoder .encode (itemdict ))
114
+ self .file .write (to_bytes ( self .encoder .encode (itemdict ) ))
113
115
114
116
115
117
class XmlItemExporter (BaseItemExporter ):
@@ -139,7 +141,7 @@ def _export_xml_field(self, name, serialized_value):
139
141
if hasattr (serialized_value , 'items' ):
140
142
for subname , value in serialized_value .items ():
141
143
self ._export_xml_field (subname , value )
142
- elif hasattr (serialized_value , '__iter__' ):
144
+ elif is_listlike (serialized_value ):
143
145
for value in serialized_value :
144
146
self ._export_xml_field ('value' , value )
145
147
else :
@@ -153,10 +155,10 @@ def _export_xml_field(self, name, serialized_value):
153
155
# and Python 3.x will require unicode, so ">= 2.7.4" should be fine.
154
156
if sys .version_info [:3 ] >= (2 , 7 , 4 ):
155
157
def _xg_characters (self , serialized_value ):
156
- if not isinstance (serialized_value , unicode ):
158
+ if not isinstance (serialized_value , six . text_type ):
157
159
serialized_value = serialized_value .decode (self .encoding )
158
160
return self .xg .characters (serialized_value )
159
- else :
161
+ else : # pragma: no cover
160
162
def _xg_characters (self , serialized_value ):
161
163
return self .xg .characters (serialized_value )
162
164
@@ -166,17 +168,22 @@ class CsvItemExporter(BaseItemExporter):
166
168
def __init__ (self , file , include_headers_line = True , join_multivalued = ',' , ** kwargs ):
167
169
self ._configure (kwargs , dont_fail = True )
168
170
self .include_headers_line = include_headers_line
171
+ file = file if six .PY2 else io .TextIOWrapper (file , line_buffering = True )
169
172
self .csv_writer = csv .writer (file , ** kwargs )
170
173
self ._headers_not_written = True
171
174
self ._join_multivalued = join_multivalued
172
175
173
- def _to_str_if_unicode (self , value ):
176
+ def serialize_field (self , field , name , value ):
177
+ serializer = field .get ('serializer' , self ._join_if_needed )
178
+ return serializer (value )
179
+
180
+ def _join_if_needed (self , value ):
174
181
if isinstance (value , (list , tuple )):
175
182
try :
176
- value = self ._join_multivalued .join (value )
183
+ return self ._join_multivalued .join (value )
177
184
except TypeError : # list in value may not contain strings
178
185
pass
179
- return super ( CsvItemExporter , self ). _to_str_if_unicode ( value )
186
+ return value
180
187
181
188
def export_item (self , item ):
182
189
if self ._headers_not_written :
@@ -185,9 +192,16 @@ def export_item(self, item):
185
192
186
193
fields = self ._get_serialized_fields (item , default_value = '' ,
187
194
include_empty = True )
188
- values = [ x [ 1 ] for x in fields ]
195
+ values = list ( self . _build_row ( x for _ , x in fields ))
189
196
self .csv_writer .writerow (values )
190
197
198
+ def _build_row (self , values ):
199
+ for s in values :
200
+ try :
201
+ yield to_native_str (s )
202
+ except TypeError :
203
+ yield to_native_str (repr (s ))
204
+
191
205
def _write_headers_and_set_fields_to_export (self , item ):
192
206
if self .include_headers_line :
193
207
if not self .fields_to_export :
@@ -197,7 +211,8 @@ def _write_headers_and_set_fields_to_export(self, item):
197
211
else :
198
212
# use fields declared in Item
199
213
self .fields_to_export = list (item .fields .keys ())
200
- self .csv_writer .writerow (self .fields_to_export )
214
+ row = list (self ._build_row (self .fields_to_export ))
215
+ self .csv_writer .writerow (row )
201
216
202
217
203
218
class PickleItemExporter (BaseItemExporter ):
@@ -230,7 +245,7 @@ def __init__(self, file, **kwargs):
230
245
231
246
def export_item (self , item ):
232
247
itemdict = dict (self ._get_serialized_fields (item ))
233
- self .file .write (pprint .pformat (itemdict ) + '\n ' )
248
+ self .file .write (to_bytes ( pprint .pformat (itemdict ) + '\n ' ) )
234
249
235
250
236
251
class PythonItemExporter (BaseItemExporter ):
@@ -239,6 +254,13 @@ class PythonItemExporter(BaseItemExporter):
239
254
json, msgpack, binc, etc) can be used on top of it. Its main goal is to
240
255
seamless support what BaseItemExporter does plus nested items.
241
256
"""
257
+ def _configure (self , options , dont_fail = False ):
258
+ self .binary = options .pop ('binary' , True )
259
+ super (PythonItemExporter , self )._configure (options , dont_fail )
260
+ if self .binary :
261
+ warnings .warn (
262
+ "PythonItemExporter will drop support for binary export in the future" ,
263
+ ScrapyDeprecationWarning )
242
264
243
265
def serialize_field (self , field , name , value ):
244
266
serializer = field .get ('serializer' , self ._serialize_value )
@@ -249,13 +271,20 @@ def _serialize_value(self, value):
249
271
return self .export_item (value )
250
272
if isinstance (value , dict ):
251
273
return dict (self ._serialize_dict (value ))
252
- if hasattr (value , '__iter__' ):
274
+ if is_listlike (value ):
253
275
return [self ._serialize_value (v ) for v in value ]
254
- return self ._to_str_if_unicode (value )
276
+ if self .binary :
277
+ return to_bytes (value , encoding = self .encoding )
278
+ else :
279
+ return to_unicode (value , encoding = self .encoding )
255
280
256
281
def _serialize_dict (self , value ):
257
282
for key , val in six .iteritems (value ):
283
+ key = to_bytes (key ) if self .binary else key
258
284
yield key , self ._serialize_value (val )
259
285
260
286
def export_item (self , item ):
261
- return dict (self ._get_serialized_fields (item ))
287
+ result = dict (self ._get_serialized_fields (item ))
288
+ if self .binary :
289
+ result = dict (self ._serialize_dict (result ))
290
+ return result
0 commit comments