Commit efe589c

Merge pull request scrapy#882 from ahlen/feature/csvfeed-quotechar
[MRG+1] Allow to specify the quotechar in CSVFeedSpider
2 parents: 38dcf50 + 22da178

5 files changed: +58 additions, -8 deletions

docs/topics/spiders.rst

Lines changed: 6 additions & 0 deletions

@@ -523,6 +523,11 @@ CSVFeedSpider

         A string with the separator character for each field in the CSV file
         Defaults to ``','`` (comma).

+    .. attribute:: quotechar
+
+        A string with the enclosure character for each field in the CSV file
+        Defaults to ``'"'`` (quotation mark).
+
     .. attribute:: headers

         A list of the rows contained in the file CSV feed which will be used to

@@ -550,6 +555,7 @@ Let's see an example similar to the previous one, but using a

     allowed_domains = ['example.com']
     start_urls = ['http://www.example.com/feed.csv']
     delimiter = ';'
+    quotechar = "'"
     headers = ['id', 'name', 'description']

     def parse_row(self, response, row):
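The hunk above only shows a fragment of the surrounding example. For context, the docs example for CSVFeedSpider reads roughly as follows once quotechar is added; the class, item and import names are assumed from the standard Scrapy docs example of that era, so treat this as a sketch rather than the verbatim file:

from scrapy.contrib.spiders import CSVFeedSpider
from myproject.items import TestItem  # illustrative project/item, as in the docs


class MySpider(CSVFeedSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/feed.csv']
    delimiter = ';'
    quotechar = "'"
    headers = ['id', 'name', 'description']

    def parse_row(self, response, row):
        self.log('Hi, this is a row!: %r' % row)

        # row is a dict keyed by the headers above, e.g. row['name']
        item = TestItem()
        item['id'] = row['id']
        item['name'] = row['name']
        item['description'] = row['description']
        return item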

scrapy/contrib/spiders/feed.py

Lines changed: 3 additions & 2 deletions

@@ -97,11 +97,12 @@ class CSVFeedSpider(Spider):

     It receives a CSV file in a response; iterates through each of its rows,
     and calls parse_row with a dict containing each field's data.

-    You can set some options regarding the CSV file, such as the delimiter
+    You can set some options regarding the CSV file, such as the delimiter, quotechar
     and the file's headers.
     """

     delimiter = None # When this is None, python's csv module's default delimiter is used
+    quotechar = None # When this is None, python's csv module's default quotechar is used
     headers = None

     def process_results(self, response, results):

@@ -123,7 +124,7 @@ def parse_rows(self, response):

         process_results methods for pre and post-processing purposes.
         """

-        for row in csviter(response, self.delimiter, self.headers):
+        for row in csviter(response, self.delimiter, self.headers, self.quotechar):
             ret = self.parse_row(response, row)
             if isinstance(ret, (BaseItem, Request)):
                 ret = [ret]
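One detail worth noting when calling csviter directly (its full signature appears in the next file): encoding comes before quotechar in the parameter list, so direct callers usually pass quotechar as a keyword. A minimal sketch of such a call, assuming the Python 2-era Scrapy code shown in this diff:

from scrapy.utils.iterators import csviter

# csviter also accepts a plain unicode or utf-8 string besides a Response object.
body = u"'id','name','value'\n1,'alpha','foobar'\n"

# Pass quotechar by keyword so it cannot land in the encoding slot.
for row in csviter(body, quotechar="'"):
    print(row)  # {u'id': u'1', u'name': u'alpha', u'value': u'foobar'}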

scrapy/utils/iterators.py

Lines changed: 10 additions & 6 deletions

@@ -35,28 +35,32 @@ def xmliter(obj, nodename):

         yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0]


-def csviter(obj, delimiter=None, headers=None, encoding=None):
+def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
     """ Returns an iterator of dictionaries from the given csv object

     obj can be:
     - a Response object
     - a unicode string
     - a string encoded as utf-8

-    delimiter is the character used to separate field on the given obj.
+    delimiter is the character used to separate fields on the given obj.

     headers is an iterable that when provided offers the keys
     for the returned dictionaries, if not the first row is used.
+
+    quotechar is the character used to enclosure fields on the given obj.
     """
+
     encoding = obj.encoding if isinstance(obj, TextResponse) else encoding or 'utf-8'
     def _getrow(csv_r):
         return [str_to_unicode(field, encoding) for field in next(csv_r)]

     lines = BytesIO(_body_or_str(obj, unicode=False))
-    if delimiter:
-        csv_r = csv.reader(lines, delimiter=delimiter)
-    else:
-        csv_r = csv.reader(lines)
+
+    kwargs = {}
+    if delimiter: kwargs["delimiter"] = delimiter
+    if quotechar: kwargs["quotechar"] = quotechar
+    csv_r = csv.reader(lines, **kwargs)

     if not headers:
         headers = _getrow(csv_r)
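The kwargs dict above only forwards delimiter and quotechar when they are actually set, so csv.reader falls back to its module defaults (',' and '"') whenever the spider leaves those attributes as None. A standalone illustration of what the quotechar option changes, using nothing but the standard csv module:

import csv

# Two lines shaped like the new feed-sample6.csv fixture.
lines = ["'id','name','value'", "1,'alpha','foobar'"]

# With quotechar="'", the reader strips the single quotes from each field...
print(list(csv.reader(lines, quotechar="'")))
# [['id', 'name', 'value'], ['1', 'alpha', 'foobar']]

# ...while with the default quotechar ('"') they stay inside the values,
# which is what test_csviter_wrong_quotechar below asserts.
print(list(csv.reader(lines)))
# [["'id'", "'name'", "'value'"], ['1', "'alpha'", "'foobar'"]]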
feeds/feed-sample6.csv (new test fixture, loaded in the tests below via get_testdata('feeds', 'feed-sample6.csv'))

Lines changed: 6 additions & 0 deletions

@@ -0,0 +1,6 @@
+'id','name','value'
+1,'alpha','foobar'
+2,'unicode','únícódé‽'
+'3','multi','foo
+bar'
+4,'empty',

tests/test_utils_iterators.py

Lines changed: 33 additions & 0 deletions

@@ -159,6 +159,39 @@ def test_csviter_delimiter(self):

                           {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
                           {u'id': u'4', u'name': u'empty', u'value': u''}])

+    def test_csviter_quotechar(self):
+        body1 = get_testdata('feeds', 'feed-sample6.csv')
+        body2 = get_testdata('feeds', 'feed-sample6.csv').replace(",", '|')
+
+        response1 = TextResponse(url="http://example.com/", body=body1)
+        csv1 = csviter(response1, quotechar="'")
+
+        self.assertEqual([row for row in csv1],
+                         [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
+                          {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
+                          {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
+                          {u'id': u'4', u'name': u'empty', u'value': u''}])
+
+        response2 = TextResponse(url="http://example.com/", body=body2)
+        csv2 = csviter(response2, delimiter="|", quotechar="'")
+
+        self.assertEqual([row for row in csv2],
+                         [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
+                          {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
+                          {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
+                          {u'id': u'4', u'name': u'empty', u'value': u''}])
+
+    def test_csviter_wrong_quotechar(self):
+        body = get_testdata('feeds', 'feed-sample6.csv')
+        response = TextResponse(url="http://example.com/", body=body)
+        csv = csviter(response)
+
+        self.assertEqual([row for row in csv],
+                         [{u"'id'": u"1", u"'name'": u"'alpha'", u"'value'": u"'foobar'"},
+                          {u"'id'": u"2", u"'name'": u"'unicode'", u"'value'": u"'\xfan\xedc\xf3d\xe9\u203d'"},
+                          {u"'id'": u"'3'", u"'name'": u"'multi'", u"'value'": u"'foo"},
+                          {u"'id'": u"4", u"'name'": u"'empty'", u"'value'": u""}])
+
     def test_csviter_delimiter_binary_response_assume_utf8_encoding(self):
         body = get_testdata('feeds', 'feed-sample3.csv').replace(',', '\t')
         response = Response(url="http://example.com/", body=body)
