Skip to content

Commit db291c6

Browse files
committed
scrapy
1 parent e0cf983 commit db291c6

File tree

5 files changed

+83
-7
lines changed

5 files changed

+83
-7
lines changed

learn_scrapy/learn_scrapy_01.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from scrapy import Selector

__author__ = 'Mr.Huo'

# Sample HTML fragment used by main() below to demonstrate the difference
# between the XPath expressions //li, //li[1] and (//li)[1]: two <ul> lists
# so that "first <li> per list" and "first <li> in document" give different
# results.
html = '''
<ul class="list">
<li>1</li>
<li>2</li>
<li>3</li>
</ul>
<ul class="list">
<li>4</li>
<li>5</li>
<li>6</li>
</ul>
<div class="hero shout"><time datetime="2014-07-23 19:00">Special date</time></div>
'''
21+
22+
def print_iter(iter_list):
    """Print every element of *iter_list* on its own line, then a blank line.

    A falsy value (empty list or None) produces no element lines; the
    trailing blank line acts as a visual separator between calls.
    """
    for element in (iter_list or ()):
        print(element)
    print()
29+
def main():
    """Show how //li, //li[1] and (//li)[1] differ, then compare xpath vs css.

    Parses the module-level ``html`` sample with a Scrapy Selector and prints
    the extracted results of several equivalent-looking queries.
    """
    sel = Selector(text=html)

    # PEP 8 (E731): use a def instead of binding a lambda to a name.
    def xp(query):
        # Extract all matching strings for an XPath query against the sample.
        return sel.xpath(query).extract()

    print(xp('//li'))        # every <li> in the document
    print(xp('//li[1]'))     # the first <li> of EACH <ul> (one per list)
    print(xp('(//li)[1]'))   # the first <li> of the WHOLE document
    print_iter(sel.xpath('//li'))   # Selector objects, one per line
    print_iter(sel.css('li'))       # the same elements selected via CSS
39+
40+
# Run the demo only when executed as a script, not on import.
if __name__ == '__main__':
    main()

spy/run.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
# NOTE(review): this import requires the spy package to be importable already;
# the sys.path tweak in main() below happens only after it has run.
from spy.spiders.example import ExampleSpider
import sys

__author__ = 'Mr.Huo'
def main():
    """Run ExampleSpider in-process via a CrawlerProcess with project settings."""
    # Add the spy project root to sys.path.
    # NOTE(review): this executes after the module-level
    # `from spy.spiders.example import ExampleSpider` has already succeeded,
    # so it cannot help that import — confirm it is still needed here.
    sys.path.append('..')
    settings = get_project_settings()
    process = CrawlerProcess(settings=settings)
    process.crawl(ExampleSpider)
    # start() blocks until the crawl finishes.
    process.start()
19+
# Start the crawl only when executed as a script, not on import.
if __name__ == '__main__':
    main()

spy/spy/pipelines.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,6 @@
88

99
class SpyPipeline(object):
    """Item pipeline that stamps a fixed title onto every item it sees."""

    def process_item(self, item, spider):
        """Overwrite the item's title, log it, and pass the item along.

        Returning the item keeps it flowing to any later pipelines.
        """
        new_title = 'hhhhhh'
        item['title'] = new_title
        print('pipline:', item['title'])
        return item

spy/spy/settings.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,9 @@
6464

6565
# Configure item pipelines
6666
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67-
#ITEM_PIPELINES = {
68-
# 'spy.pipelines.SpyPipeline': 300,
69-
#}
ITEM_PIPELINES = {
    # 1 = pipeline order value; per the Scrapy item-pipeline docs linked
    # above, lower values (range 0-1000) run earlier.
    'spy.pipelines.SpyPipeline': 1,
}
7070

7171
# Enable and configure the AutoThrottle extension (disabled by default)
7272
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html

spy/spy/spiders/example.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,22 @@
11
# -*- coding: utf-8 -*-
22
import scrapy
3+
from scrapy.loader import ItemLoader
4+
from spy.items import SpyItem
35

46

57
class ExampleSpider(scrapy.Spider):
    """Spider that fetches the Scrapy selectors sample page and emits its title."""

    name = 'example'
    allowed_domains = ['doc.scrapy.org']
    start_urls = ['http://doc.scrapy.org/en/latest/_static/selectors-sample1.html']

    def parse(self, response):
        """Extract the page title into a SpyItem and yield it to the pipelines.

        :param response: the downloaded page for one of ``start_urls``.
        :yields: a single ``SpyItem`` with ``title`` set to the extracted
            ``<title>`` text list.
        """
        item = SpyItem()
        # extract_first() returns None instead of raising IndexError when the
        # page has no <title> (the original used extract()[0]).
        title = response.xpath('//title/text()').extract_first()
        item['title'] = response.xpath('//title/text()').extract()
        print("-" * 80)
        print(title)
        print(response)
        print(item['title'])
        print("-" * 80)
        # Yield the item so the pipeline enabled in settings.py (SpyPipeline)
        # actually receives it; without this, process_item is never called.
        yield item

0 commit comments

Comments
 (0)