File tree (expand/collapse): 5 files changed, +83 −7 lines changed
lines changed Original file line number Diff line number Diff line change 1+ #!/usr/bin/env python3
2+ # -*- coding: utf-8 -*-
3+
4+ from scrapy import Selector
5+
6+ __author__ = 'Mr.Huo'
7+
8+ html = '''
9+ <ul class="list">
10+ <li>1</li>
11+ <li>2</li>
12+ <li>3</li>
13+ </ul>
14+ <ul class="list">
15+ <li>4</li>
16+ <li>5</li>
17+ <li>6</li>
18+ </ul>
19+ <div class="hero shout"><time datetime="2014-07-23 19:00">Special date</time></div>
20+ '''
21+
22+
def print_iter(iter_list):
    """Print each element of *iter_list* on its own line, then a blank line.

    Does nothing when *iter_list* is falsy (e.g. ``None`` or an empty
    selector list).
    """
    if not iter_list:
        return
    for element in iter_list:
        print(element)
    print()
29+
def main():
    """Demonstrate Scrapy Selector XPath/CSS queries on the sample HTML.

    Prints the difference between ``//li[1]`` (first ``<li>`` of *each*
    ``<ul>``) and ``(//li)[1]`` (first ``<li>`` in the whole document),
    then iterates raw selector lists via :func:`print_iter`.
    """
    sel = Selector(text=html)

    # A named def instead of ``xp = lambda ...`` (PEP 8 E731): same shorthand,
    # but with a real name in tracebacks.
    def xp(query):
        """Return the extracted string results of XPath *query*."""
        return sel.xpath(query).extract()

    print(xp('//li'))
    print(xp('//li[1]'))
    print(xp('(//li)[1]'))
    print_iter(sel.xpath('//li'))
    print_iter(sel.css('li'))


if __name__ == '__main__':
    main()
Original file line number Diff line number Diff line change 1+ #!/usr/bin/env python3
2+ # -*- coding: utf-8 -*-
3+
4+ from scrapy .crawler import CrawlerProcess
5+ from scrapy .utils .project import get_project_settings
6+ from spy .spiders .example import ExampleSpider
7+ import sys
8+
9+ __author__ = 'Mr.Huo'
10+
11+
def main():
    """Run :class:`ExampleSpider` in-process via :class:`CrawlerProcess`.

    Loads the project's settings, schedules the spider, and blocks until
    the crawl finishes (``process.start()`` runs the reactor).
    """
    # Add the spy project root directory to sys.path.
    # NOTE(review): ExampleSpider was already imported at module level, so
    # this append happens too late to affect that import, and the relative
    # '..' depends on the current working directory — confirm it is needed,
    # or move it before the imports at the top of the file.
    sys.path.append('..')
    settings = get_project_settings()
    process = CrawlerProcess(settings=settings)
    process.crawl(ExampleSpider)
    process.start()


if __name__ == '__main__':
    main()
Original file line number Diff line number Diff line change 88
class SpyPipeline(object):
    """Item pipeline that overwrites each item's title with a fixed marker."""

    def process_item(self, item, spider):
        """Stamp ``item['title']``, log the new value, and pass the item on."""
        marker = 'hhhhhh'
        item['title'] = marker
        print('pipline:', marker)
        return item
Original file line number Diff line number Diff line change 6464
6565# Configure item pipelines
6666# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67- # ITEM_PIPELINES = {
68- # 'spy.pipelines.SpyPipeline': 300 ,
69- # }
# Route scraped items through SpyPipeline; order 1 runs it earliest
# (pipeline order values range 0-1000, lower runs first).
ITEM_PIPELINES = {
    'spy.pipelines.SpyPipeline': 1,
}
7070
7171# Enable and configure the AutoThrottle extension (disabled by default)
7272# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
Original file line number Diff line number Diff line change 11# -*- coding: utf-8 -*-
22import scrapy
3+ from scrapy .loader import ItemLoader
4+ from spy .items import SpyItem
35
46
class ExampleSpider(scrapy.Spider):
    """Demo spider that scrapes the title of Scrapy's selectors sample page."""

    name = 'example'
    allowed_domains = ['doc.scrapy.org']
    start_urls = ['http://doc.scrapy.org/en/latest/_static/selectors-sample1.html']

    def parse(self, response):
        """Extract the page title into a :class:`SpyItem` and yield it.

        Yielding is required for the item to reach the pipelines enabled
        via ``ITEM_PIPELINES`` — the original built the item but never
        returned it, so no pipeline ever saw it.
        """
        items = SpyItem()
        # Extract once and reuse instead of running the same XPath twice.
        titles = response.xpath('//title/text()').extract()
        # Guard the [0] access: a page without <title> yields an empty list.
        filename = titles[0] if titles else ''
        items['title'] = titles
        print('-' * 80)
        print(filename)
        print(response)
        print(items['title'])
        print('-' * 80)
        yield items
You can’t perform that action at this time.
0 commit comments