# -*- coding: utf-8 -*-

from spy.items import SpyItem
from scrapy import Spider
from scrapy.utils.response import get_base_url
from urllib.parse import urljoin

class ExampleSpider(Spider):
    """Scrape job postings from the Tencent HR listing pages.

    Yields one ``SpyItem`` per job row and follows the "next" pagination
    link until no further page exists.
    """

    name = 'example'
    allowed_domains = ['tencent.com']
    start_urls = ['http://hr.tencent.com/position.php']

    def parse(self, response):
        """Extract job items from one listing page, then paginate.

        Parameters
        ----------
        response : scrapy.http.Response
            The downloaded listing page.

        Yields
        ------
        SpyItem
            One item per job row (rows alternate ``tr.even`` / ``tr.odd``).
        scrapy.Request
            A follow-up request for the next listing page, if any.
        """
        base_url = get_base_url(response)
        # Listing rows alternate between the 'even' and 'odd' CSS classes;
        # both sets are scraped identically.
        rows = response.css('tr.even') + response.css('tr.odd')
        for site in rows:
            item = SpyItem()
            item['title'] = site.css('.l.square a').xpath('text()').extract_first()
            # A <span> inside the title cell marks a featured ("hot") position.
            hot = site.css('.l.square span').xpath('text()').extract_first()
            item['hot'] = "yes" if hot is not None else "no"
            # The category cell may be empty; fall back to a default label.
            item['category'] = site.css('tr > td:nth-child(2)::text').extract_first(default='未分类职位')
            item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract_first()
            relative_url = site.css('.l.square a').xpath('@href').extract_first()
            item['link'] = urljoin(base_url, relative_url)
            item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract_first()
            item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract_first()
            yield item

        # Guard on the raw href BEFORE joining: urljoin(base, None) returns
        # base unchanged, so joining first would make the None-check useless
        # and re-queue the current page when no "next" link exists.
        next_href = response.xpath('//a[@id="next"]/@href').extract_first()
        if next_href is not None:
            yield response.follow(urljoin(base_url, next_href), self.parse)