Commit 53bc28c

scrapy crawl tencent job save as json
1 parent a9c445f commit 53bc28c

4 files changed (+28 −38 lines)


spy/example.json

Lines changed: 1 addition & 5 deletions
Large diffs are not rendered by default.
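
The regenerated example.json is collapsed into a single JSON object rather than one JSON line per item (see the pipeline change below). A rough sketch of its shape, with placeholder values inferred from the item fields rather than taken from the actual file:

# Approximate shape of example.json after this commit (values are placeholders)
{
    "job": [
        {
            "title": "...",
            "hot": "no",
            "category": "...",
            "recruitNumber": "...",
            "link": "http://hr.tencent.com/position_detail.php?id=...",
            "workLocation": "...",
            "publishTime": "..."
        }
    ]
}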

spy/spy/items.py

Lines changed: 1 addition & 0 deletions
@@ -17,3 +17,4 @@ class SpyItem(Item):
     workLocation = Field()   # work location
     publishTime = Field()    # publish time
     link = Field()           # link
+    hot = Field()            # hot

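For reference, after this commit SpyItem should declare seven fields in total; a sketch of the full class, with the fields above the hunk inferred from the spider code rather than shown in this diff:

from scrapy import Item, Field

class SpyItem(Item):
    title = Field()          # position title
    category = Field()       # position category
    recruitNumber = Field()  # number of openings
    workLocation = Field()   # work location
    publishTime = Field()    # publish time
    link = Field()           # detail-page link
    hot = Field()            # hot-position flag added in this commit
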
spy/spy/pipelines.py

Lines changed: 7 additions & 7 deletions
@@ -9,15 +9,15 @@
 
 class SpyPipeline(object):
     def __init__(self):
-        self.file = codecs.open('example.json', 'w', encoding='utf-8')
-        self.count = 1
+        self.file = open('example.json', 'w', encoding='utf-8')
+        self.jobData = {"job":[]}
+        #self.job
 
     def process_item(self, item, spider):
-        print(self.count)
-        self.count +=1
-        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
-        self.file.write(line)
+        self.jobData["job"].append(dict(item))
         return item
 
-    def spider_close(self, spider):
+    def close_spider(self, spider):
+        line = json.dumps(dict(self.jobData), ensure_ascii=False)
+        self.file.write(line)
         self.file.close()

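The rewritten pipeline buffers every scraped item in self.jobData and serializes the whole structure once in close_spider, so example.json becomes a single {"job": [...]} object instead of newline-delimited JSON. The pipeline still has to be enabled in the project settings for Scrapy to call it; a minimal sketch, assuming the usual spy/spy/settings.py location (the priority value 300 is an illustrative choice, not part of this commit):

# spy/spy/settings.py (sketch; registers SpyPipeline so process_item/close_spider run)
ITEM_PIPELINES = {
    'spy.pipelines.SpyPipeline': 300,  # lower numbers run earlier in the pipeline chain
}
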
spy/spy/spiders/example.py

Lines changed: 19 additions & 26 deletions
@@ -1,11 +1,7 @@
 # -*- coding: utf-8 -*-
 
 from spy.items import SpyItem
-from scrapy.spiders.crawl import Rule
-from scrapy import Selector
-from scrapy.linkextractors import LinkExtractor
 from scrapy import Spider
-from scrapy.spiders import CrawlSpider
 from scrapy.utils.response import get_base_url
 from urllib.parse import urljoin
 
@@ -14,32 +10,29 @@ class ExampleSpider(Spider):
     name = 'example'
     allowed_domains = ['tencent.com']
     start_urls = ['http://hr.tencent.com/position.php']
-    #rules = [
-    #    Rule(LinkExtractor(allow=('/position_detail.php\?id=\d{1,}')), follow=True,
-    #         callback='parse_item')
-    #]
 
     def parse(self, response):
-        items = []
         base_url = get_base_url(response)
         sites_even = response.css('tr.even')
-        for site in sites_even:
+        sites_odd = (response.css('tr.odd'))
+        for site in sites_even + sites_odd:
             item = SpyItem()
-            item['title'] = site.css('.l.square a').xpath('text()').extract()
-            item['category'] = site.css('tr > td:nth-child(2)::text').extract()
-            item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
-            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
+            item['title'] = site.css('.l.square a').xpath('text()').extract_first()
+            # hot-position flag
+            hot = site.css('.l.square span').xpath('text()').extract_first()
+            if hot is not None:
+                item['hot'] = "yes"
+            else:
+                item['hot'] = "no"
+            # the position category can be blank
+            item['category'] = site.css('tr > td:nth-child(2)::text').extract_first(default='未分类职位')
+            item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract_first()
+            relative_url = site.css('.l.square a').xpath('@href').extract_first()
             item['link'] = urljoin(base_url, relative_url)
-            item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
-            item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
-            items.append(item)
-        print("-"*100)
-        print("base_url",base_url)
-        for item in items:
-            print(item)
-        print("-"*100)
-        return items
+            item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract_first()
+            item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract_first()
+            yield item
 
-        #def _process_request(self, request):
-        #    info('process ' + str(request))
-        #    return request
+        next_page = urljoin(base_url, response.xpath('//a[@id="next"]/@href').extract_first())
+        if next_page is not None:
+            yield response.follow(next_page, self.parse)

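Per the commit message, the crawl is started with scrapy crawl example, after which SpyPipeline writes example.json when the spider closes. A minimal sketch of driving the same crawl from a script instead of the CLI, assuming it is run from the project root so the spy project settings are picked up (the file name run_crawl.py is hypothetical):

# run_crawl.py (hypothetical helper, not part of this commit)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from spy.spiders.example import ExampleSpider

process = CrawlerProcess(get_project_settings())  # loads the spy project settings
process.crawl(ExampleSpider)
process.start()  # blocks until the crawl finishes; the pipeline then writes and closes example.json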