Commit 53bc28c

scrapy crawl tencent job save as json
1 parent a9c445f commit 53bc28c

4 files changed (+28 −38 lines)


spy/example.json

Lines changed: 1 addition & 5 deletions
Large diffs are not rendered by default.
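
The regenerated example.json is collapsed into a single JSON object rather than one JSON line per item (see the pipeline change below). A rough sketch of its shape, with placeholder values inferred from the item fields rather than taken from the actual file:

# Approximate shape of example.json after this commit (values are placeholders)
{
    "job": [
        {
            "title": "...",
            "hot": "no",
            "category": "...",
            "recruitNumber": "...",
            "link": "http://hr.tencent.com/position_detail.php?id=...",
            "workLocation": "...",
            "publishTime": "..."
        }
    ]
}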

spy/spy/items.py

Lines changed: 1 addition & 0 deletions
@@ -17,3 +17,4 @@ class SpyItem(Item):
     workLocation = Field()   # work location
     publishTime = Field()    # publish time
     link = Field()           # link
+    hot = Field()            # hot

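For reference, after this commit SpyItem should declare seven fields in total; a sketch of the full class, with the fields above the hunk inferred from the spider code rather than shown in this diff:

from scrapy import Item, Field

class SpyItem(Item):
    title = Field()          # position title
    category = Field()       # position category
    recruitNumber = Field()  # number of openings
    workLocation = Field()   # work location
    publishTime = Field()    # publish time
    link = Field()           # detail-page link
    hot = Field()            # hot-position flag added in this commit
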
spy/spy/pipelines.py

Lines changed: 7 additions & 7 deletions
@@ -9,15 +9,15 @@
 
 class SpyPipeline(object):
     def __init__(self):
-        self.file = codecs.open('example.json', 'w', encoding='utf-8')
-        self.count = 1
+        self.file = open('example.json', 'w', encoding='utf-8')
+        self.jobData = {"job":[]}
+        #self.job
 
     def process_item(self, item, spider):
-        print(self.count)
-        self.count +=1
-        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
-        self.file.write(line)
+        self.jobData["job"].append(dict(item))
         return item
 
-    def spider_close(self, spider):
+    def close_spider(self, spider):
+        line = json.dumps(dict(self.jobData), ensure_ascii=False)
+        self.file.write(line)
         self.file.close()

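The rewritten pipeline buffers every scraped item in self.jobData and serializes the whole structure once in close_spider, so example.json becomes a single {"job": [...]} object instead of newline-delimited JSON. The pipeline still has to be enabled in the project settings for Scrapy to call it; a minimal sketch, assuming the usual spy/spy/settings.py location (the priority value 300 is an illustrative choice, not part of this commit):

# spy/spy/settings.py (sketch; registers SpyPipeline so process_item/close_spider run)
ITEM_PIPELINES = {
    'spy.pipelines.SpyPipeline': 300,  # lower numbers run earlier in the pipeline chain
}
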
spy/spy/spiders/example.py

Lines changed: 19 additions & 26 deletions
@@ -1,11 +1,7 @@
 # -*- coding: utf-8 -*-
 
 from spy.items import SpyItem
-from scrapy.spiders.crawl import Rule
-from scrapy import Selector
-from scrapy.linkextractors import LinkExtractor
 from scrapy import Spider
-from scrapy.spiders import CrawlSpider
 from scrapy.utils.response import get_base_url
 from urllib.parse import urljoin
 
@@ -14,32 +10,29 @@ class ExampleSpider(Spider):
     name = 'example'
     allowed_domains = ['tencent.com']
     start_urls = ['http://hr.tencent.com/position.php']
-    #rules = [
-    #    Rule(LinkExtractor(allow=('/position_detail.php\?id=\d{1,}')), follow=True,
-    #         callback='parse_item')
-    #]
 
     def parse(self, response):
-        items = []
         base_url = get_base_url(response)
         sites_even = response.css('tr.even')
-        for site in sites_even:
+        sites_odd = (response.css('tr.odd'))
+        for site in sites_even + sites_odd:
             item = SpyItem()
-            item['title'] = site.css('.l.square a').xpath('text()').extract()
-            item['category'] = site.css('tr > td:nth-child(2)::text').extract()
-            item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
-            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
+            item['title'] = site.css('.l.square a').xpath('text()').extract_first()
+            # hot-position flag
+            hot = site.css('.l.square span').xpath('text()').extract_first()
+            if hot is not None:
+                item['hot'] = "yes"
+            else:
+                item['hot'] = "no"
+            # the position category can be blank
+            item['category'] = site.css('tr > td:nth-child(2)::text').extract_first(default='未分类职位')
+            item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract_first()
+            relative_url = site.css('.l.square a').xpath('@href').extract_first()
             item['link'] = urljoin(base_url, relative_url)
-            item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
-            item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
-            items.append(item)
-        print("-"*100)
-        print("base_url",base_url)
-        for item in items:
-            print(item)
-        print("-"*100)
-        return items
+            item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract_first()
+            item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract_first()
+            yield item
 
-        #def _process_request(self, request):
-        #    info('process ' + str(request))
-        #    return request
+        next_page = urljoin(base_url, response.xpath('//a[@id="next"]/@href').extract_first())
+        if next_page is not None:
+            yield response.follow(next_page, self.parse)

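Per the commit message, the crawl is started with scrapy crawl example, after which SpyPipeline writes example.json when the spider closes. A minimal sketch of driving the same crawl from a script instead of the CLI, assuming it is run from the project root so the spy project settings are picked up (the file name run_crawl.py is hypothetical):

# run_crawl.py (hypothetical helper, not part of this commit)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from spy.spiders.example import ExampleSpider

process = CrawlerProcess(get_project_settings())  # loads the spy project settings
process.crawl(ExampleSpider)
process.start()  # blocks until the crawl finishes; the pipeline then writes and closes example.json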