
Commit 42ea1a0

🎉 add the scarpy folder
1 parent 2d4079d commit 42ea1a0

33 files changed: +265 −9 lines changed

README.md

Lines changed: 15 additions & 9 deletions
@@ -1,14 +1,20 @@
 # pythonSpider
 some python spiders with BeautifulSoup & requests

-## [github](github)
-- Crawl a user's following or followers list [followXXXList.py](github/github_followXXXList.py)
-- Crawl a user's profile info [userInfo.py](github/github_userInfo.py)
+## BS4

-## [58同城](58tongcheng)
-- Get the list of all second-level menu URLs [urlLists.py](58tongcheng/tc_urlLists.py)
-- Get the detailed info under each URL [itemInfo.py](58tongcheng/tc_itemInfo.py)
+### [github](bs4/github)
+- Crawl a user's following or followers list [followXXXList.py](bs4/github/github_followXXXList.py)
+- Crawl a user's profile info [userInfo.py](bs4/github/github_userInfo.py)

-## [豆瓣](douban)
-- Get the short reviews of a single movie [comment.py](douban/douban_comment.py)
-- Get all full-size posters of a single movie [photosR.py](douban/douban_photosR.py)
+### [58同城](58tongcheng)
+- Get the list of all second-level menu URLs [urlLists.py](bs4/58tongcheng/tc_urlLists.py)
+- Get the detailed info under each URL [itemInfo.py](bs4/58tongcheng/tc_itemInfo.py)
+
+### [豆瓣](douban)
+- Get the short reviews of a single movie [comment.py](bs4/douban/douban_comment.py)
+- Get all full-size posters of a single movie [photosR.py](bs4/douban/douban_photosR.py)
+
+## scarpy
+
+### [wiki](scarpy/wikiSpider)
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

bs4/README.md

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
# pythonSpider
some python spiders with BeautifulSoup & requests

## [github](github)
- Crawl a user's following or followers list [followXXXList.py](github/github_followXXXList.py)
- Crawl a user's profile info [userInfo.py](github/github_userInfo.py)

## [58同城](58tongcheng)
- Get the list of all second-level menu URLs [urlLists.py](58tongcheng/tc_urlLists.py)
- Get the detailed info under each URL [itemInfo.py](58tongcheng/tc_itemInfo.py)

## [豆瓣](douban)
- Get the short reviews of a single movie [comment.py](douban/douban_comment.py)
- Get all full-size posters of a single movie [photosR.py](douban/douban_photosR.py)
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

bs4/github/spider4lang.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
import requests
import urllib
from bs4 import BeautifulSoup
import time
import string

url = 'https://github.com/Ovilia'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.933.400 QQBrowser/9.4.8699.400',
}
data = requests.get(url, headers=headers)
soup = BeautifulSoup(data.text, 'lxml')
print soup.prettify()
langs = soup.select('ol.pinned-repos-list > li > span.pinned-repo-item-content > p.text-gray.mb-0')
for i in langs:
    print i.get_text().replace(' ', '').strip('\n')
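Note that spider4lang.py is Python 2 code (bare `print` statements), and its CSS selector targets GitHub's 2016-era profile markup. As orientation only, a minimal Python 3 sketch of the same scrape (not part of the commit) might look like the following; the selector is copied from the script above and would likely need updating for GitHub's current HTML, and the profile URL is just the example already used above.

```python
# Python 3 sketch of spider4lang.py (not part of the commit).
# The CSS selector assumes GitHub's old "pinned-repos-list" markup.
import requests
from bs4 import BeautifulSoup

url = 'https://github.com/Ovilia'
headers = {'User-Agent': 'Mozilla/5.0'}  # any reasonable browser UA string

resp = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(resp.text, 'lxml')

# One entry per pinned repository: the language label under each card.
langs = soup.select('ol.pinned-repos-list > li > span.pinned-repo-item-content > p.text-gray.mb-0')
for lang in langs:
    print(lang.get_text().replace(' ', '').strip('\n'))
```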
File renamed without changes.

scarpy/wikiSpider/acticle.json

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
[
{"title": "Main Page"},
{"title": "Python (programming language)"}
]

scarpy/wikiSpider/acticles.json

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
[
{"title": "Python (programming language)"},
{"title": "Software developer"},
{"title": "Guido van Rossum"},
{"title": "Python Software Foundation"},
{"title": "Gradual typing"},
{"title": "Type system"},
{"title": "Strong and weak typing"},
{"title": "Duck typing"},
{"title": "Software design"},
{"title": "Software release life cycle"},
{"title": "Type system"},
{"title": "Reflection (computer programming)"},
{"title": "Procedural programming"},
{"title": "Imperative programming"},
{"title": "Object-oriented programming"},
{"title": "Main Page"},
{"title": "Functional programming"},
{"title": "Integrated Authority File"},
{"title": "United States Department of Labor"},
{"title": "TechRepublic"},
{"title": "Digital object identifier"},
{"title": "IEEE Computer Society"},
{"title": "Grail (web browser)"},
{"title": "Berkeley Software Distribution"},
{"title": "glob (programming)"},
{"title": "Belmont, California"},
{"title": "List of type designers"},
{"title": "Computer program"},
{"title": "Data type"},
{"title": "Type theory"},
{"title": "Run time (program lifecycle phase)"},
{"title": "Compile time"},
{"title": "Type system"},
{"title": "Strong and weak typing"},
{"title": "Uniqueness type"},
{"title": "Lucee"},
{"title": "ColdFusion Markup Language"},
{"title": "C Sharp 4.0"}
]
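acticle.json and acticles.json look like Scrapy feed-export output for the two spiders added further down in this commit (names `noarticle` and `article`). Presumably they were produced by running the crawl with `-o`, roughly as sketched below; this is an assumption, not something recorded in the commit.

```python
# Hypothetical reproduction of acticles.json via Scrapy's feed export;
# equivalent to running "scrapy crawl article -o acticles.json" from the
# scarpy/wikiSpider/ directory (older Scrapy versions may also need "-t json").
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'article', '-o', 'acticles.json'])
```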

scarpy/wikiSpider/scrapy.cfg

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = wikiSpider.settings

[deploy]
#url = http://localhost:6800/
project = wikiSpider
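As a side note: scrapy.cfg is what marks scarpy/wikiSpider/ as the project root. The `[settings]` entry points Scrapy at the `wikiSpider.settings` module, while the `[deploy]` section (with its commented-out scrapyd URL) is only read by the separate scrapyd/scrapyd-deploy tooling, so it has no effect on local `scrapy crawl` runs.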

scarpy/wikiSpider/wikiSpider/__init__.py

Whitespace-only changes.
151 Bytes
Binary file not shown.

scarpy/wikiSpider/wikiSpider/items.py

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html


from scrapy import Item, Field

#
# class WikispiderItem(scrapy.Item):
#     # define the fields for your item here like:
#     # name = scrapy.Field()
#     pass

class Article(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = Field()
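A quick usage note on the `Article` item (a sketch, not part of the commit): Scrapy `Item` subclasses behave like dictionaries restricted to their declared fields, so `title` is the only key this project can set.

```python
# Minimal illustration of the Article item defined above.
from wikiSpider.items import Article

article = Article(title='Main Page')
print(article['title'])      # -> Main Page
# article['url'] = '...'     # would raise KeyError: Article does not support field: url
```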
450 Bytes
Binary file not shown.
scarpy/wikiSpider/wikiSpider/pipelines.py

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class WikispiderPipeline(object):
    def process_item(self, item, spider):
        return item
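The generated `WikispiderPipeline` is a pass-through and is not activated anywhere (settings.py below leaves `ITEM_PIPELINES` commented out). For orientation, a hypothetical pipeline that would deduplicate titles — several entries such as "Type system" appear more than once in acticles.json — could look like this:

```python
# Hypothetical pipeline (not part of the commit): drop items whose title
# has already been seen during the crawl.
from scrapy.exceptions import DropItem


class DedupTitlePipeline(object):
    def __init__(self):
        self.seen_titles = set()

    def process_item(self, item, spider):
        if item['title'] in self.seen_titles:
            raise DropItem('duplicate title: %s' % item['title'])
        self.seen_titles.add(item['title'])
        return item
```

It would only take effect after adding `ITEM_PIPELINES = {'wikiSpider.pipelines.DedupTitlePipeline': 300}` to settings.py.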
scarpy/wikiSpider/wikiSpider/settings.py

Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-

# Scrapy settings for wikiSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'wikiSpider'

SPIDER_MODULES = ['wikiSpider.spiders']
NEWSPIDER_MODULE = 'wikiSpider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'wikiSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'wikiSpider.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'wikiSpider.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'wikiSpider.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
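Everything in this settings.py except `BOT_NAME`, `SPIDER_MODULES`, and `NEWSPIDER_MODULE` is left commented out, so the project runs on Scrapy's defaults (no robots.txt check, no download delay). A hedged suggestion of the handful of lines one would usually uncomment or add before crawling Wikipedia at any scale:

```python
# Suggested additions (not part of the commit): be polite to the target site.
ROBOTSTXT_OBEY = True          # respect robots.txt
DOWNLOAD_DELAY = 1             # roughly one request per second per domain
AUTOTHROTTLE_ENABLED = True    # back off automatically under latency
USER_AGENT = 'wikiSpider (+http://example.org/contact)'  # placeholder contact URL
```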
267 Bytes
Binary file not shown.
scarpy/wikiSpider/wikiSpider/spiders/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
159 Bytes
Binary file not shown.
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
from scrapy.contrib.spiders import CrawlSpider, Rule
from wikiSpider.items import Article
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy import log

class ArticleSpider(CrawlSpider):
    #log.start(logfile='log.txt', loglevel=log.CRITICAL)
    name="article"
    allowed_domains = ["en.wikipedia.org"]
    start_urls = ["http://en.wikipedia.org/wiki/Python_%28programming_language%29"]
    rules = [
        Rule(SgmlLinkExtractor(allow=('(/wiki/)((?!:).)*$'),), callback="parse_item", follow=True)
    ]

    def parse_item(self, response):
        item = Article()
        title = response.xpath('//h1/text()')[0].extract()
        print "Title is: "+title
        item['title'] = title
        return item
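This spider imports from `scrapy.contrib` and uses `SgmlLinkExtractor`, both of which were deprecated and later removed, and it relies on the Python 2 `print` statement. A minimal sketch of the same crawl against the current Scrapy API (assuming Scrapy ≥ 1.x; not part of the commit):

```python
# Modern-API sketch of the article spider above.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from wikiSpider.items import Article


class ArticleSpider(CrawlSpider):
    name = 'article'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Python_%28programming_language%29']
    rules = [
        # Follow internal /wiki/ links that contain no namespace colon.
        Rule(LinkExtractor(allow=r'(/wiki/)((?!:).)*$'), callback='parse_item', follow=True),
    ]

    def parse_item(self, response):
        item = Article()
        item['title'] = response.xpath('//h1/text()').extract_first()
        return item
```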
Binary file not shown.
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
from scrapy.selector import Selector
from scrapy import Spider
from wikiSpider.items import Article

class ArticleSpider(Spider):
    #log.start(logfile='log.txt', loglevel=log.CRITICAL)
    name="noarticle"
    allowed_domains = ["en.wikipedia.org"]
    start_urls = [
        "http://en.wikipedia.org/wiki/Main_Page",
        "http://en.wikipedia.org/wiki/Python_%28programming_language%29"
    ]

    def parse(self, response):
        item = Article()
        title = response.xpath('//*[@id="n-mainpage-description"]/a/text()')[0].extract()
        print "Title is: "+title
        item['title'] = title
        return item
Binary file not shown.
