Skip to content

Commit 86b2c24

Browse files
committed
爬取中文版书籍图片
1 parent 7960a8a commit 86b2c24

File tree

207 files changed

+31
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

207 files changed

+31
-0
lines changed

douban.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# -*- coding:utf8 -*-
2+
"""
3+
Fetch the cover image of a given book from douban.
4+
"""
5+
6+
import re
7+
import urllib2
8+
import csv
9+
10+
# Path prefix for downloaded covers; the script joins it with a row's
# first CSV field and ".jpg" to build each output filename.
IMAGE_PATH = "./images-zh-cn/"
11+
12+
def get_image_from_douban(book_url, image_filename):
    """Download a book's cover image from its douban page to a local file.

    Fetches ``book_url``, takes the first URL in the returned HTML that
    matches the doubanio ``lpic`` JPEG pattern, downloads it and writes the
    raw bytes to ``image_filename``.

    Args:
        book_url: URL of the book page to scan for a cover image.
        image_filename: Path of the file the image bytes are written to.

    Raises:
        ValueError: If the page contains no URL matching the cover pattern.
            Errors raised by ``urllib2.urlopen`` or ``open`` propagate
            unchanged.
    """
    print(book_url)
    response = urllib2.urlopen(book_url)
    try:
        html = response.read()
    finally:
        response.close()

    re_image_url = r"https://img\d\.doubanio\.com/lpic/s\d*\.jpg"
    match = re.search(re_image_url, html)
    if match is None:
        # Fail with a message naming the page instead of an opaque
        # AttributeError on ``None.group()``.
        raise ValueError("no cover image URL found in " + book_url)

    # Download first, so a failed request does not leave an empty file behind.
    response = urllib2.urlopen(match.group())
    try:
        image = response.read()
    finally:
        response.close()

    # Binary mode: image bytes must not go through newline translation.
    with open(image_filename, 'wb') as ft:
        ft.write(image)
22+
23+
if __name__ == '__main__':
    # Script entry point: read "zh-cn.csv" and, for every row whose last
    # field looks like an http(s) URL, download that book's cover into
    # IMAGE_PATH, named after the row's first field.
    import os

    # open() does not create missing directories, so make sure the
    # destination exists before the first download.
    if not os.path.isdir(IMAGE_PATH):
        os.makedirs(IMAGE_PATH)

    with open("zh-cn.csv", 'r') as ff:
        spamreader = csv.reader(ff, delimiter=',')
        for line in spamreader:
            print(line)
            # Rows are expected to have exactly seven fields; blank or
            # malformed rows are skipped rather than aborting the whole run.
            if len(line) != 7:
                continue
            indexString, douban_url = line[0], line[-1]
            if re.match(r'^https?:/{2}\w.+$', douban_url):
                image_filename = IMAGE_PATH + indexString + ".jpg"
                get_image_from_douban(douban_url, image_filename)
46.9 KB
18.5 KB

images-zh-cn/3dgpdx12.jpg

22.5 KB

images-zh-cn/3dgpforkids.jpg

41 KB

images-zh-cn/3dmathprimer2.jpg

51.2 KB

images-zh-cn/CC-BY-SA_icon.png

3.51 KB

images-zh-cn/advancedgi2.jpg

32.9 KB

images-zh-cn/advancedgp.jpg

43.8 KB
23.1 KB

0 commit comments

Comments
 (0)