Commit 20132ea

add beidaNews spider
1 parent a9ef991 commit 20132ea

File tree

7 files changed: +548 -0 lines changed

0-Spider/beidaNewsSpider/.idea/beidaSpider.iml

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default.

0-Spider/beidaNewsSpider/.idea/inspectionProfiles/profiles_settings.xml

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default.

0-Spider/beidaNewsSpider/.idea/misc.xml

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default.

0-Spider/beidaNewsSpider/.idea/modules.xml

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default.

0-Spider/beidaNewsSpider/.idea/workspace.xml

Lines changed: 409 additions & 0 deletions
Some generated files are not rendered by default.

0-Spider/beidaNewsSpider/README.md

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
Crawls all articles from the Peking University headline-news (北大要闻) section.

url: http://pkunews.pku.edu.cn/xxfz/node_185.htm

news.sql is a backup of the scraped data (MySQL).
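
Before running the full spider, a quick smoke test of the listing page can confirm the site is reachable and still uses the markup spider.py expects. A minimal sketch, assuming network access and the table id nav2_7Tabcontent_10 that spider.py selects on (this check script is not part of the commit):

# coding: utf-8
# Smoke test: fetch the first listing page and count article links.
# Assumes the same table id that spider.py relies on.
import urllib.request
from bs4 import BeautifulSoup

url = "http://pkunews.pku.edu.cn/xxfz/node_185.htm"
html = urllib.request.urlopen(url).read()
body = BeautifulSoup(html, "html.parser")
table = body.find('table', id="nav2_7Tabcontent_10")
links = table.find_all('a') if table is not None else []
print("found %d article links" % len(links))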

0-Spider/beidaNewsSpider/spider.py

Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
# coding: utf-8

import time
import urllib.request

import pymysql
from bs4 import BeautifulSoup

'''
Statements to create the database and table:

create database beidaspider default charset utf8;

create table news(
    title varchar(100),
    pub_date date,
    from_ varchar(50),
    content varchar(20000)
);

Database backup:
/usr/bin/mysqldump -uroot -proot beidaspider --default-character-set=utf8 --opt -Q -R >./news.sql

Database restore:
/usr/bin/mysql -uroot -proot beidaspider <./news.sql
'''


class BeiDaSpider:
    # Initialization
    def __init__(self):
        self.root_href = "http://pkunews.pku.edu.cn/xxfz/"

    # Connect to the database
    def connMysql(self):
        conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                               passwd='root', db='beidaspider', charset='utf8')
        cur = conn.cursor()
        return cur, conn

    # Write one article to the database and append a copy to news.txt
    def write(self, title, date, from_, content):
        cur, conn = self.connMysql()
        # Use a parameterized query so quotes in the article text
        # cannot break (or inject into) the SQL statement.
        sql = "INSERT INTO news (title, pub_date, from_, content) VALUES (%s, %s, %s, %s)"
        cur.execute(sql, (title, date, from_, content))
        conn.commit()
        conn.close()

        with open("news.txt", "a", encoding="utf-8") as fp:
            fp.write(title + "\t" + date + "\t" + from_ + "\t" + content + "\n")

    # Parse one listing page and return the links of all news items on it
    def parse_onePage_href(self, url):
        res = urllib.request.urlopen(url)
        body = BeautifulSoup(res.read(), "html.parser")
        table = body.find('table', cellspacing="0", cellpadding="0", id="nav2_7Tabcontent_10")
        a_list = table.find_all('a')
        href_list = []
        for a in a_list:
            href_list.append(self.root_href + a.get('href'))
        return href_list

    # Parse one news page and extract its data
    def parse_oneNew(self, url):
        res = urllib.request.urlopen(url)
        body = BeautifulSoup(res.read(), "html.parser")

        # Title
        title = body.title.get_text().strip()
        print(title)

        # Date and source
        dataAndfrom = body.find('table', width="560", border="0", cellspacing="0", cellpadding="0")
        datafrom_list = dataAndfrom.find_all('tr')[0].get_text().strip().split("  ")
        date = datafrom_list[0].split(":")[1].strip()
        from_ = datafrom_list[1].split(":")[1].strip()
        print(date)

        # Article body
        content = body.find('table', width="710", border="0", cellspacing="0",
                            cellpadding="0", style="margin-left:15px;") \
                      .find_all('tr')[3].get_text().strip().replace("\n", " ")

        self.write(title, date, from_, content)

    def start(self):
        # The section spans 20 listing pages; the first has no index suffix.
        for i in range(1, 21):
            if i == 1:
                page = self.root_href + "node_185.htm"
            else:
                page = self.root_href + "node_185_" + str(i) + ".htm"
            for href in self.parse_onePage_href(page):
                self.parse_oneNew(href)
                time.sleep(5)   # pause between articles
            time.sleep(10)      # longer pause between listing pages


if __name__ == "__main__":
    spi = BeiDaSpider()
    spi.start()
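
After a run, a short query against the news table verifies that rows actually landed in MySQL. A minimal sketch, assuming the same local instance and root/root credentials that spider.py uses:

# coding: utf-8
# Verification: count scraped rows and show the date range covered.
# Assumes the beidaspider database created per the docstring in spider.py.
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='root', db='beidaspider', charset='utf8')
cur = conn.cursor()
cur.execute("SELECT COUNT(*), MIN(pub_date), MAX(pub_date) FROM news")
count, first, last = cur.fetchone()
print("%d articles, from %s to %s" % (count, first, last))
conn.close()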
