
Commit 2520ad9

lagou.py

1 parent 8091eef

File tree

4 files changed: 83 additions & 133 deletions


Crawer/README.MD

Lines changed: 0 additions & 2 deletions
This file was deleted.

Crawer/meizitu.py

Lines changed: 0 additions & 77 deletions
This file was deleted.

Crawer/qiubai_crawer.py

Lines changed: 0 additions & 54 deletions
This file was deleted.

爬虫集合/lagou.py

Lines changed: 83 additions & 0 deletions
import random
import time

import requests
from openpyxl import Workbook
import pymysql.cursors


def get_conn():
    '''Open a connection to the local MySQL database.'''
    conn = pymysql.connect(host='localhost',
                           user='root',
                           password='root',
                           db='python',
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    return conn


def insert(conn, info):
    '''Write one job record into the database.'''
    with conn.cursor() as cursor:
        sql = ("INSERT INTO `python` (`shortname`, `fullname`, `industryfield`, "
               "`companySize`, `salary`, `city`, `education`) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s)")
        cursor.execute(sql, info)
    conn.commit()


def get_json(url, page, lang_name):
    '''Return the list of job records on the requested results page.'''
    # Content-Length is not set by hand; requests computes the correct value for the form body.
    headers = {
        'Host': 'www.lagou.com',
        'Connection': 'keep-alive',
        'Origin': 'https://www.lagou.com',
        'X-Anit-Forge-Code': '0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'X-Anit-Forge-Token': 'None',
        'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
    }
    data = {'first': 'false', 'pn': page, 'kd': lang_name}
    json = requests.post(url, data, headers=headers).json()
    list_con = json['content']['positionResult']['result']
    info_list = []
    for i in list_con:
        # '无' ("none") is the fallback value when a field is missing.
        info = [
            i.get('companyShortName', '无'),
            i.get('companyFullName', '无'),
            i.get('industryField', '无'),
            i.get('companySize', '无'),
            i.get('salary', '无'),
            i.get('city', '无'),
            i.get('education', '无'),
        ]
        info_list.append(info)
    return info_list


def main():
    lang_name = 'python'
    wb = Workbook()    # open an Excel workbook
    conn = get_conn()  # open the database connection; comment this out if not using MySQL
    for i in ['北京', '上海', '广州', '深圳', '杭州']:  # five cities
        page = 1
        ws1 = wb.active
        ws1.title = lang_name
        url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(i)
        while page < 31:  # 30 result pages per city
            info = get_json(url, page, lang_name)
            page += 1
            print(i, 'page', page)
            time.sleep(random.randint(10, 20))  # random pause to avoid being rate-limited
            for row in info:
                # insert(conn, tuple(row))  # uncomment to also store the record in MySQL
                ws1.append(row)
    conn.close()  # close the database connection; comment this out if not using MySQL
    wb.save('{}职位信息.xlsx'.format(lang_name))  # e.g. python职位信息.xlsx ("python job listings")


if __name__ == '__main__':
    main()
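
The insert() helper above assumes a MySQL database named python containing a table also named python with the seven columns listed in the INSERT statement. That schema is not part of this commit, so the following is only a minimal setup sketch reusing the same connection parameters as get_conn(); the column types and lengths are assumptions.

import pymysql

# Assumed schema for the `python` table (not in this commit); adjust types and lengths as needed.
# The `python` database itself is assumed to exist already.
DDL = """
CREATE TABLE IF NOT EXISTS `python` (
    `id`            INT AUTO_INCREMENT PRIMARY KEY,
    `shortname`     VARCHAR(64),
    `fullname`      VARCHAR(255),
    `industryfield` VARCHAR(255),
    `companySize`   VARCHAR(64),
    `salary`        VARCHAR(64),
    `city`          VARCHAR(64),
    `education`     VARCHAR(64)
) DEFAULT CHARSET=utf8mb4
"""

conn = pymysql.connect(host='localhost', user='root', password='root',
                       db='python', charset='utf8mb4')
try:
    with conn.cursor() as cursor:
        cursor.execute(DDL)
    conn.commit()
finally:
    conn.close()

With the table in place, uncommenting the insert(conn, tuple(row)) line in main() stores each scraped row in MySQL as well as in the Excel sheet.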
