1
+ import random
2
+ import time
3
+
4
+ import requests
5
+ from openpyxl import Workbook
6
+ import pymysql .cursors
7
+
8
+
9
def get_conn(host='localhost', user='root', password='root',
             database='python', charset='utf8mb4'):
    """Open a MySQL connection whose cursors return rows as dicts.

    Calling this with no arguments connects exactly as the script always
    has (local server, ``root``/``root``, schema ``python``); the
    parameters exist so other credentials can be supplied without editing
    the function.

    Args:
        host: MySQL server host name.
        user: Account to log in as.
        password: Password for ``user``.
        database: Schema to select after connecting.
        charset: Connection character set.

    Returns:
        The connection object returned by ``pymysql.connect``. The caller
        is responsible for closing it.
    """
    return pymysql.connect(
        host=host,
        user=user,
        password=password,
        # `database` is the documented keyword; `db` is a deprecated alias.
        database=database,
        charset=charset,
        cursorclass=pymysql.cursors.DictCursor,
    )
18
+
19
+
20
def insert(conn, info):
    """Insert one scraped job record into the `python` table and commit.

    Args:
        conn: Open database connection providing ``cursor()``, ``commit()``
            and ``rollback()``.
        info: Seven values in column order: shortname, fullname,
            industryfield, companySize, salary, city, education.

    Raises:
        Exception: Whatever the driver raises from ``execute`` or
            ``commit``. The transaction is rolled back before the error
            is re-raised.
    """
    sql = (
        "INSERT INTO `python` "
        "(`shortname`, `fullname`, `industryfield`, `companySize`, "
        "`salary`, `city`, `education`) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s)"
    )
    try:
        with conn.cursor() as cursor:
            # Values are bound by the driver, never formatted into the SQL.
            cursor.execute(sql, info)
        conn.commit()
    except Exception:
        # Leave the connection clean so the caller can keep using it for
        # the next row instead of inheriting a half-finished transaction.
        conn.rollback()
        raise
26
+
27
+
28
def get_json(url, page, lang_name, timeout=10):
    """Fetch one page of job search results and flatten it into rows.

    Sends a form-encoded POST to ``url`` and reads the job entries from
    ``content.positionResult.result`` in the JSON reply.

    Args:
        url: Search endpoint URL (the caller has already added the city).
        page: Page number, sent as the ``pn`` form field.
        lang_name: Search keyword, sent as the ``kd`` form field.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        A list with one 7-item list per job: company short name, company
        full name, industry field, company size, salary, city, education.
        A field missing from an entry is replaced by '无'.

    Raises:
        requests.HTTPError: The server answered with an error status.
        KeyError: The JSON reply does not contain
            ``content.positionResult.result``.
    """
    # No explicit Content-Length: requests computes it from the encoded
    # form body, and a fixed value is wrong for most page/keyword pairs.
    headers = {
        'Host': 'www.lagou.com',
        'Connection': 'keep-alive',
        'Origin': 'https://www.lagou.com',
        'X-Anit-Forge-Code': '0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'X-Anit-Forge-Token': 'None',
        'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
    }
    form = {'first': 'false', 'pn': page, 'kd': lang_name}

    response = requests.post(url, data=form, headers=headers, timeout=timeout)
    # Fail on 4xx/5xx here rather than on a confusing JSON or key error.
    response.raise_for_status()
    payload = response.json()
    positions = payload['content']['positionResult']['result']

    # Output column order; must match the column order used by `insert`.
    fields = (
        'companyShortName',
        'companyFullName',
        'industryField',
        'companySize',
        'salary',
        'city',
        'education',
    )
    return [[job.get(field, '无') for field in fields] for job in positions]
60
+
61
+
62
def main(save_to_db=False):
    """Scrape Python job listings for five cities into an Excel workbook.

    For each city, 30 result pages are fetched with ``get_json`` and every
    row is appended to a single worksheet. The workbook is written to
    '<lang_name>职位信息.xlsx' when the run ends, including when it ends
    with an error, so rows already collected are not lost.

    Args:
        save_to_db: When true, also open a database connection with
            ``get_conn`` and store every row through ``insert``. The
            default is off, matching the script's previous state in which
            the insert call was commented out. Unlike before, no database
            connection is opened when it is off.
    """
    lang_name = 'python'
    cities = ['北京', '上海', '广州', '深圳', '杭州']
    pages_per_city = 30

    wb = Workbook()
    # One sheet holds every city's rows; set it up once, not per city.
    ws1 = wb.active
    ws1.title = lang_name

    conn = get_conn() if save_to_db else None
    try:
        for city in cities:
            url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(city)
            for page in range(1, pages_per_city + 1):
                rows = get_json(url, page, lang_name)
                # Report the page that was just fetched.
                print(city, 'page', page)
                for row in rows:
                    if conn is not None:
                        insert(conn, tuple(row))
                    ws1.append(row)
                # Random pause between requests to stay under the site's
                # rate limiting.
                time.sleep(random.randint(10, 20))
    finally:
        try:
            # Save whatever was collected, even after a failure.
            wb.save('{}职位信息.xlsx'.format(lang_name))
        finally:
            if conn is not None:
                conn.close()
81
+
82
# Start the scraper only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
0 commit comments