Skip to content

Commit 8de3d30

Browse files
committed
'增加模拟登录'
1 parent da33c45 commit 8de3d30

File tree

2 files changed

+75
-16
lines changed

2 files changed

+75
-16
lines changed

爬虫脚本/lagou.py

Lines changed: 56 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,65 @@
22
import json
33
import redis
44
from pyquery import PyQuery as pq
5+
import hashlib
6+
import re
7+
8+
# HTTP session object shared by the request helpers in this module.
session = requests.session()

# Base request headers; login() copies these and adds its own entries.
HEADERS = {
    'Referer': 'https://passport.lagou.com/login/login.html',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:51.0) Gecko/20100101 Firefox/51.0',
}

# Module-level defaults, empty until a caller fills them in.
# NOTE(review): insert() appears to pass these on to fetch_detail() - confirm.
headers = {}
cookies = {}
18+
19+
def get_password(passwd, salt='veenike'):
    """Return the salted double-MD5 digest of *passwd* sent in the login form.

    The password is hashed with MD5, the hex digest is wrapped with *salt*
    on both sides, and the wrapped string is hashed with MD5 again.

    Per the original author's note, the default salt ``'veenike'`` was found
    in the site's ``main.html_aio_f95e644.js`` file.

    Args:
        passwd: Plain-text password as a ``str``.
        salt: String placed before and after the first digest. Defaults to
            the value the original code hard-coded.

    Returns:
        The hex digest (32 lowercase hex characters) of the second MD5 pass.
    """
    first_digest = hashlib.md5(passwd.encode('utf-8')).hexdigest()
    salted = salt + first_digest + salt
    return hashlib.md5(salted.encode('utf-8')).hexdigest()
25+
26+
def get_token():
    """Fetch the login page and extract the anti-forgery token and code.

    The page is requested through the module-level ``session`` with the
    module-level ``HEADERS``. The returned HTML is searched for the
    ``X_Anti_Forge_Token = '...'`` and ``X_Anti_Forge_Code = '...'``
    assignments.

    Returns:
        A ``(token, code)`` tuple of strings. Both elements are empty
        strings when the assignments are not found in the page.
    """
    login_page = 'https://passport.lagou.com/login/login.html'
    response = session.get(login_page, headers=HEADERS)

    # re.search scans for the pattern directly; the original anchored
    # re.match behind a leading greedy '.*', which forces a scan to the end
    # of the document followed by backtracking. The non-greedy gap pairs the
    # token with the nearest following code.
    found = re.search(
        r"X_Anti_Forge_Token = '(.*?)';.*?X_Anti_Forge_Code = '(\d+?)'",
        response.text,
        re.DOTALL,
    )
    if found is None:
        return "", ""
    return found.group(1), found.group(2)
36+
37+
def login(username, passwd):
    """Log in through the module-level ``session``; exit the process on failure.

    Obtains the anti-forgery token and code with ``get_token()``, posts the
    username and the digest from ``get_password()`` to the login endpoint,
    and checks the ``state`` field of the JSON reply.

    Args:
        username: Account name posted as ``username``.
        passwd: Plain-text password; it is hashed before being sent.

    Raises:
        SystemExit: With status ``-1`` when the reply is not JSON, is not a
            JSON object, or its ``state`` field is not ``1``.
    """
    # Function-scope import: sys.exit() is always available, whereas the bare
    # exit() helper is injected by the ``site`` module and may be missing.
    import sys

    forge_token, forge_code = get_token()

    login_headers = HEADERS.copy()
    # NOTE(review): the 'Anit' spelling in the header names is kept exactly as
    # in the original; presumably it is what the server expects - confirm.
    login_headers.update({
        'X-Requested-With': 'XMLHttpRequest',
        'X-Anit-Forge-Token': forge_token,
        'X-Anit-Forge-Code': forge_code,
    })

    form = {
        'username': username,
        'password': get_password(passwd),
        'request_form_verifyCode': '',
        'submit': '',
    }
    response = session.post(
        'https://passport.lagou.com/login/login.json',
        data=form,
        headers=login_headers,
    )

    # A reply that is not valid JSON is treated as a failed login instead of
    # surfacing as an unrelated decode error.
    try:
        result = response.json()
    except ValueError:
        result = None

    # .get() avoids a KeyError when the reply has no 'state' field.
    if not isinstance(result, dict) or result.get('state') != 1:
        print("登录失败,退出")
        sys.exit(-1)
52+
53+
def get_cookies():
    """Request the service-ticket grant page and return the session cookies.

    Returns:
        The cookies currently held by the module-level ``session``,
        converted to a plain ``dict`` by ``requests.utils``.
    """
    grant_url = "https://passport.lagou.com/grantServiceTicket/grant.html"
    session.get(grant_url)
    cookie_jar = session.cookies
    return requests.utils.dict_from_cookiejar(cookie_jar)
556

657
"""
758
获取职位列表
859
"""
9-
def fetch(url,headers={}):
60+
def fetch(url,headers,cookies):
1061
#用户登录后的cookie
1162
try:
12-
res = requests.get(url,headers=headers)
63+
res = session.get(url,headers=headers,cookies=cookies)
1364
except Exception as e:
1465
print(e)
1566
return False
@@ -43,15 +94,15 @@ def insert(data):
4394
resA = r.sadd("company",id)
4495

4596
if resA == 1:
46-
res = fetchDetail(positionId,headers=headers)
97+
res = fetch_detail(positionId,headers,cookies)
4798
r.hmset("postion:"+str(positionId),res)
4899
print(str(positionId)+"已经写入redis中")
49100
r.hmset(key,t)
50101

51102
# 获取职位详情
52-
def fetchDetail(id,headers={}):
103+
def fetch_detail(id,headers,cookies):
53104
detailUrl = 'http://m.lagou.com/jobs/'+str(id)+".html"
54-
res = requests.get(detailUrl,headers=headers)
105+
res = session.get(detailUrl,headers=headers,cookies=cookies)
55106
if res.status_code != 200:
56107
print("请求出错"+str(res.text))
57108
return False
@@ -68,18 +119,7 @@ def fetchDetail(id,headers={}):
68119
return d
69120
# print(q(".content").text())
70121

71-
if __name__ == '__main__':
72-
73-
#职位列表的json接口 需要登陆 带cookie
74-
page = 1
75-
pageSize = 100#最大支持100
76-
API_URL = 'http://m.lagou.com/listmore.json?pageNo='+str(page)+'&pageSize='+str(pageSize)
77-
#cookie替换成自己登录后的cookie
78-
cookie = 'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1531361076,1532319384; LGUID=20180723131853-e4343a35-8e37-11e8-9ee6-5254005c3644; LGSID=20180723135216-8dffe0ce-8e3c-11e8-a327-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fpassport.lagou.com%2Flogin%2Flogin.html; LG_LOGIN_USER_ID=564f3b59e53c65db601e411a55fa01278ce7cf79cb588be8; _putrc=51AF6FA824D5BE21; login=true; unick=%E7%94%B0%E9%9B%B7; gate_login_token=a47b874276333f9f31e46da6185e63a0935573d160846c4d; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1532325155; LGRID=20180723135240-9c59f5b7-8e3c-11e8-a327-525400f775ce'
79-
headers = {'user-agent': 'my-app/0.0.1','cookie':cookie}
80122

81-
data = fetch(API_URL,headers=headers)
82-
insert(data)
83123

84124

85125

爬虫脚本/lagoutest.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/usr/bin/env python
2+
# -*- coding:utf-8 -*-
3+
from lagou import login,get_cookies,fetch_detail,fetch
4+
5+
if __name__ == "__main__":
    # Credentials are left empty in the original script.
    username = ''
    passwd = ''

    # Log in, then print the cookies held by the session.
    login(username, passwd)
    cookies = get_cookies()
    print(cookies)

    # Query the job-list endpoint with the cookies obtained above.
    url = 'https://m.lagou.com/listmore.json?pageNo=1&pageSize=10'
    headers = {}
    try:
        print("数据如下")
        data = fetch(url, headers=headers, cookies=cookies)
        print(data)
    except Exception as err:
        print(err)

0 commit comments

Comments
 (0)