def wake_up(request, mac='DC-4A-3E-78-3E-0A'):
    """Wake-on-LAN view: broadcast a magic packet so the target machine powers on.

    Args:
        request: incoming HTTP request (Django view signature; not used here).
        mac: target NIC address in 'XX-XX-XX-XX-XX-XX' form.

    Returns:
        HttpResponse: an empty response, whether or not the broadcast worked
        (best-effort semantics, matching the original design).

    Raises:
        ValueError: if `mac` is not 17 characters long.
    """
    # Imported locally: the original file defined no imports at all, so the
    # first call raised NameError on `socket`/`struct`/`time`.
    import socket
    import struct
    import time

    MAC = mac
    BROADCAST = "192.168.0.255"  # NOTE(review): assumes a /24 LAN — confirm the subnet
    if len(MAC) != 17:
        raise ValueError("MAC address should be set as form 'XX-XX-XX-XX-XX-XX'")
    mac_address = MAC.replace("-", '')
    # Magic packet: six 0xFF bytes followed by the MAC repeated (16 repeats
    # required by the protocol; 20 is harmless surplus).
    data = ''.join(['FFFFFFFFFFFF', mac_address * 20])

    # Convert the hex string into the raw byte payload.
    send_data = b''
    for i in range(0, len(data), 2):
        send_data = b''.join([send_data, struct.pack('B', int(data[i: i + 2], 16))])

    # Broadcast three times, one second apart, to reduce the chance of a lost
    # packet.  The original returned *before* its print statements, making
    # them unreachable; fixed here.
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
            for attempt in range(3):
                if attempt:
                    time.sleep(1)
                sock.sendto(send_data, (BROADCAST, 7))
        print("Done")
        return HttpResponse()
    except Exception as e:
        print(e)
        return HttpResponse()
-2.[模拟登录知乎](https://github.com/injetlee/demo/blob/master/login_zhihu.py) +## 左手代码,右手制造,分享智能制造相关技术和业务,包括 Python, C#, 数据库,工业大数据、物联网技术及MES/ERP/SAP等系统。 -3.[对目录下所有文件计数](https://github.com/injetlee/demo/blob/master/countFile.py) +## 可以通过微信公众号加我好友 +![二维码](qrcode.jpg) -4.[爬取豆瓣电影top250](https://github.com/injetlee/demo/blob/master/douban_movie.py) +# 内容列表 -5.[Excel文件读入数据库](https://github.com/injetlee/demo/blob/master/excelToDatabase.py) +## [Python微信公众号开发](https://github.com/injetlee/Python/tree/master/wechat) -6.[爬取拉勾网职位信息](https://github.com/injetlee/demo/blob/master/lagouSpider.py) +- ### Python 微信公众号开发—小白篇(一) -7.[批量修改文件名](https://github.com/injetlee/demo/blob/master/ModifyFilename.py) +- ### Python 公众号开发—颜值检测 -8.[读写excel](https://github.com/injetlee/demo/blob/master/readExcel.py) +## [Python 爬虫入门合集](https://github.com/injetlee/Python/tree/master/%E7%88%AC%E8%99%AB%E9%9B%86%E5%90%88) -9.[下载必应首页图片,只下载当天的,一张。](https://github.com/injetlee/Python/blob/master/biyingSpider.py) +- ### Python 爬虫入门(一)——爬取糗事百科 + +- ### Python 爬虫入门(二)——爬取妹子图 + +- ### Python 爬虫——Python 岗位分析报告 + +- ### Python 爬虫利器——Selenium介绍 + +- ### Python 爬虫—— 抖音 App 视频抓包爬取 + +## [Python 黑魔法](https://github.com/injetlee/Python/tree/master/Python%20%E9%BB%91%E9%AD%94%E6%B3%95) + +- ### Python 远程关机 + +## SQL 数据库 + +- [1 小时 SQL 极速入门(一)](https://mp.weixin.qq.com/s/Lx4B349OlD49ihJPnB6YiA) +- [1 小时 SQL 极速入门(二)](https://mp.weixin.qq.com/s/D-CEtGYomne5kV_Ji4lodA) +- [1 小时 SQL 极速入门(三)](https://mp.weixin.qq.com/s/7aJqrhCNcvnt2gO3p5P50Q) +- [SQL 高级查询——(层次化查询,递归)](https://mp.weixin.qq.com/s/R9Yldd-5AK4ObRA9Lfbz-Q) +- [GROUP BY高级查询,ROLLUP,CUBE,GROUPPING详解](https://mp.weixin.qq.com/s/_OK6dtHGhp7ukC2pe1ginQ) +- [SQL 行转列,列转行](https://mp.weixin.qq.com/s/xOFIg42FQhNpyg94ajhtqQ) + +## 其他 + +- 1.[获取当前CPU状态,存储到Influxdb](https://github.com/injetlee/demo/blob/master/CpuToInfluxdb.py) + +- 2.[模拟登录知乎](https://github.com/injetlee/demo/blob/master/login_zhihu.py) + +- 3.[对目录下所有文件计数](https://github.com/injetlee/demo/blob/master/countFile.py) 
def recognition_captcha(data):
    ''' Send the captcha image to Tencent YouTu OCR and return its result dict. '''

    if data is None:
        return None

    file_id = str(uuid.uuid1())
    gif_name = 'captcha_' + file_id + '.gif'
    png_name = 'captcha_' + file_id + '.png'

    raw = base64.b64decode(data.encode('utf-8'))
    with open(gif_name, 'wb') as fh:
        fh.write(raw)

    # YouTu credentials — register with the service to obtain real values.
    appid = 'appid'
    secret_id = 'secret_id'
    secret_key = 'secret_key'
    userid = 'userid'
    end_point = TencentYoutuyun.conf.API_YOUTU_END_POINT

    youtu = TencentYoutuyun.YouTu(appid, secret_id, secret_key, userid, end_point)

    # Zhihu serves a GIF, but YouTu only accepts JPG/PNG/BMP — convert with Pillow.
    im = Image.open(gif_name)
    im.save(png_name, "png")
    im.close()

    # data_type: 0 = local path, 1 = URL.
    return youtu.generalocr(png_name, data_type=0, seq='')


def get_captcha(sessiona, headers):
    ''' Poll Zhihu until a captcha is offered, then fetch its base64 image data. '''

    captcha_required = False
    while not captcha_required:
        try:
            sessiona.get('https://www.zhihu.com/signin', headers=headers)  # cookie: _xsrf
            ticket_resp = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn', headers=headers)  # cookie: capsion_ticket
            # {"show_captcha": false} means no captcha is required yet.
            captcha_required = json.loads(ticket_resp.text)["show_captcha"]
            time.sleep(0.5 + random.randint(1, 9) / 10)
        except Exception:
            continue

    try:
        # Note: fetching the captcha payload is a PUT, not a GET.
        img_resp = sessiona.put('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn', headers=headers)
        return json.loads(img_resp.text)["img_base64"]
    except Exception:
        return None


def create_point(point_data, confidence):
    ''' Build the click-point payload for the captcha verification request.

    Each recognized glyph whose OCR confidence is below *confidence*
    (upside-down Chinese characters score low) is mapped to its fixed
    on-screen coordinate.  Returns the JSON payload string, or an empty
    list when 0 or more than 2 glyphs qualify (too risky to submit).
    '''

    # Empirically the 7 glyphs sit 25px apart on a fixed baseline.
    slots = {
        1: [20.5, 25.1875], 2: [45.5, 25.1875], 3: [70.5, 25.1875],
        4: [95.5, 25.1875], 5: [120.5, 25.1875], 6: [145.5, 25.1875],
        7: [170.5, 25.1875],
    }
    picked = []

    for idx, word in enumerate(point_data['items'][0]['words'], start=1):
        if word['confidence'] >= confidence:
            continue
        try:
            picked.append(slots[idx])
        except KeyError:
            # More glyphs than known slots; ignore the extras.
            continue

    if len(picked) == 0 or len(picked) > 2:
        return []  # success rate is only good with 1-2 inverted glyphs

    payload = json.dumps({'img_size': [200, 44], 'input_points': picked})
    print(payload)
    return payload
def bolting(k_low,k_hi,k3_confidence):
    '''Fetch captchas until one looks safe, submit it, and return elapsed seconds.

    Keeps pulling captcha images until the OCR result reports exactly 21
    glyph slots and a rotation angle inside [k_low, k_hi], builds a
    click-point payload for the low-confidence (inverted) glyphs, and POSTs
    it.  Loops until Zhihu answers {"success": true}.
    NOTE(review): relies on module-level `sessiona` and `headers`.
    '''

    start = time.time()

    is_success = False
    while(is_success is not True):

        points_len = 1
        angle = -20
        img_ko = []

        # Keep sampling captchas until the OCR output looks complete
        # (21 recognized slots) and the angle is within the accepted band.
        while(points_len != 21 or angle < k_low or angle > k_hi ):
            img_data = get_captcha(sessiona,headers)
            img_ko = recognition_captcha(img_data)

            ## json.dumps escapes Chinese to ASCII by default; pass
            ## ensure_ascii=False to see the real characters.
            # img_ko_json = json.dumps(img_ko , indent =2 ,ensure_ascii=False )
            # img_ko_json = img_ko_json.encode('raw_unicode_escape') ## workaround for python3 + YouTu quirks

            # with open( "json.txt" ,'wb') as fb:
            #     fb.write( img_ko_json )

            try:
                points_len = len(img_ko['items'][0]['itemstring'])
                angle = img_ko['angle']
            except Exception:
                # OCR failed or came back malformed; force another iteration.
                points_len = 1
                angle = -20
                continue

        # print(img_ko_json.decode('utf8')) ## stdout is utf-8; decode before printing
        # print('-'*50)

        input_text = create_point( img_ko ,k3_confidence )
        if(type(input_text) == type([])):
            # create_point returns a list when the hit count is risky; retry.
            continue

        data = {
            "input_text":input_text
        }

        # Submitting too fast is rejected with
        # {"code":120005,"name":"ERR_VERIFY_CAPTCHA_TOO_QUICK"} — pretend to
        # think for about five seconds.
        time.sleep( 4 + random.randint(1,9)/10 )
        try:
            resp5 = sessiona.post('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',data,headers=headers)
        except Exception:
            continue

        print("angle: "+ str(angle) )
        print(BeautifulSoup(resp5.content ,'html.parser'))  # {"success":true} when verified
        print('-'*50)
        try:
            is_success = json.loads(resp5.text)["success"]
        except KeyError:
            continue

    end = time.time()

    return end-start
def get_captcha(data, need_cap):
    ''' Save the captcha image and ask the operator to type it in.

    Returns None when no captcha is required, otherwise the text entered
    after inspecting captcha.gif.
    '''
    if need_cap is False:
        return None
    with open('captcha.gif', 'wb') as fh:
        fh.write(data)
    return input('captcha:')


def get_signature(grantType, clientId, source, timestamp):
    ''' Compute the HMAC-SHA1 signature Zhihu's OAuth sign-in endpoint expects.

    The fields are fed to the MAC in a fixed order: grant type, client id,
    source, timestamp.
    '''
    mac = hmac.new(b'd1b964811afb40118a12068ff74a12f4', None, sha1)
    for field in (grantType, clientId, source, timestamp):
        mac.update(str.encode(field))
    return str(mac.hexdigest())


def login(username, password, oncaptcha, sessiona, headers):
    ''' Perform the Zhihu OAuth login and return the raw sign-in response body. '''

    sessiona.get('https://www.zhihu.com/signin', headers=headers)  # cookie: _xsrf
    ticket_resp = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn', headers=headers)  # cookie: capsion_ticket
    need_cap = json.loads(ticket_resp.text)["show_captcha"]  # {"show_captcha":false} => none needed

    grantType = 'password'
    clientId = 'c3cef7c66a1843f8b3a9e6a1e3160e20'
    source = 'com.zhihu.web'
    timestamp = str((time.time() * 1000)).split('.')[0]  # the signature varies only with this
    captcha_content = sessiona.get('https://www.zhihu.com/captcha.gif?r=%d&type=login' % (time.time() * 1000), headers=headers).content

    data = {
        "client_id": clientId,
        "grant_type": grantType,
        "timestamp": timestamp,
        "source": source,
        "signature": get_signature(grantType, clientId, source, timestamp),  # request signature
        "username": username,
        "password": password,
        "lang": "cn",
        "captcha": oncaptcha(captcha_content, need_cap),  # image captcha answer
        "ref_source": "other_",
        "utm_source": ""
    }

    print("**2**: " + str(data))
    print("-" * 50)
    resp = sessiona.post('https://www.zhihu.com/api/v3/oauth/sign_in', data, headers=headers).content
    print(BeautifulSoup(resp, 'html.parser'))
    print("-" * 50)
    return resp
def download_page(url):
    """Fetch *url* with a desktop Firefox User-Agent and return the HTML text."""
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    response = requests.get(url, headers=headers)
    return response.text


def get_content(html, page):
    """Extract every joke on one listing page and append it to qiubai.txt."""
    output = """第{}页 作者:{} 性别:{} 年龄:{} 点赞:{} 评论:{}\n{}\n------------\n"""
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.find(id='content-left')
    for item in container.find_all('div', class_="article"):
        author = item.find('h2').string  # author name
        content = item.find('div', class_='content').find('span').get_text()  # joke body
        stats = item.find('div', class_='stats')
        vote = stats.find('span', class_='stats-vote').find('i', class_='number').string
        comment = stats.find('span', class_='stats-comments').find('i', class_='number').string
        badge = item.find('div', class_='articleGender')  # age/gender badge
        if badge is None:
            # Anonymous poster: no badge at all.
            gender = ''
            age = ''
        else:
            classes = badge['class']
            if "womenIcon" in classes:
                gender = '女'
            elif "manIcon" in classes:
                gender = '男'
            else:
                gender = ''
            age = badge.string  # age text
        save_txt(output.format(page, author, gender, age, vote, comment, content))


def save_txt(*args):
    """Append each given string to qiubai.txt (UTF-8)."""
    for piece in args:
        with open('qiubai.txt', 'a', encoding='utf-8') as fh:
            fh.write(piece)


def main():
    # The listing has 13 pages; ideally the count would be scraped from the
    # page footer instead of hard-coded here.
    for page in range(1, 14):
        url = 'https://qiushibaike.com/text/page/%7B%7D'.format(page)
        get_content(download_page(url), page)
class Connect(object):
    """Falcon resource backing the WeChat server callback URL (/connect)."""

    def on_get(self, req, resp):
        """Endpoint verification: echo `echostr` back when the signature is valid."""
        query = {}
        for pair in req.query_string.split('&'):
            pieces = pair.split('=')
            query[pieces[0]] = pieces[1]

        try:
            check_signature(token='lengxiao', signature=query['signature'], timestamp=query['timestamp'], nonce=query['nonce'])
            resp.body = (query['echostr'])
        except InvalidSignatureException:
            # Signature mismatch: answer 200 with an empty body.
            pass
        resp.status = falcon.HTTP_200

    def on_post(self, req, resp):
        """Handle incoming messages: echo text; run face detection on images."""
        msg = parse_message(req.stream.read())
        if msg.type == 'text':
            # Plain text: echo the same content straight back.
            reply = TextReply(content=msg.content, message=msg)
            resp.body = (reply.render())
            resp.status = falcon.HTTP_200
        elif msg.type == 'image':
            name = img_download(msg.image, msg.source)
            print(name)
            outcome = access_api('images/' + name)
            if outcome == 'success':
                media_id = img_upload('image', 'faces/' + name)
                reply = ImageReply(media_id=media_id, message=msg)
            else:
                reply = TextReply(content='人脸检测失败,请上传1M以下人脸清晰的照片', message=msg)
            resp.body = (reply.render())
            resp.status = falcon.HTTP_200
# 1. Compute the API authentication and build the request parameters.

def random_str():
    '''Return a 15-character random lowercase string for the nonce_str field.'''
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    return ''.join(alphabet[random.randint(0, 25)] for _ in range(15))


def image(name):
    '''Read the file at *name* and return its contents base64-encoded (bytes).'''
    with open(name, 'rb') as fh:
        return base64.b64encode(fh.read())


def get_params(img):
    '''Build the request dict for the face API, including the `sign` field.

    The sign is the uppercased MD5 of the URL-encoded, key-sorted parameter
    list with app_key appended — Tencent AI open-platform authentication.
    '''
    params = {
        'app_id': '1106860829',
        'time_stamp': str(int(time.time())),
        'nonce_str': random_str(),
        'image': img,
        'mode': '0'

    }

    ordered = sorted(params.items(), key=lambda item: item[0], reverse=False)  # key-sort
    ordered.append(('app_key', 'P8Gt8nxi6k8vLKbS'))  # append app_key
    digest = hashlib.md5()
    digest.update(urlencode(ordered).encode())  # URL-encode, then hash
    params['sign'] = digest.hexdigest().upper()  # the auth signature
    return params
# 2. Call the API endpoint.


def access_api(img):
    '''Detect faces in the image file *img*, annotate them, and save a copy.

    POSTs the base64-encoded image to Tencent's face_detectface endpoint,
    draws a box plus gender/age/expression/beauty/glasses captions for each
    detected face, and writes the annotated image to faces/<basename>.
    Returns 'success' on API ret code 0, otherwise 'fail'.
    '''
    frame = cv2.imread(img)
    nparry_encode = cv2.imencode('.jpg', frame)[1]
    data_encode = np.array(nparry_encode)
    img_encode = base64.b64encode(data_encode)  # image as base64 for the API
    url = 'https://api.ai.qq.com/fcgi-bin/face/face_detectface'
    res = requests.post(url, get_params(img_encode)).json()  # call the API, parse JSON
    # Render the returned face data onto the image.
    if res['ret'] == 0:  # ret 0 means the request succeeded
        # Convert OpenCV BGR to a PIL image so CJK text can be drawn.
        pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        draw = ImageDraw.Draw(pil_img)
        for obj in res['data']['face_list']:
            img_width = res['data']['image_width']  # image width
            img_height = res['data']['image_height']  # image height
            # print(obj)
            x = obj['x']  # face box top-left x
            y = obj['y']  # face box top-left y
            w = obj['width']  # face box width
            h = obj['height']  # face box height
            # Map the numeric API fields to caption strings.
            if obj['glass'] == 1:  # glasses flag
                glass = '有'
            else:
                glass = '无'
            if obj['gender'] >= 70:  # gender score runs 0 (female) to 100 (male)
                gender = '男'
            elif 50 <= obj['gender'] < 70:
                gender = "娘"
            elif obj['gender'] < 30:
                gender = '女'
            else:
                gender = '女汉子'
            if 90 < obj['expression'] <= 100:  # expression score 0-100 = degree of smiling
                expression = '一笑倾城'
            elif 80 < obj['expression'] <= 90:
                expression = '心花怒放'
            elif 70 < obj['expression'] <= 80:
                expression = '兴高采烈'
            elif 60 < obj['expression'] <= 70:
                expression = '眉开眼笑'
            elif 50 < obj['expression'] <= 60:
                expression = '喜上眉梢'
            elif 40 < obj['expression'] <= 50:
                expression = '喜气洋洋'
            elif 30 < obj['expression'] <= 40:
                expression = '笑逐颜开'
            elif 20 < obj['expression'] <= 30:
                expression = '似笑非笑'
            elif 10 < obj['expression'] <= 20:
                expression = '半嗔半喜'
            elif 0 <= obj['expression'] <= 10:
                expression = '黯然伤神'
            delt = h // 5  # vertical spacing between caption lines
            # Caption placement: inside the box when several faces were found,
            # beside the box otherwise — unless the right margin is too narrow.
            if len(res['data']['face_list']) > 1:  # multiple faces: captions go inside the box
                font = ImageFont.truetype('yahei.ttf', w // 8, encoding='utf-8')  # font file must be present locally
                draw.text((x + 10, y + 10), '性别 :' + gender, (76, 176, 80), font=font)
                draw.text((x + 10, y + 10 + delt * 1), '年龄 :' + str(obj['age']), (76, 176, 80), font=font)
                draw.text((x + 10, y + 10 + delt * 2), '表情 :' + expression, (76, 176, 80), font=font)
                draw.text((x + 10, y + 10 + delt * 3), '魅力 :' + str(obj['beauty']), (76, 176, 80), font=font)
                draw.text((x + 10, y + 10 + delt * 4), '眼镜 :' + glass, (76, 176, 80), font=font)
            elif img_width - x - w < 170:  # too little room to the right of the box
                font = ImageFont.truetype('yahei.ttf', w // 8, encoding='utf-8')
                draw.text((x + 10, y + 10), '性别 :' + gender, (76, 176, 80), font=font)
                draw.text((x + 10, y + 10 + delt * 1), '年龄 :' + str(obj['age']), (76, 176, 80), font=font)
                draw.text((x + 10, y + 10 + delt * 2), '表情 :' + expression, (76, 176, 80), font=font)
                draw.text((x + 10, y + 10 + delt * 3), '魅力 :' + str(obj['beauty']), (76, 176, 80), font=font)
                draw.text((x + 10, y + 10 + delt * 4), '眼镜 :' + glass, (76, 176, 80), font=font)
            else:
                font = ImageFont.truetype('yahei.ttf', 20, encoding='utf-8')
                draw.text((x + w + 10, y + 10), '性别 :' + gender, (76, 176, 80), font=font)
                draw.text((x + w + 10, y + 10 + delt * 1), '年龄 :' + str(obj['age']), (76, 176, 80), font=font)
                draw.text((x + w + 10, y + 10 + delt * 2), '表情 :' + expression, (76, 176, 80), font=font)
                draw.text((x + w + 10, y + 10 + delt * 3), '魅力 :' + str(obj['beauty']), (76, 176, 80), font=font)
                draw.text((x + w + 10, y + 10 + delt * 4), '眼镜 :' + glass, (76, 176, 80), font=font)

            draw.rectangle((x, y, x + w, y + h), outline="#4CB050")  # face box
        cv2img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)  # PIL back to OpenCV
        cv2.imwrite('faces/{}'.format(os.path.basename(img)), cv2img)  # save under faces/
        return 'success'
    else:
        return 'fail'
token = ''  # module-level WeChat access token, refreshed by get_access_token
app_id = '开发者ID(AppID)'
secret = '开发者密码(AppSecret)'


def img_download(url, name):
    """Download the image at *url* into images/ and return its basename.

    Returns the string 'large' instead when the file is 1 MiB or bigger
    (the WeChat media upload limit).
    """
    r = requests.get(url)
    path = 'images/{}-{}.jpg'.format(name, time.strftime("%Y_%m_%d%H_%M_%S", time.localtime()))
    with open(path, 'wb') as fd:
        fd.write(r.content)
    if os.path.getsize(path) >= 1048576:
        return 'large'
    return os.path.basename(path)


def get_access_token(appid, secret):
    """Fetch a WeChat access_token and re-arm a refresh every 100 minutes.

    Stores the token in the module-level `token` global and keeps the timer
    in the `timer` global.
    """
    url = 'https://api.weixin.qq.com/cgi-bin/token?grant_type=client_credential&appid={}&secret={}'.format(appid, secret)
    r = requests.get(url)
    parse_json = json.loads(r.text)
    global token
    token = parse_json['access_token']
    global timer
    # BUG FIX: the original re-armed the timer without arguments, so the
    # refresh call 100 minutes later crashed with a TypeError.
    timer = threading.Timer(6000, get_access_token, args=(appid, secret))
    timer.start()


def img_upload(mediaType, name):
    """Upload the local media file *name* to WeChat and return its media_id."""
    global token
    url = "https://api.weixin.qq.com/cgi-bin/media/upload?access_token=%s&type=%s" % (token, mediaType)
    # Open via a context manager so the file handle is not leaked
    # (the original passed an un-closed open() into `files`).
    with open(name, 'rb') as media:
        r = requests.post(url, files={'media': media})
    parse_json = json.loads(r.text)
    return parse_json['media_id']

get_access_token(app_id, secret)
def get_conn():
    '''Open a pymysql connection to the local `python` database (DictCursor).'''
    conn = pymysql.connect(host='localhost',
                           user='root',
                           password='root',
                           db='python',
                           charset='utf8mb4',
                           cursorclass=pymysql.cursors.DictCursor)
    return conn


def insert(conn, info):
    '''Insert one 7-field job record into the `python` table and commit.'''
    with conn.cursor() as cursor:
        sql = "INSERT INTO `python` (`shortname`, `fullname`, `industryfield`, `companySize`, `salary`, `city`, `education`) VALUES (%s, %s, %s, %s, %s, %s, %s)"
        cursor.execute(sql, info)
        conn.commit()


def get_json(url, page, lang_name):
    '''Return the list of job records for one result page of the Lagou API.

    Each record is a 7-element list: short name, full name, industry,
    company size, salary, city, education (missing fields become '无').
    '''
    # Browser-like headers: Lagou rejects requests without them.
    headers = {
        'Host': 'www.lagou.com',
        'Connection': 'keep-alive',
        'Content-Length': '23',
        'Origin': 'https://www.lagou.com/',
        'X-Anit-Forge-Code': '0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'X-Anit-Forge-Token': 'None',
        'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
    }
    data = {'first': 'false', 'pn': page, 'kd': lang_name}
    # NOTE(review): this local `json` shadows any json-module import in scope.
    json = requests.post(url, data, headers=headers).json()
    list_con = json['content']['positionResult']['result']
    info_list = []
    for i in list_con:
        info = []
        info.append(i.get('companyShortName', '无'))
        info.append(i.get('companyFullName', '无'))
        info.append(i.get('industryField', '无'))
        info.append(i.get('companySize', '无'))
        info.append(i.get('salary', '无'))
        info.append(i.get('city', '无'))
        info.append(i.get('education', '无'))
        info_list.append(info)
    return info_list


def main():
    '''Crawl 30 pages of Python jobs for five cities into Excel and MySQL.'''
    lang_name = 'python'
    wb = Workbook()  # open an Excel workbook
    conn = get_conn()  # database connection — comment out to skip the DB
    for i in ['北京', '上海', '广州', '深圳', '杭州']:  # five cities
        page = 1
        ws1 = wb.active
        ws1.title = lang_name
        url = 'https://www.lagou.com/jobs/positionAjax.json?city=%7B%7D&needAddtionalResult=false'.format(i)
        while page < 31:  # 30 pages per city
            info = get_json(url, page, lang_name)
            page += 1
            # NOTE(review): page was already incremented above, so this logs
            # the number of the NEXT page, not the one just fetched.
            print(i, 'page', page)
            time.sleep(random.randint(10, 20))  # throttle to avoid a ban
            for row in info:
                insert(conn, tuple(row))  # DB insert — comment out to skip the DB
                ws1.append(row)
    conn.close()  # close the DB connection — comment out to skip the DB
    wb.save('{}职位信息.xlsx'.format(lang_name))

if __name__ == '__main__':
    main()
def download_page(url):
    '''
    Fetch *url* pretending to be desktop Firefox; return gb2312-decoded text.
    '''
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    resp = requests.get(url, headers=headers)
    resp.encoding = 'gb2312'
    return resp.text


def get_pic_list(html):
    '''
    Walk one listing page and fetch every photo set linked from it.
    '''
    soup = BeautifulSoup(html, 'html.parser')
    for entry in soup.find_all('li', class_='wp-item'):
        anchor = entry.find('h3', class_='tit').find('a')
        get_pic(anchor.get('href'), anchor.get_text())


def get_pic(link, text):
    '''
    Download every image of one photo set into pic/<set name>/.
    '''
    soup = BeautifulSoup(download_page(link), 'html.parser')
    images = soup.find('div', id="picture").find_all('img')
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    create_dir('pic/{}'.format(text))
    for tag in images:
        pic_link = tag.get('src')  # concrete image url
        resp = requests.get(pic_link, headers=headers)
        with open('pic/{}/{}'.format(text, pic_link.split('/')[-1]), 'wb') as fh:
            fh.write(resp.content)
        time.sleep(1)  # be gentle with the site to avoid getting banned


def create_dir(name):
    '''Create directory *name* (and parents) unless it already exists.'''
    if not os.path.exists(name):
        os.makedirs(name)


def execute(url):
    '''Download one listing page and process all photo sets on it.'''
    get_pic_list(download_page(url))


def main():
    create_dir('pic')
    queue = [i for i in range(1, 72)]  # listing page numbers
    threads = []
    while len(queue) > 0:
        # Drop finished workers from the pool.
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        # Top the pool back up to at most five workers.
        while len(threads) < 5 and len(queue) > 0:
            cur_page = queue.pop(0)
            url = 'http://meizitu.com/a/more_%7B%7D.html'.format(cur_page)
            thread = threading.Thread(target=execute, args=(url,))
            thread.setDaemon(True)
            thread.start()
            print('{}正在下载{}页'.format(threading.current_thread().name, cur_page))
            threads.append(thread)


if __name__ == '__main__':
    main()
def download_page(url):
    """Return the HTML text of *url*, fetched with a desktop Firefox UA."""
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    return requests.get(url, headers=headers).text


def get_content(html, page):
    """Parse one listing page and append every joke found to qiubai.txt."""
    output = """第{}页 作者:{} 性别:{} 年龄:{} 点赞:{} 评论:{}\n{}\n------------\n"""
    soup = BeautifulSoup(html, 'html.parser')
    listing = soup.find(id='content-left')
    for article in listing.find_all('div', class_="article"):
        author = article.find('h2').string  # author name
        content = article.find('div', class_='content').find('span').get_text()  # joke body
        stats = article.find('div', class_='stats')
        vote = stats.find('span', class_='stats-vote').find('i', class_='number').string
        comment = stats.find('span', class_='stats-comments').find('i', class_='number').string
        badge = article.find('div', class_='articleGender')  # age/gender badge
        if badge is not None:  # named poster
            classes = badge['class']
            if "womenIcon" in classes:
                gender = '女'
            elif "manIcon" in classes:
                gender = '男'
            else:
                gender = ''
            age = badge.string
        else:  # anonymous poster
            gender = ''
            age = ''
        save_txt(output.format(page, author, gender, age, vote, comment, content))


def save_txt(*args):
    """Append each argument string to qiubai.txt (UTF-8)."""
    for chunk in args:
        with open('qiubai.txt', 'a', encoding='utf-8') as fh:
            fh.write(chunk)


def main():
    # 13 listing pages; a nicer version would read the count from the page
    # footer with Beautiful Soup instead of hard-coding it.
    for page_no in range(1, 14):
        url = 'https://qiushibaike.com/text/page/%7B%7D'.format(page_no)
        get_content(download_page(url), page_no)

if __name__ == '__main__':
    main()