diff --git a/2020/README.md b/2020/README.md new file mode 100644 index 00000000..d2185705 --- /dev/null +++ b/2020/README.md @@ -0,0 +1,38 @@ +# Python Spider 2020 + +由于这个项目时间太长了,陆陆续续,很多实战示例也早已失效。 + +网络爬虫,是一门比较通用的基础技术,各个领域都会有所涉及,比如我做视觉算法的,也需要用到网络爬虫,例如调用 API 接口清洗数据等,这本质也都是一个小的爬虫程序。 + +为了提供各位更好的学习示例,我决定重写这一系列教程,对一些失效的示例,重新找例子,并查缺补漏,完善这一系列教程。 + +2020年,最新版的 Python3 网络爬虫实战系列教程。 + +原创文章每周最少两篇,**后续最新文章**会在[【公众号】](https://cuijiahua.com/wp-content/uploads/2020/05/gzh-w.jpg)首发,视频[【B站】](https://space.bilibili.com/331507846)首发,大家可以加我[【微信】](https://cuijiahua.com/wp-content/uploads/2020/05/gzh-w.jpg)进**交流群**,技术交流或提意见都可以,欢迎**Star**! + +
+ +## Python3 网络爬虫教程 2020 +| 文章 | 公众号 | 代码 | +| :------ | :--------: | :--------: | +| Python3 网络爬虫(一):初识网络爬虫之夜探老王家 | [公众号](https://mp.weixin.qq.com/s/1rcq9RQYuAuHFg1w1j8HXg "Python3 网络爬虫(一)") | no | +| Python3 网络爬虫(二):下载小说的正确姿势 | [公众号](https://mp.weixin.qq.com/s/5e2_r0QXUISVp9GdDsqbzg "Python3 网络爬虫(二)") | [Code](https://github.com/Jack-Cherish/python-spider/tree/master/2020/xbqg "Python3 网络爬虫(二)") | +| Python3 网络爬虫(三):漫画下载,动态加载、反爬虫这都不叫事!| [公众号](https://mp.weixin.qq.com/s/wyS-OP04K3Vs9arSelRlyA "Python3网络爬虫(三)") | [Code](https://github.com/Jack-Cherish/python-spider/tree/master/2020/dmzj "Python3 网络爬虫(三)") | +| Python3 网络爬虫(四):视频下载,那些事儿!| [公众号](https://mp.weixin.qq.com/s/_geNA6Dwo4kx25X7trJzlg "Python3 网络爬虫(四)") | [Code](https://github.com/Jack-Cherish/python-spider/tree/master/2020/zycjw "Python3 网络爬虫(四)") | +| Python3 网络爬虫(五):老板,需要特殊服务吗?| [公众号](https://mp.weixin.qq.com/s/PPTSnIHV71b-wB3oRiYnIA "Python3 网络爬虫(五)") | [Code](https://github.com/Jack-Cherish/python-spider/tree/master/2020/api "Python3 网络爬虫(五)") | +| Python3 网络爬虫(六):618,爱他/她,就清空他/她的购物车!| [公众号](https://mp.weixin.qq.com/s/lXXDfzyLVrf3f-aqJN1C3A "Python3 网络爬虫(六)") | [Code](https://github.com/Jack-Cherish/python-spider/tree/master/2020/taobao "Python3 网络爬虫(六)") | +| 宝藏B站UP主,视频弹幕尽收囊中!| [公众号](https://mp.weixin.qq.com/s/aWratg1j9RBAjIghoY66yQ "宝藏B站UP主,视频弹幕尽收囊中!") | [Code](https://github.com/Jack-Cherish/python-spider/tree/master/2020/bilibili "宝藏B站UP主,视频弹幕尽收囊中!") | + +更多精彩,敬请期待! + + + +
diff --git a/2020/api/api.py b/2020/api/api.py
new file mode 100644
index 00000000..4ed08497
--- /dev/null
+++ b/2020/api/api.py
@@ -0,0 +1,69 @@
+import requests
+import base64
+import json
+import cv2
+import numpy as np
+import matplotlib.pyplot as plt
+# NOTE: `%matplotlib inline` is IPython-only syntax and a SyntaxError in a .py
+# file; run that magic yourself when pasting this script into a notebook.
+
+
+beautify_url = "https://api-cn.faceplusplus.com/facepp/v2/beautify"
+# API Key and API Secret (also called Secret Key) of the application you created
+AK = ''
+SK = ''
+
+# Optional parameters; each one defaults to 50 when it is not sent
+# Whitening level, 0 - 100
+whitening = 80
+# Skin-smoothing level, 0 - 100
+smoothing = 80
+# Face-thinning level, 0 - 100
+thinface = 20
+# Face-shrinking level, 0 - 100
+shrink_face = 50
+# Eye-enlarging level, 0 - 100
+enlarge_eye = 50
+# Eyebrow-removal level, 0 - 100
+remove_eyebrow = 50
+# Filter name; leave empty for no filter (the default)
+filter_type = ''
+
+# Open the image in binary mode and encode it as base64
+img_name = 'test_1.png'
+with open(img_name, 'rb') as f:
+    img_base64 = base64.b64encode(f.read())
+
+# Only whitening, smoothing and thinface are sent; the others keep their defaults
+data = {
+    'api_key': AK,
+    'api_secret': SK,
+    'image_base64': img_base64,
+    'whitening': whitening,
+    'smoothing': smoothing,
+    'thinface': thinface,
+}
+
+r = requests.post(url=beautify_url, data=data, timeout=30)
+html = json.loads(r.text)
+if 'result' not in html:
+    # Surface the service's own error text instead of a bare KeyError below
+    raise RuntimeError('Beautify request failed: %s' % html.get('error_message', r.text))
+
+# Decode the base64 image returned by the API
+base64_data = html['result']
+imgData = base64.b64decode(base64_data)
+nparr = np.frombuffer(imgData, np.uint8)
+img_res = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+# OpenCV decodes to BGR channel order while matplotlib displays RGB
+img_res_RGB = cv2.cvtColor(img_res, cv2.COLOR_BGR2RGB)
+
+# Original image
+img = cv2.imread(img_name)
+img_RGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+# Show the original and the beautified image side by side
+fig, axs = plt.subplots(nrows=1, ncols=2, sharex=False, sharey=False, figsize=(10,10))
+axs[0].imshow(img_RGB)
+axs[1].imshow(img_res_RGB)
+plt.show()
diff --git a/2020/api/test_1.png b/2020/api/test_1.png
new file mode 100644
index 00000000..38e8def3
Binary files /dev/null and b/2020/api/test_1.png differ
diff --git a/2020/bilibili/download.py b/2020/bilibili/download.py
new file mode 100644
index 00000000..b8aff376
--- /dev/null
+++ b/2020/bilibili/download.py
@@ -0,0 +1,120 @@
+# -*-coding:utf-8 -*-
+# Website: https://cuijiahua.com
+# Author: Jack Cui
+# Date: 2020.07.22
+import requests
+import json
+import re
+import json
+import math
+import xml2ass
+import time
+from contextlib import closing
+
+from bs4 import BeautifulSoup
+
+import os
+from win32com.client import Dispatch
+
+def addTasktoXunlei(down_url):
+    """Queue ``down_url`` as a download task in the Xunlei (Thunder) client.
+
+    The task is added and committed through the ``ThunderAgent.Agent64.1``
+    COM object obtained from ``Dispatch``.
+
+    Returns True when both COM calls succeed, False when either one raises.
+    Errors from ``Dispatch`` itself are not caught here.
+    """
+    flag = False
+    o = Dispatch('ThunderAgent.Agent64.1')
+    try:
+        o.AddTask(down_url, "", "", "", "", -1, 0, 5)
+        o.CommitTasks()
+        flag = True
+    except Exception as e:
+        # Print the caught instance: the Exception class has no ``message``
+        # attribute, so the old handler raised AttributeError itself.
+        print(e)
+        print(" AddTask is fail!")
+    return flag
+
+def get_download_url(arcurl):
+    """Resolve a direct download URL for the video page ``arcurl``.
+
+    Sends ``arcurl`` to the parsing service at ``jiexi_url``, follows the
+    iframe found in its response, and takes the first single-quoted http(s)
+    URL from the first ``<script>`` tag without a ``src`` attribute.
+
+    Returns that URL with all backslashes removed. Raises IndexError when
+    the script contains no matching URL.
+    """
+    # Search WeChat for the JackCui-AI official account and reply "B 站" to
+    # obtain the real parsing-service address; 'xxx' is only a placeholder.
+    jiexi_url = 'xxx'
+    payload = {'url': arcurl}
+    jiexi_req = requests.get(jiexi_url, params=payload)
+    jiexi_bf = BeautifulSoup(jiexi_req.text)
+    jiexi_dn_url = jiexi_bf.iframe.get('src')
+    dn_req = requests.get(jiexi_dn_url)
+    dn_bf = BeautifulSoup(dn_req.text)
+    video_script = dn_bf.find('script',src = None)
+    DPlayer = str(video_script.string)
+    # Raw string: the same pattern, without invalid "\(" / "\)" string escapes
+    download_url = re.findall(r"'(http[s]?:(?:[a-zA-Z]|[0-9]|[$-_@.&~+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'", DPlayer)[0]
+    download_url = download_url.replace('\\', '')
+    return download_url
+
+# Uploader space page; the numeric user id (mid) is its last path segment
+space_url = 'https://space.bilibili.com/280793434'
+search_url = 'https://api.bilibili.com/x/space/arc/search'
+mid = space_url.split('/')[-1]
+sess = requests.Session()
+search_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
+    'Accept-Language': 'zh-CN,zh;q=0.9',
+    'Accept-Encoding': 'gzip, deflate, br',
+    'Accept': 'application/json, text/plain, */*'}
+
+# Get the number of videos: request a single item just to read the total count
+ps = 1
+pn = 1
+search_params = {'mid': mid,
+    'ps': ps,
+    'tid': 0,
+    'pn': pn}
+# NOTE(review): verify=False disables TLS certificate verification; drop it
+# unless it is known to be required in the target environment.
+req = sess.get(url=search_url, headers=search_headers, params=search_params, verify=False)
+info = json.loads(req.text)
+video_count = info['data']['page']['count']
+
+# Walk every result page, 10 videos per page, collecting [title, page URL]
+ps = 10
+page = math.ceil(video_count/ps)
+videos_list = []
+for pn in range(1, page+1):
+    search_params = {'mid': mid,
+        'ps': ps,
+        'tid': 0,
+        'pn': pn}
+    req = sess.get(url=search_url, headers=search_headers, params=search_params, verify=False)
+    info = json.loads(req.text)
+    vlist = info['data']['list']['vlist']
+    for video in vlist:
+        title = video['title']
+        bvid = video['bvid']
+        vurl = 'https://www.bilibili.com/video/' + bvid
+        videos_list.append([title, vurl])
+print('共 %d 个视频' % len(videos_list))
+all_video = {}
+# 下载前 10 个视频
+for video in videos_list[:10]:
+ download_url = get_download_url(https://github.com/video[1])
+ print(video[0] + ':' + download_url)
+ # 记录视频名字
+ xunlei_video_name = download_url.split('?')[0].split('/')[-1]
+ filename = video[0]
+ for c in u'´☆\n<':
+ return 'Bilibili' # Komica, with the same file format as Bilibili
+ elif tmp == 'xml version="1.0" encoding="UTF-8"?>\n<':
+ return 'MioMio'
+ elif tmp == 'p':
+ return 'Niconico' # Himawari Douga, with the same file format as Niconico Douga
+
+
+#
+# ReadComments**** protocol
+#
+# Input:
+# f: Input file
+# fontsize: Default font size
+#
+# Output:
+# yield a tuple:
+# (timeline, timestamp, no, comment, pos, color, size, height, width)
+# timeline: The position when the comment is replayed
+# timestamp: The UNIX timestamp when the comment is submitted
+# no: A sequence of 1, 2, 3, ..., used for sorting
+# comment: The content of the comment
+# pos: 0 for regular moving comment,
+# 1 for top centered comment,
+# 2 for bottom centered comment,
+# 3 for reversed moving comment
+# color: Font color represented in 0xRRGGBB,
+# e.g. 0xffffff for white
+# size: Font size
+# height: The estimated height in pixels
+# i.e. (comment.count('\n')+1)*size
+# width: The estimated width in pixels
+# i.e. CalculateLength(comment)*size
+#
+# After implementing ReadComments****, make sure to update ProbeCommentFormat
+# and CommentFormatMap.
+#
+
+
+def ReadCommentsNiconico(f, fontsize):
+    """Yield comment tuples parsed from a Niconico XML comment file.
+
+    Every ``<chat>`` element becomes one tuple
+    ``(timeline, timestamp, no, comment, pos, color, size, height, width)``.
+    Comments whose text starts with '/' are skipped; comments that fail to
+    parse are logged as warnings and skipped.
+    """
+    NiconicoColorMap = {'red': 0xff0000, 'pink': 0xff8080, 'orange': 0xffcc00, 'yellow': 0xffff00, 'green': 0x00ff00, 'cyan': 0x00ffff, 'blue': 0x0000ff, 'purple': 0xc000ff, 'black': 0x000000, 'niconicowhite': 0xcccc99, 'white2': 0xcccc99, 'truered': 0xcc0033, 'red2': 0xcc0033, 'passionorange': 0xff6600, 'orange2': 0xff6600, 'madyellow': 0x999900, 'yellow2': 0x999900, 'elementalgreen': 0x00cc66, 'green2': 0x00cc66, 'marineblue': 0x33ffcc, 'blue2': 0x33ffcc, 'nobleviolet': 0x6633cc, 'purple2': 0x6633cc}
+    # Keywords of the 'mail' attribute that select a position or a size factor
+    position_by_keyword = {'ue': 1, 'shita': 2}
+    scale_by_keyword = {'big': 1.44, 'small': 0.64}
+    document = xml.dom.minidom.parse(f)
+    for chat in document.getElementsByTagName('chat'):
+        try:
+            c = str(chat.childNodes[0].wholeText)
+            if c.startswith('/'):
+                continue  # ignore advanced comments
+            pos, color, size = 0, 0xffffff, fontsize
+            for keyword in str(chat.getAttribute('mail')).split():
+                if keyword in position_by_keyword:
+                    pos = position_by_keyword[keyword]
+                elif keyword in scale_by_keyword:
+                    size = fontsize*scale_by_keyword[keyword]
+                elif keyword in NiconicoColorMap:
+                    color = NiconicoColorMap[keyword]
+            # 'vpos' is multiplied by 0.01 to obtain the timeline value
+            timeline = max(int(chat.getAttribute('vpos')), 0)*0.01
+            timestamp = int(chat.getAttribute('date'))
+            number = int(chat.getAttribute('no'))
+            yield (timeline, timestamp, number, c, pos, color, size, (c.count('\n')+1)*size, CalculateLength(c)*size)
+        except (AssertionError, AttributeError, IndexError, TypeError, ValueError):
+            logging.warning(_('Invalid comment: %s') % chat.toxml())
+            continue
+
+
+def ReadCommentsAcfun(f, fontsize):
+    """Yield comment tuples parsed from an AcFun JSON comment file.
+
+    ``f`` must contain a JSON list of objects with a comma-separated ``'c'``
+    field (at least six values) and a message field ``'m'``. Type '7'
+    entries are yielded with the marker 'acfunpos' and the decoded ``'m'``
+    dict as the comment; all other types yield a regular text comment.
+    Entries that fail to parse, including ones missing a key, are logged
+    as warnings and skipped.
+    """
+    comment_element = json.load(f)
+    for i, comment in enumerate(comment_element):
+        try:
+            p = str(comment['c']).split(',')
+            assert len(p) >= 6
+            assert p[2] in ('1', '2', '4', '5', '7')
+            size = int(p[3])*fontsize/25.0
+            if p[2] != '7':
+                c = str(comment['m']).replace('\\r', '\n').replace('\r', '\n')
+                yield (float(p[0]), int(p[5]), i, c, {'1': 0, '2': 0, '4': 2, '5': 1}[p[2]], int(p[1]), size, (c.count('\n')+1)*size, CalculateLength(c)*size)
+            else:
+                c = dict(json.loads(comment['m']))
+                yield (float(p[0]), int(p[5]), i, c, 'acfunpos', int(p[1]), size, 0, 0)
+        # KeyError is not a subclass of IndexError: without it a comment
+        # lacking 'c' or 'm' would abort the whole conversion.
+        except (AssertionError, AttributeError, IndexError, KeyError, TypeError, ValueError):
+            logging.warning(_('Invalid comment: %r') % comment)
+            continue
+
+
+def ReadCommentsBilibili(f, fontsize):
+    """Yield comment tuples parsed from a Bilibili XML comment file.
+
+    Every ``<d>`` element carries a comma-separated ``p`` attribute with at
+    least five values. Mode '7' entries are yielded with the marker
+    'bilipos' and their raw text; the other accepted modes yield a regular
+    text comment. Entries that fail to parse are logged and skipped.
+    """
+    document = xml.dom.minidom.parse(f)
+    for i, comment in enumerate(document.getElementsByTagName('d')):
+        try:
+            p = str(comment.getAttribute('p')).split(',')
+            assert len(p) >= 5
+            assert p[1] in ('1', '4', '5', '6', '7')
+            if p[1] == '7':  # positioned comment
+                c = str(comment.childNodes[0].wholeText)
+                yield (float(p[0]), int(p[4]), i, c, 'bilipos', int(p[3]), int(p[2]), 0, 0)
+            else:
+                # the two-character sequence "/n" in the text stands for a line break
+                c = str(comment.childNodes[0].wholeText).replace('/n', '\n')
+                size = int(p[2])*fontsize/25.0
+                yield (float(p[0]), int(p[4]), i, c, {'1': 0, '4': 2, '5': 1, '6': 3}[p[1]], int(p[3]), size, (c.count('\n')+1)*size, CalculateLength(c)*size)
+        except (AssertionError, AttributeError, IndexError, TypeError, ValueError):
+            logging.warning(_('Invalid comment: %s') % comment.toxml())
+            continue
+
+
+def ReadCommentsTudou(f, fontsize):
+    """Yield comment tuples parsed from a Tudou JSON comment file.
+
+    ``f`` must contain a JSON object whose ``'comment_list'`` holds entries
+    with ``pos`` (3, 4 or 6), ``size`` (0, 1 or 2), ``data``,
+    ``replay_time``, ``commit_time`` and ``color``. Entries that fail to
+    parse, including ones missing a key, are logged and skipped.
+    """
+    comment_element = json.load(f)
+    for i, comment in enumerate(comment_element['comment_list']):
+        try:
+            assert comment['pos'] in (3, 4, 6)
+            c = str(comment['data'])
+            assert comment['size'] in (0, 1, 2)
+            size = {0: 0.64, 1: 1, 2: 1.44}[comment['size']]*fontsize
+            yield (int(comment['replay_time']*0.001), int(comment['commit_time']), i, c, {3: 0, 4: 2, 6: 1}[comment['pos']], int(comment['color']), size, (c.count('\n')+1)*size, CalculateLength(c)*size)
+        # KeyError is not a subclass of IndexError: without it one entry with
+        # a missing field would abort the whole conversion.
+        except (AssertionError, AttributeError, IndexError, KeyError, TypeError, ValueError):
+            logging.warning(_('Invalid comment: %r') % comment)
+            continue
+
+
+def ReadCommentsMioMio(f, fontsize):
+    """Yield comment tuples parsed from a MioMio XML comment file.
+
+    Every ``<data>`` element must contain ``<playTime>``, ``<times>``
+    (formatted '%Y-%m-%d %H:%M:%S') and a ``<message>`` with ``fontsize``,
+    ``mode`` ('1', '4' or '5') and ``color`` attributes. Entries that fail
+    to parse, including ones with an unknown mode, are logged and skipped.
+    """
+    dom = xml.dom.minidom.parse(f)
+    comment_element = dom.getElementsByTagName('data')
+    for i, comment in enumerate(comment_element):
+        try:
+            message = comment.getElementsByTagName('message')[0]
+            c = str(message.childNodes[0].wholeText)
+            size = int(message.getAttribute('fontsize'))*fontsize/25.0
+            # 28800 s (8 h) is subtracted from the parsed time; presumably the
+            # source times are UTC+8 -- TODO confirm
+            yield (float(comment.getElementsByTagName('playTime')[0].childNodes[0].wholeText), int(calendar.timegm(time.strptime(comment.getElementsByTagName('times')[0].childNodes[0].wholeText, '%Y-%m-%d %H:%M:%S')))-28800, i, c, {'1': 0, '4': 2, '5': 1}[message.getAttribute('mode')], int(message.getAttribute('color')), size, (c.count('\n')+1)*size, CalculateLength(c)*size)
+        # KeyError covers a mode outside '1'/'4'/'5'; it is not a subclass of
+        # IndexError and used to abort the whole conversion.
+        except (AssertionError, AttributeError, IndexError, KeyError, TypeError, ValueError):
+            logging.warning(_('Invalid comment: %s') % comment.toxml())
+            continue
+
+
+def ReadCommentsSH5V(f, fontsize):
+    """Yield comment tuples parsed from an sH5V JSON comment file.
+
+    Comments are read from ``root.bgs``. Type '7' entries are yielded with
+    the marker 'sH5Vpos' followed by seven extra fields
+    ``(x, y, dur, data1, data2, data3, data4)``; all other types yield a
+    regular text comment. Entries that fail to parse, including ones
+    missing a key, are logged and skipped.
+    """
+    comment_element = json.load(f)
+    for i, comment in enumerate(comment_element["root"]["bgs"]):
+        try:
+            c_at = str(comment['at'])
+            c_type = str(comment['type'])
+            c_date = str(comment['timestamp'])
+            c_color = str(comment['color'])
+            c = str(comment['text'])
+            size = fontsize
+            if c_type != '7':
+                # the color's first character is dropped and the rest parsed as hex
+                yield (float(c_at), int(c_date), i, c, {'0': 0, '1': 0, '4': 2, '5': 1}[c_type], int(c_color[1:], 16), size, (c.count('\n')+1)*size, CalculateLength(c)*size)
+            else:
+                c_x = float(comment['x'])
+                c_y = float(comment['y'])
+                size = int(comment['size'])
+                dur = int(comment['dur'])
+                data1 = float(comment['data1'])
+                data2 = float(comment['data2'])
+                data3 = int(comment['data3'])
+                data4 = int(comment['data4'])
+                yield (float(c_at), int(c_date), i, c, 'sH5Vpos', int(c_color[1:], 16), size, 0, 0, c_x, c_y, dur, data1, data2, data3, data4)
+        # KeyError is not a subclass of IndexError: without it a missing field
+        # or an unknown type would abort the whole conversion.
+        except (AssertionError, AttributeError, IndexError, KeyError, TypeError, ValueError):
+            logging.warning(_('Invalid comment: %r') % comment)
+            continue
+
+
+# Maps a detected format name to the generator that parses it; None maps to
+# None.
+CommentFormatMap = {
+    None: None,
+    'Niconico': ReadCommentsNiconico,
+    'Acfun': ReadCommentsAcfun,
+    'Bilibili': ReadCommentsBilibili,
+    'Tudou': ReadCommentsTudou,
+    'MioMio': ReadCommentsMioMio,
+    'sH5V': ReadCommentsSH5V,
+}
+
+
+def WriteCommentBilibiliPositioned(f, c, width, height, styleid):
+ # Write one positioned Bilibili comment to f as a single ASS "Dialogue" line.
+ # f: object with a write() method receiving the line
+ # c: comment tuple; c[0] is the start time, c[3] a JSON array string with the
+ # positioning arguments, c[5] the color (0xRRGGBB), c[6] the font size
+ # width, height: size of the target ASS stage
+ # styleid: name of the ASS style to reference
+ # Comments raising IndexError or ValueError are logged and skipped.
+ #BiliPlayerSize = (512, 384) # Bilibili player version 2010
+ #BiliPlayerSize = (540, 384) # Bilibili player version 2012
+ BiliPlayerSize = (672, 438) # Bilibili player version 2014
+ ZoomFactor = GetZoomFactor(BiliPlayerSize, (width, height))
+
+ # Convert one input coordinate to stage pixels. Integers and floats above 1
+ # are scaled as player pixels; floats up to 1 are treated as a fraction of
+ # the player size; anything else is parsed as int, then float, and retried.
+ def GetPosition(InputPos, isHeight):
+ isHeight = int(isHeight) # True -> 1
+ if isinstance(InputPos, int):
+ return ZoomFactor[0]*InputPos+ZoomFactor[isHeight+1]
+ elif isinstance(InputPos, float):
+ if InputPos > 1:
+ return ZoomFactor[0]*InputPos+ZoomFactor[isHeight+1]
+ else:
+ return BiliPlayerSize[isHeight]*ZoomFactor[0]*InputPos+ZoomFactor[isHeight+1]
+ else:
+ try:
+ InputPos = int(InputPos)
+ except ValueError:
+ InputPos = float(InputPos)
+ return GetPosition(InputPos, isHeight)
+
+ try:
+ # Argument indices read below: 0/1 start x/y, 2 alpha as "from-to",
+ # 3 lifetime, 4 text, 5/6 z/y rotation, 7/8 end x/y, 9 move duration,
+ # 10 move delay, 11 border flag, 12 font face.
+ # NOTE(review): safe_list.get presumably returns the default for a
+ # missing index -- its definition is not fully visible here.
+ comment_args = safe_list(json.loads(c[3]))
+ text = ASSEscape(str(comment_args[4]).replace('/n', '\n'))
+ from_x = comment_args.get(0, 0)
+ from_y = comment_args.get(1, 0)
+ to_x = comment_args.get(7, from_x)
+ to_y = comment_args.get(8, from_y)
+ from_x = round(GetPosition(from_x, False))
+ from_y = round(GetPosition(from_y, True))
+ to_x = round(GetPosition(to_x, False))
+ to_y = round(GetPosition(to_y, True))
+ alpha = safe_list(str(comment_args.get(2, '1')).split('-'))
+ from_alpha = float(alpha.get(0, 1))
+ to_alpha = float(alpha.get(1, from_alpha))
+ # Invert the 0..1 input alpha into the 0..255 value written to the ASS tag
+ from_alpha = 255-round(from_alpha*255)
+ to_alpha = 255-round(to_alpha*255)
+ rotate_z = int(comment_args.get(5, 0))
+ rotate_y = int(comment_args.get(6, 0))
+ lifetime = float(comment_args.get(3, 4500))
+ duration = int(comment_args.get(9, lifetime*1000))
+ delay = int(comment_args.get(10, 0))
+ fontface = comment_args.get(12)
+ isborder = comment_args.get(11, 'true')
+ styles = []
+ # A static comment uses \pos; a moving one uses \move plus a \t block
+ # that animates the rotation towards the end position.
+ if (from_x, from_y) == (to_x, to_y):
+ styles.append('\\pos(%s, %s)' % (from_x, from_y))
+ else:
+ styles.append('\\move(%s, %s, %s, %s, %s, %s)' % (from_x, from_y, to_x, to_y, delay, delay+duration))
+ styles.append('\\frx%s\\fry%s\\frz%s\\fax%s\\fay%s' % ConvertFlashRotation(rotate_y, rotate_z, (from_x-ZoomFactor[1])/(width-ZoomFactor[1]*2), (from_y-ZoomFactor[2])/(height-ZoomFactor[2]*2)))
+ if (from_x, from_y) != (to_x, to_y):
+ styles.append('\\t(%s, %s, ' % (delay, delay+duration))
+ styles.append('\\frx%s\\fry%s\\frz%s\\fax%s\\fay%s' % ConvertFlashRotation(rotate_y, rotate_z, (to_x-ZoomFactor[1])/(width-ZoomFactor[1]*2), (to_y-ZoomFactor[2])/(height-ZoomFactor[2]*2)))
+ styles.append(')')
+ if fontface:
+ styles.append('\\fn%s' % ASSEscape(fontface))
+ styles.append('\\fs%s' % round(c[6]*ZoomFactor[0]))
+ # The color tag is written in blue, green, red byte order
+ if c[5] != 0xffffff:
+ styles.append('\\c&H%02X%02X%02X&' % (c[5] & 0xff, (c[5] >> 8) & 0xff, (c[5] >> 16) & 0xff))
+ if c[5] == 0x000000:
+ styles.append('\\3c&HFFFFFF&')
+ if from_alpha == to_alpha:
+ styles.append('\\alpha&H%02X' % from_alpha)
+ elif (from_alpha, to_alpha) == (255, 0):
+ styles.append('\\fad(%s,0)' % (lifetime*1000))
+ elif (from_alpha, to_alpha) == (0, 255):
+ styles.append('\\fad(0, %s)' % (lifetime*1000))
+ else:
+ styles.append('\\fade(%(from_alpha)s, %(to_alpha)s, %(to_alpha)s, 0, %(end_time)s, %(end_time)s, %(end_time)s)' % {'from_alpha': from_alpha, 'to_alpha': to_alpha, 'end_time': lifetime*1000})
+ if isborder == 'false':
+ styles.append('\\bord0')
+ f.write('Dialogue: -1,%(start)s,%(end)s,%(styleid)s,,0,0,0,,{%(styles)s}%(text)s\n' % {'start': ConvertTimestamp(c[0]), 'end': ConvertTimestamp(c[0]+lifetime), 'styles': ''.join(styles), 'text': text, 'styleid': styleid})
+ except (IndexError, ValueError) as e:
+ # Log the raw argument string, falling back to the whole tuple when
+ # c[3] itself cannot be read
+ try:
+ logging.warning(_('Invalid comment: %r') % c[3])
+ except IndexError:
+ logging.warning(_('Invalid comment: %r') % c)
+
+
+def WriteCommentAcfunPositioned(f, c, width, height, styleid):
+    """Write one positioned AcFun comment to ``f`` as ASS "Dialogue" lines.
+
+    One line is written for the initial state and one for each follow-up
+    action listed under the ``'z'`` key; lines whose end time is not after
+    their start time are omitted.
+
+    f: object with a ``write()`` method receiving the lines.
+    c: comment tuple; ``c[0]`` is the start time, ``c[3]`` the decoded
+        argument dict, ``c[5]`` the color (0xRRGGBB), ``c[6]`` the font size.
+    width, height: size of the target ASS stage.
+    styleid: name of the ASS style to reference.
+
+    Comments raising IndexError, KeyError or ValueError are logged and
+    skipped.
+    """
+    AcfunPlayerSize = (560, 400)
+    ZoomFactor = GetZoomFactor(AcfunPlayerSize, (width, height))
+
+    def GetPosition(InputPos, isHeight):
+        # InputPos is scaled by 0.001 of the player width (or height)
+        isHeight = int(isHeight)  # True -> 1
+        return AcfunPlayerSize[isHeight]*ZoomFactor[0]*InputPos*0.001+ZoomFactor[isHeight+1]
+
+    def GetRotationStyle(rotate_y, rotate_z, x, y):
+        # The anchor is normalised inside the zoomed player area: x against
+        # the stage width and y against the stage height.
+        return '\\frx%s\\fry%s\\frz%s\\fax%s\\fay%s' % ConvertFlashRotation(rotate_y, rotate_z, (x-ZoomFactor[1])/(width-ZoomFactor[1]*2), (y-ZoomFactor[2])/(height-ZoomFactor[2]*2))
+
+    def GetTransformStyles(x=None, y=None, scale_x=None, scale_y=None, rotate_z=None, rotate_y=None, color=None, alpha=None):
+        styles = []
+        if x is not None and y is not None:
+            styles.append('\\pos(%s, %s)' % (x, y))
+        if scale_x is not None:
+            styles.append('\\fscx%s' % scale_x)
+        if scale_y is not None:
+            styles.append('\\fscy%s' % scale_y)
+        if rotate_z is not None and rotate_y is not None:
+            assert x is not None
+            assert y is not None
+            styles.append(GetRotationStyle(rotate_y, rotate_z, x, y))
+        if color is not None:
+            # The color tag is written in blue, green, red byte order
+            styles.append('\\c&H%02X%02X%02X&' % (color & 0xff, (color >> 8) & 0xff, (color >> 16) & 0xff))
+            if color == 0x000000:
+                styles.append('\\3c&HFFFFFF&')
+        if alpha is not None:
+            alpha = 255-round(alpha*255)
+            styles.append('\\alpha&H%02X' % alpha)
+        return styles
+
+    def FlushCommentLine(f, text, styles, start_time, end_time, styleid):
+        if end_time > start_time:
+            f.write('Dialogue: -1,%(start)s,%(end)s,%(styleid)s,,0,0,0,,{%(styles)s}%(text)s\n' % {'start': ConvertTimestamp(start_time), 'end': ConvertTimestamp(end_time), 'styles': ''.join(styles), 'text': text, 'styleid': styleid})
+
+    try:
+        comment_args = c[3]
+        # A single replace is enough; the second identical call was a no-op
+        text = ASSEscape(str(comment_args['n']).replace('\r', '\n'))
+        common_styles = []
+        anchor = {0: 7, 1: 8, 2: 9, 3: 4, 4: 5, 5: 6, 6: 1, 7: 2, 8: 3}.get(comment_args.get('c', 0), 7)
+        if anchor != 7:
+            common_styles.append('\\an%s' % anchor)
+        font = comment_args.get('w')
+        if font:
+            font = dict(font)
+            fontface = font.get('f')
+            if fontface:
+                common_styles.append('\\fn%s' % ASSEscape(str(fontface)))
+            fontbold = bool(font.get('b'))
+            if fontbold:
+                common_styles.append('\\b1')
+        common_styles.append('\\fs%s' % round(c[6]*ZoomFactor[0]))
+        isborder = bool(comment_args.get('b', True))
+        if not isborder:
+            common_styles.append('\\bord0')
+        to_pos = dict(comment_args.get('p', {'x': 0, 'y': 0}))
+        to_x = round(GetPosition(int(to_pos.get('x', 0)), False))
+        to_y = round(GetPosition(int(to_pos.get('y', 0)), True))
+        to_scale_x = round(float(comment_args.get('e', 1.0))*100)
+        to_scale_y = round(float(comment_args.get('f', 1.0))*100)
+        to_rotate_z = float(comment_args.get('r', 0.0))
+        to_rotate_y = float(comment_args.get('k', 0.0))
+        to_color = c[5]
+        to_alpha = float(comment_args.get('a', 1.0))
+        from_time = float(comment_args.get('t', 0.0))
+        action_time = float(comment_args.get('l', 3.0))
+        actions = list(comment_args.get('z', []))
+        transform_styles = GetTransformStyles(to_x, to_y, to_scale_x, to_scale_y, to_rotate_z, to_rotate_y, to_color, to_alpha)
+        FlushCommentLine(f, text, common_styles+transform_styles, c[0]+from_time, c[0]+from_time+action_time, styleid)
+        for action in actions:
+            action = dict(action)
+            # The previous target state becomes the start of this action
+            from_x, from_y = to_x, to_y
+            from_scale_x, from_scale_y = to_scale_x, to_scale_y
+            from_rotate_z, from_rotate_y = to_rotate_z, to_rotate_y
+            from_color, from_alpha = to_color, to_alpha
+            from_time += action_time
+            action_time = float(action.get('l', 0.0))
+            action_styles = []
+            if 'x' in action:
+                to_x = round(GetPosition(int(action['x']), False))
+            if 'y' in action:
+                to_y = round(GetPosition(int(action['y']), True))
+            if 'f' in action:
+                to_scale_x = round(float(action['f'])*100)
+                action_styles.append('\\fscx%s' % to_scale_x)
+            if 'g' in action:
+                to_scale_y = round(float(action['g'])*100)
+                action_styles.append('\\fscy%s' % to_scale_y)
+            if 'c' in action:
+                to_color = int(action['c'])
+                action_styles.append('\\c&H%02X%02X%02X&' % (to_color & 0xff, (to_color >> 8) & 0xff, (to_color >> 16) & 0xff))
+            if 't' in action:
+                to_alpha = float(action['t'])
+                action_styles.append('\\alpha&H%02X' % (255-round(to_alpha*255)))
+            if 'd' in action:
+                to_rotate_z = float(action['d'])
+            if 'e' in action:
+                to_rotate_y = float(action['e'])
+            if ('x' in action) or ('y' in action):
+                transform_styles = GetTransformStyles(None, None, from_scale_x, from_scale_y, None, None, from_color, from_alpha)
+                transform_styles.append('\\move(%s, %s, %s, %s)' % (from_x, from_y, to_x, to_y))
+                # y is now normalised against height (it was width before),
+                # matching GetTransformStyles
+                action_styles.append(GetRotationStyle(to_rotate_y, to_rotate_z, to_x, to_y))
+            elif ('d' in action) or ('e' in action):
+                # transform_styles from the previous line is reused unchanged here
+                action_styles.append(GetRotationStyle(to_rotate_y, to_rotate_z, to_x, to_y))
+            else:
+                transform_styles = GetTransformStyles(from_x, from_y, from_scale_x, from_scale_y, from_rotate_z, from_rotate_y, from_color, from_alpha)
+            if action_styles:
+                transform_styles.append('\\t(%s)' % (''.join(action_styles)))
+            FlushCommentLine(f, text, common_styles+transform_styles, c[0]+from_time, c[0]+from_time+action_time, styleid)
+    # KeyError covers a comment without the mandatory 'n' (text) entry
+    except (IndexError, KeyError, ValueError):
+        logging.warning(_('Invalid comment: %r') % c[3])
+
+
+def WriteCommentSH5VPositioned(f, c, width, height, styleid):
+    """Write one positioned sH5V comment to ``f`` as an ASS "Dialogue" line.
+
+    ``c`` is the extended tuple from the sH5V reader: ``c[0]`` start time,
+    ``c[3]`` text, ``c[5]`` color, ``c[6]`` font size, ``c[9]``/``c[10]``
+    relative x/y, ``c[11]`` duration in milliseconds, ``c[12]`` alpha and
+    ``c[14]``/``c[15]`` the z/y rotation. Comments raising IndexError or
+    ValueError are logged and skipped.
+    """
+
+    def GetTransformStyles(x=None, y=None, fsize=None, rotate_z=None, rotate_y=None, color=None, alpha=None):
+        tags = []
+        if not (x is None or y is None):
+            tags.append('\\pos(%s, %s)' % (x, y))
+        if fsize is not None:
+            tags.append('\\fs%s' % fsize)
+        if not (rotate_y is None or rotate_z is None):
+            tags.extend(['\\frz%s' % round(rotate_z), '\\fry%s' % round(rotate_y)])
+        if color is not None:
+            # The color tag is written in blue, green, red byte order
+            blue, green, red = color & 0xff, (color >> 8) & 0xff, (color >> 16) & 0xff
+            tags.append('\\c&H%02X%02X%02X&' % (blue, green, red))
+            if color == 0x000000:
+                tags.append('\\3c&HFFFFFF&')
+        if alpha is not None:
+            tags.append('\\alpha&H%02X' % (255-round(alpha*255)))
+        return tags
+
+    def FlushCommentLine(f, text, styles, start_time, end_time, styleid):
+        if not end_time > start_time:
+            return
+        fields = {'start': ConvertTimestamp(start_time), 'end': ConvertTimestamp(end_time), 'styles': ''.join(styles), 'text': text, 'styleid': styleid}
+        f.write('Dialogue: -1,%(start)s,%(end)s,%(styleid)s,,0,0,0,,{%(styles)s}%(text)s\n' % fields)
+
+    try:
+        text = ASSEscape(str(c[3]))
+        pos_x = round(float(c[9])*width)
+        pos_y = round(float(c[10])*height)
+        angle_z = -int(c[14])
+        angle_y = -int(c[15])
+        color = c[5]
+        opacity = float(c[12])
+        #Note: Alpha transition hasn't been worked out yet.
+        # sH5V stores an absolute font size, so it is scaled by the stage
+        # area relative to 640x480 (= 307200 pixels).
+        font_size = round(int(c[6])*math.sqrt(width*height/307200))
+        start = float(c[0])
+        duration = float(c[11])/1000
+        line_styles = GetTransformStyles(pos_x, pos_y, font_size, angle_z, angle_y, color, opacity)
+        FlushCommentLine(f, text, line_styles, start, start+duration, styleid)
+    except (IndexError, ValueError):
+        logging.warning(_('Invalid comment: %r') % c[3])
+
+
+# Result: (f, dx, dy)
+# To convert: NewX = f*x+dx, NewY = f*y+dy
+def GetZoomFactor(SourceSize, TargetSize):
+    """Return ``(f, dx, dy)`` that maps SourceSize coordinates into TargetSize.
+
+    The source area is scaled uniformly by ``f`` and centred on the axis
+    that has spare room, so ``NewX = f*x+dx`` and ``NewY = f*y+dy``. The
+    most recent argument pair and its result are cached as attributes on
+    the function. A division by zero yields ``(1, 0, 0)``.
+    """
+    key = (SourceSize, TargetSize)
+    try:
+        if key == GetZoomFactor.Cached_Size:
+            return GetZoomFactor.Cached_Result
+    except AttributeError:
+        pass  # nothing cached yet
+    GetZoomFactor.Cached_Size = key
+    try:
+        source_aspect = SourceSize[0]/SourceSize[1]
+        target_aspect = TargetSize[0]/TargetSize[1]
+        if target_aspect < source_aspect:  # narrower
+            scale = TargetSize[0]/SourceSize[0]
+            result = (scale, 0, (TargetSize[1]-TargetSize[0]/source_aspect)/2)
+        elif target_aspect > source_aspect:  # wider
+            scale = TargetSize[1]/SourceSize[1]
+            result = (scale, (TargetSize[0]-TargetSize[1]*source_aspect)/2, 0)
+        else:
+            result = (TargetSize[0]/SourceSize[0], 0, 0)
+    except ZeroDivisionError:
+        result = (1, 0, 0)
+    GetZoomFactor.Cached_Result = result
+    return result
+
+
+# Calculation is based on https://github.com/jabbany/CommentCoreLibrary/issues/5#issuecomment-40087282
+# and https://github.com/m13253/danmaku2ass/issues/7#issuecomment-41489422
+# Input: X relative horizontal coordinate: 0 for left edge, 1 for right edge.
+# Y relative vertical coordinate: 0 for top edge, 1 for bottom edge.
+# FOV = 1.0/math.tan(100*math.pi/360.0)
+# Result: (rotX, rotY, rotZ, shearX, shearY)
+# shearX is always 0; the three angles are rounded and wrapped by WrapAngle.
+def ConvertFlashRotation(rotY, rotZ, X, Y, FOV=math.tan(2*math.pi/9.0)):
+ # Wrap an angle in degrees into the half-open range (-180, 180]
+ def WrapAngle(deg):
+ return 180-((180-deg)%360)
+ # Adjust angle alpha for horizontal offset X and the given FOV; a cosine
+ # outside [-1, 1] is clipped and reported through logging.error
+ def CalcPerspectiveCorrection(alpha, X, FOV=FOV):
+ alpha = WrapAngle(alpha)
+ if FOV is None:
+ return alpha
+ if 0 <= alpha <= 180:
+ costheta = (FOV*math.cos(alpha*math.pi/180.0)-X*math.sin(alpha*math.pi/180.0))/(FOV+max(2, abs(X)+1)*math.sin(alpha*math.pi/180.0))
+ try:
+ if costheta > 1:
+ costheta = 1
+ raise ValueError
+ elif costheta < -1:
+ costheta = -1
+ raise ValueError
+ except ValueError:
+ logging.error('Clipped rotation angle: (alpha=%s, X=%s), it is a bug!' % (alpha, X))
+ theta = math.acos(costheta)*180/math.pi
+ else:
+ costheta = (FOV*math.cos(alpha*math.pi/180.0)-X*math.sin(alpha*math.pi/180.0))/(FOV-max(2, abs(X)+1)*math.sin(alpha*math.pi/180.0))
+ try:
+ if costheta > 1:
+ costheta = 1
+ raise ValueError
+ elif costheta < -1:
+ costheta = -1
+ raise ValueError
+ except ValueError:
+ logging.error('Clipped rotation angle: (alpha=%s, X=%s), it is a bug!' % (alpha, X))
+ theta = -math.acos(costheta)*180/math.pi
+ return WrapAngle(theta)
+ # Re-centre the relative coordinates from 0..1 to -1..1
+ X = 2*X-1
+ Y = 2*Y-1
+ rotY = WrapAngle(rotY)
+ rotZ = WrapAngle(rotZ)
+ # With at most one non-zero rotation the angles are only negated; otherwise
+ # they are converted to radians and recombined with atan2/asin
+ if rotY == 0 or rotZ == 0:
+ outX = 0
+ outY = -rotY # Positive value means clockwise in Flash
+ outZ = -rotZ
+ else:
+ rotY = rotY*math.pi/180.0
+ rotZ = rotZ*math.pi/180.0
+ outY = math.atan2(-math.sin(rotY)*math.cos(rotZ), math.cos(rotY))*180/math.pi
+ outZ = math.atan2(-math.cos(rotY)*math.sin(rotZ), math.cos(rotZ))*180/math.pi
+ outX = math.asin(math.sin(rotY)*math.sin(rotZ))*180/math.pi
+ if FOV is not None:
+ #outX = CalcPerspectiveCorrection(outX, -Y, FOV*0.75)
+ outY = CalcPerspectiveCorrection(outY, X, FOV)
+ return (WrapAngle(round(outX)), WrapAngle(round(outY)), WrapAngle(round(outZ)), 0, round(-0.75*Y*math.sin(outY*math.pi/180.0), 3))
+
+
+def ProcessComments(comments, f, width, height, bottomReserved, fontface, fontsize, alpha, lifetime, reduced, progress_callback):
+ # Write the ASS header and then one or more Dialogue lines per comment to f.
+ # comments: sized sequence of comment tuples (len() is taken for progress)
+ # bottomReserved: number of rows at the bottom that comments may not use
+ # reduced: when true, a comment that finds no free row is dropped instead
+ # of being placed on an alternative row
+ # progress_callback: optional callable(done, total), invoked every 1000
+ # comments and once more at the end
+ # The style name gets a random 16-bit hex suffix
+ styleid = 'Danmaku2ASS_%04x' % random.randint(0, 0xffff)
+ WriteASSHead(f, width, height, fontface, fontsize, alpha, styleid)
+ # Four row tables, indexed by the integer position type in i[4]; each slot
+ # holds the comment last marked on that row, or None
+ rows = [[None]*(height-bottomReserved+1) for i in range(4)]
+ for idx, i in enumerate(comments):
+ if progress_callback and idx % 1000 == 0:
+ progress_callback(idx, len(comments))
+ if isinstance(i[4], int):
+ row = 0
+ rowmax = height-bottomReserved-i[7]
+ # Scan downwards for i[7] consecutive free rows; the else branch of
+ # the while loop runs only when the scan ends without a break
+ while row <= rowmax:
+ freerows = TestFreeRows(rows, i, row, width, height, bottomReserved, lifetime)
+ if freerows >= i[7]:
+ MarkCommentRow(rows, i, row)
+ WriteComment(f, i, row, width, height, bottomReserved, fontsize, lifetime, styleid)
+ break
+ else:
+ row += freerows or 1
+ else:
+ if not reduced:
+ row = FindAlternativeRow(rows, i, height, bottomReserved)
+ MarkCommentRow(rows, i, row)
+ WriteComment(f, i, row, width, height, bottomReserved, fontsize, lifetime, styleid)
+ # Non-integer position markers select a site-specific positioned writer
+ elif i[4] == 'bilipos':
+ WriteCommentBilibiliPositioned(f, i, width, height, styleid)
+ elif i[4] == 'acfunpos':
+ WriteCommentAcfunPositioned(f, i, width, height, styleid)
+ elif i[4] == 'sH5Vpos':
+ WriteCommentSH5VPositioned(f, i, width, height, styleid)
+ else:
+ logging.warning(_('Invalid comment: %r') % i[3])
+ if progress_callback:
+ progress_callback(len(comments), len(comments))
+
+
+def TestFreeRows(rows, c, row, width, height, bottomReserved, lifetime):
+ # Count the consecutive rows, starting at `row`, that comment c may occupy.
+ # Counting stops at the first conflicting row, at the last usable row
+ # (height-bottomReserved) or once c[7] rows have been counted.
+ # Returns that count.
+ res = 0
+ rowmax = height-bottomReserved
+ targetRow = None
+ if c[4] in (1, 2):
+ # Position types 1 and 2: a row is busy while the comment marked on it
+ # started less than `lifetime` before c
+ while row < rowmax and res < c[7]:
+ if targetRow != rows[c[4]][row]:
+ targetRow = rows[c[4]][row]
+ if targetRow and targetRow[0]+lifetime > c[0]:
+ break
+ row += 1
+ res += 1
+ else:
+ # Other position types: the conflict test also involves the widths of
+ # both comments (c[8] and targetRow[8]) relative to the stage width
+ try:
+ thresholdTime = c[0]-lifetime*(1-width/(c[8]+width))
+ except ZeroDivisionError:
+ thresholdTime = c[0]-lifetime
+ while row < rowmax and res < c[7]:
+ if targetRow != rows[c[4]][row]:
+ targetRow = rows[c[4]][row]
+ try:
+ if targetRow and (targetRow[0] > thresholdTime or targetRow[0]+targetRow[8]*lifetime/(targetRow[8]+width) > c[0]):
+ break
+ except ZeroDivisionError:
+ pass
+ row += 1
+ res += 1
+ return res
+
+
+def FindAlternativeRow(rows, c, height, bottomReserved):
+    """Pick a fallback row for comment ``c`` in the table ``rows[c[4]]``.
+
+    Returns the first empty row among the candidates; when all of them are
+    occupied, returns the row whose occupant has the smallest start time
+    (element 0). Candidates are rows 0 up to
+    ``height-bottomReserved-ceil(c[7])``, exclusive.
+    """
+    candidate_count = height-bottomReserved-math.ceil(c[7])
+    lane = c[4]
+    oldest = 0
+    for candidate in range(candidate_count):
+        occupant = rows[lane][candidate]
+        if not occupant:
+            return candidate
+        if occupant[0] < rows[lane][oldest][0]:
+            oldest = candidate
+    return oldest
+
+
+def MarkCommentRow(rows, c, row):
+    """Record ``c`` as the occupant of ``ceil(c[7])`` rows starting at ``row``.
+
+    The rows are marked in ``rows[c[4]]``. Marking stops silently at the
+    first index that falls outside the table.
+    """
+    try:
+        first, past_last = row, row+math.ceil(c[7])
+        for occupied in range(first, past_last):
+            rows[c[4]][occupied] = c
+    except IndexError:
+        pass
+
+
+def WriteASSHead(f, width, height, fontface, fontsize, alpha, styleid):
+    """Write the ASS script header, the single style line and the events header to ``f``.
+
+    ``fontsize`` is rounded, ``alpha`` (0..1) is inverted into a 0..255
+    value that is written as two hex digits, and the outline width is
+    ``round(fontsize/25)``.
+    """
+    values = {
+        'width': width,
+        'height': height,
+        'fontface': fontface,
+        'fontsize': round(fontsize),
+        'alpha': 255-round(alpha*255),
+        'outline': round(fontsize/25),
+        'styleid': styleid,
+    }
+    template = '''
+[Script Info]
+; Script generated by Danmaku2ASS
+; https://github.com/m13253/danmaku2ass
+Script Updated By: Danmaku2ASS (https://github.com/m13253/danmaku2ass)
+ScriptType: v4.00+
+WrapStyle: 2
+Collisions: Normal
+PlayResX: %(width)s
+PlayResY: %(height)s
+ScaledBorderAndShadow: yes
+[V4+ Styles]
+Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
+Style: %(styleid)s, %(fontface)s, %(fontsize)s, &H%(alpha)02XFFFFFF, &H%(alpha)02XFFFFFF, &H%(alpha)02X000000, &H%(alpha)02X000000, 0, 0, 0, 0, 100, 100, 0.00, 0.00, 1, %(outline)s, 0, 7, 0, 0, 0, 0
+[Events]
+Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+'''
+    f.write(template % values)
+
+
+def WriteComment(f, c, row, width, height, bottomReserved, fontsize, lifetime, styleid):
+    """Write one regular (non-positioned) comment to ``f`` as an ASS "Dialogue" line.
+
+    The position type ``c[4]`` selects the placement tag: 1 is pinned with
+    ``\\an8``, 2 with ``\\an2`` on the row mirrored by ``ConvertType2``, 3
+    moves from the left edge to the right, and anything else moves from the
+    right edge to the left. Size and color tags are added only when they
+    differ from the style defaults.
+    """
+    text = ASSEscape(c[3])
+    mode = c[4]
+    if mode == 1:
+        placement = '\\an8\\pos(%(halfwidth)s, %(row)s)' % {'halfwidth': round(width/2), 'row': row}
+    elif mode == 2:
+        placement = '\\an2\\pos(%(halfwidth)s, %(row)s)' % {'halfwidth': round(width/2), 'row': ConvertType2(row, height, bottomReserved)}
+    elif mode == 3:
+        placement = '\\move(%(neglen)s, %(row)s, %(width)s, %(row)s)' % {'width': width, 'row': row, 'neglen': -math.ceil(c[8])}
+    else:
+        placement = '\\move(%(width)s, %(row)s, %(neglen)s, %(row)s)' % {'width': width, 'row': row, 'neglen': -math.ceil(c[8])}
+    styles = [placement]
+    size_delta = c[6]-fontsize
+    if not (-1 < size_delta < 1):
+        styles.append('\\fs%s' % round(c[6]))
+    color = c[5]
+    if color != 0xffffff:
+        # The color tag is written in blue, green, red byte order
+        styles.append('\\c&H%02X%02X%02X&' % (color & 0xff, (color >> 8) & 0xff, (color >> 16) & 0xff))
+        if color == 0x000000:
+            styles.append('\\3c&HFFFFFF&')
+    line = {'start': ConvertTimestamp(c[0]), 'end': ConvertTimestamp(c[0]+lifetime), 'styles': ''.join(styles), 'text': text, 'styleid': styleid}
+    f.write('Dialogue: 2,%(start)s,%(end)s,%(styleid)s,,0000,0000,0000,,{%(styles)s}%(text)s\n' % line)
+
+
+ def ASSEscape(s):
+     """Escape ``s`` for use as ASS dialogue text.
+
+     Backslashes and curly braces are backslash-escaped, newlines become the
+     ASS line break ``\\N``, and every empty line is replaced by one space.
+     """
+     escaped = str(s)
+     for raw, safe in (('\\', '\\\\'), ('{', '\\{'), ('}', '\\}')):
+         escaped = escaped.replace(raw, safe)
+     pieces = [piece if piece else ' ' for piece in escaped.split('\n')]
+     return '\\N'.join(pieces)
+
+
+ def CalculateLength(s):
+     """Return the character count of the longest line in ``s``."""
+     # May not be accurate
+     return max(len(line) for line in s.split('\n'))
+
+
+ def ConvertTimestamp(timestamp):
+     """Format ``timestamp`` (seconds) as ``H:MM:SS.cc``.
+
+     The value is rounded to whole centiseconds before being split.
+     """
+     centis = round(timestamp*100.0)
+     hour = centis // 360000
+     remainder = centis % 360000
+     minute = remainder // 6000
+     remainder = remainder % 6000
+     second = remainder // 100
+     centsecond = remainder % 100
+     return '%d:%02d:%02d.%02d' % (int(hour), int(minute), int(second), int(centsecond))
+
+
+ def ConvertType2(row, height, bottomReserved):
+     """Return ``row`` measured from the bottom of the usable stage area."""
+     usable_height = height - bottomReserved
+     return usable_height - row
+
+
+ def ConvertToFile(filename_or_file, *args, **kwargs):
+     """Return an open file for ``filename_or_file``.
+
+     A ``bytes`` name is decoded as UTF-8 (undecodable bytes replaced); a
+     ``str`` name is opened with ``open(name, *args, **kwargs)``. Anything
+     else is returned unchanged.
+     """
+     target = filename_or_file
+     if isinstance(target, bytes):
+         target = str(bytes(target).decode('utf-8', 'replace'))
+     if not isinstance(target, str):
+         return target
+     return open(target, *args, **kwargs)
+
+
+ def FilterBadChars(f):
+     """Read all of ``f`` and return it as a new ``io.StringIO``.
+
+     Control characters in the ranges matched by the pattern below (tab,
+     line feed and carriage return are not matched) are replaced by U+FFFD.
+     """
+     bad_chars = re.compile('[\\x00-\\x08\\x0b\\x0c\\x0e-\\x1f]')
+     cleaned = bad_chars.sub('\ufffd', f.read())
+     return io.StringIO(cleaned)
+
+
+ class safe_list(list):
+     """A ``list`` with a ``dict``-style ``get`` that tolerates bad indices."""
+
+     def get(self, index, default=None):
+         """Return ``self[index]``, or ``default`` when the index is out of range."""
+         try:
+             value = self[index]
+         except IndexError:
+             value = default
+         return value
+
+
+ def export(func):
+     """Decorator that adds ``func.__name__`` to the module's ``__all__``.
+
+     ``__all__`` is created on first use. ``func`` is returned unchanged.
+     """
+     global __all__
+     public_name = func.__name__
+     if '__all__' in globals():
+         __all__.append(public_name)
+     else:
+         __all__ = [public_name]
+     return func
+
+
+ @export
+ def Danmaku2ASS(input_files, output_file, stage_width, stage_height, reserve_blank=0, font_face=_('(FONT) sans-serif')[7:], font_size=25.0, text_opacity=1.0, comment_duration=5.0, is_reduce_comments=False, progress_callback=None):
+     """Convert comment files to an ASS subtitle script.
+
+     Args:
+         input_files: passed to ``ReadComments`` together with ``font_size``.
+         output_file: a file name (``str``/``bytes``) to create, an already
+             open file object to write to, or a falsy value for ``sys.stdout``.
+         stage_width, stage_height, reserve_blank, font_face, font_size,
+         text_opacity, comment_duration, is_reduce_comments,
+         progress_callback: forwarded unchanged to ``ProcessComments``.
+
+     The comments are read before the output is opened, so a failure while
+     reading never truncates an existing output file. Only a file opened
+     here is closed; a caller-supplied file object and ``sys.stdout`` are
+     left open.
+     """
+     fo = None
+     comments = ReadComments(input_files, font_size)
+     try:
+         if output_file:
+             fo = ConvertToFile(output_file, 'w', encoding='utf-8-sig', errors='replace', newline='\r\n')
+         else:
+             fo = sys.stdout
+         ProcessComments(comments, fo, stage_width, stage_height, reserve_blank, font_face, font_size, text_opacity, comment_duration, is_reduce_comments, progress_callback)
+     finally:
+         # fo is still None when opening the output itself failed; calling
+         # close() on it would replace the real error with an AttributeError.
+         if fo is not None and output_file and fo is not output_file:
+             fo.close()
+
+
+ @export
+ def ReadComments(input_files, font_size=25.0, progress_callback=None):
+     """Read every comment file and return all comments as one sorted list.
+
+     ``input_files`` may be a single name (``str`` or UTF-8 ``bytes``) or an
+     iterable of names / file objects. When given, ``progress_callback`` is
+     called as ``progress_callback(done, total)`` before each file and once
+     more after the last one.
+
+     Raises:
+         ValueError: if no comment processor is found for a file.
+     """
+     sources = input_files
+     if isinstance(sources, bytes):
+         sources = str(bytes(sources).decode('utf-8', 'replace'))
+     sources = [sources] if isinstance(sources, str) else list(sources)
+     total = len(sources)
+     comments = []
+     for position, source in enumerate(sources):
+         if progress_callback:
+             progress_callback(position, total)
+         with ConvertToFile(source, 'r', encoding='utf-8', errors='replace') as handle:
+             processor = GetCommentProcessor(handle)
+             if not processor:
+                 raise ValueError(_('Unknown comment file format: %s') % source)
+             comments.extend(processor(FilterBadChars(handle), font_size))
+     if progress_callback:
+         progress_callback(total, total)
+     comments.sort()
+     return comments
+
+
+ @export
+ def GetCommentProcessor(input_file):
+     """Return the ``CommentFormatMap`` entry for the format probed from ``input_file``."""
+     detected_format = ProbeCommentFormat(input_file)
+     return CommentFormatMap[detected_format]
+
+
+ def main():
+     """Command-line entry point: parse the arguments and run ``Danmaku2ASS``.
+
+     With no arguments at all the help text is shown. The stage size is
+     given as ``WIDTHxHEIGHT``.
+
+     Raises:
+         ValueError: if the stage size cannot be parsed as two integers.
+     """
+     if len(sys.argv) == 1:
+         sys.argv.append('--help')
+     parser = argparse.ArgumentParser()
+     parser.add_argument('-o', '--output', metavar=_('OUTPUT'), help=_('Output file'))
+     parser.add_argument('-s', '--size', metavar=_('WIDTHxHEIGHT'), required=True, help=_('Stage size in pixels'))
+     parser.add_argument('-fn', '--font', metavar=_('FONT'), help=_('Specify font face [default: %s]') % _('(FONT) sans-serif')[7:], default=_('(FONT) sans-serif')[7:])
+     parser.add_argument('-fs', '--fontsize', metavar=_('SIZE'), help=(_('Default font size [default: %s]') % 25), type=float, default=25.0)
+     parser.add_argument('-a', '--alpha', metavar=_('ALPHA'), help=_('Text opacity'), type=float, default=1.0)
+     parser.add_argument('-l', '--lifetime', metavar=_('SECONDS'), help=_('Duration of comment display [default: %s]') % 5, type=float, default=5.0)
+     parser.add_argument('-p', '--protect', metavar=_('HEIGHT'), help=_('Reserve blank on the bottom of the stage'), type=int, default=0)
+     # 'store_true' is the argparse action for a boolean flag; the previous
+     # value was a URL-like string that argparse rejects as an unknown action.
+     parser.add_argument('-r', '--reduce', action='store_true', help=_('Reduce the amount of comments if stage is full'))
+     parser.add_argument('file', metavar=_('FILE'), nargs='+', help=_('Comment file to be processed'))
+     args = parser.parse_args()
+     try:
+         width, height = str(args.size).split('x', 1)
+         width = int(width)
+         height = int(height)
+     except ValueError:
+         raise ValueError(_('Invalid stage size: %r') % args.size)
+     Danmaku2ASS(args.file, args.output, width, height, args.protect, args.font, args.fontsize, args.alpha, args.lifetime, args.reduce)
+
+
+ if __name__ == '__main__':
+     main()
diff --git a/2020/dmzj/cartoon.py b/2020/dmzj/cartoon.py
new file mode 100644
index 00000000..a1546a0b
--- /dev/null
+++ b/2020/dmzj/cartoon.py
@@ -0,0 +1,74 @@
+import requests
+import os
+import re
+from bs4 import BeautifulSoup
+from contextlib import closing
+from tqdm import tqdm
+import time
+
+"""
+ Author:
+ Jack Cui
+ Wechat:
+ https://mp.weixin.qq.com/s/OCWwRVDFNslIuKyiCVUoTA
+"""
+
+ # Create the output directory (named after the comic) if it is missing.
+ save_dir = '妖神记'
+ os.makedirs(save_dir, exist_ok=True)
+
+ target_url = "https://www.dmzj.com/info/yaoshenji.html"
+
+ # Collect the chapter links and titles. The lists end up in the reverse
+ # of page order, exactly as the original "insert at index 0" loop produced.
+ r = requests.get(url=target_url)
+ bs = BeautifulSoup(r.text, 'lxml')
+ list_con_li = bs.find('ul', class_="list_con_li")
+ cartoon_list = list_con_li.find_all('a')
+ chapter_names = []
+ chapter_urls = []
+ for cartoon in reversed(cartoon_list):
+     chapter_names.append(cartoon.text)
+     chapter_urls.append(cartoon.get('href'))
+
+ # Download every chapter that does not have a directory yet.
+ # NOTE(review): indentation was lost in this patch text; the nesting below
+ # assumes the whole download ran only for chapters without a directory --
+ # confirm against the original file.
+ for i, url in enumerate(tqdm(chapter_urls)):
+     download_header = {
+         'Referer': url
+     }
+     # Dots are removed from the title before it is used as a directory name.
+     name = chapter_names[i].replace('.', '')
+     chapter_save_dir = os.path.join(save_dir, name)
+     if name in os.listdir(save_dir):
+         continue
+     os.mkdir(chapter_save_dir)
+     r = requests.get(url=url)
+     html = BeautifulSoup(r.text, 'lxml')
+     script_info = str(html.script)
+     # Image ids are 13 or 14 digits long. Sort them as if the 13-digit ones
+     # were right-padded with a zero, but keep the ids themselves untouched so
+     # that a genuine 14-digit id ending in 0 is not truncated in the URL.
+     pics = sorted(re.findall(r'\d{13,14}', script_info), key=lambda p: int(p.ljust(14, '0')))
+     chapterpic_hou = re.findall(r'\|(\d{5})\|', script_info)[0]
+     chapterpic_qian = re.findall(r'\|(\d{4})\|', script_info)[0]
+     for idx, pic in enumerate(pics):
+         # A separate name keeps the chapter URL (used as Referer) intact.
+         pic_url = 'https://images.dmzj.com/img/chapterpic/' + chapterpic_qian + '/' + chapterpic_hou + '/' + pic + '.jpg'
+         pic_name = '%03d.jpg' % (idx + 1)
+         pic_save_path = os.path.join(chapter_save_dir, pic_name)
+         with closing(requests.get(pic_url, headers=download_header, stream=True)) as response:
+             if response.status_code == 200:
+                 with open(pic_save_path, "wb") as file:
+                     for data in response.iter_content(chunk_size=1024):
+                         file.write(data)
+             else:
+                 print('链接异常')
+     # Pause between chapters to go easy on the server.
+     time.sleep(10)
\ No newline at end of file
diff --git a/2020/images/gzh-1.jpg b/2020/images/gzh-1.jpg
new file mode 100644
index 00000000..b49e5753
Binary files /dev/null and b/2020/images/gzh-1.jpg differ
diff --git a/2020/taobao/1.png b/2020/taobao/1.png
new file mode 100644
index 00000000..2d207c97
Binary files /dev/null and b/2020/taobao/1.png differ
diff --git a/2020/taobao/taobao_login.py b/2020/taobao/taobao_login.py
new file mode 100644
index 00000000..a24d2016
--- /dev/null
+++ b/2020/taobao/taobao_login.py
@@ -0,0 +1,99 @@
+from selenium import webdriver
+import logging
+import time
+from selenium.common.exceptions import NoSuchElementException, WebDriverException
+from retrying import retry
+from selenium.webdriver import ActionChains
+
+import pyautogui
+pyautogui.PAUSE = 0.5
+
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+"""
+微信公众号 JackCui-AI
+更多精彩教程、源码尽在微信公众号
+"""
+
+ class taobao():
+     """Drive a Chrome session through Taobao login and cart checkout.
+
+     Page interaction goes through Selenium; the final click on the login
+     button is done with pyautogui by locating the image ``1.png`` on screen.
+     """
+
+     def __init__(self, driver_path=r"path\to\your\chromedriver.exe"):
+         """Start Chrome and prepare the session.
+
+         Args:
+             driver_path: location of the chromedriver executable. The
+                 default is a placeholder; it is a raw string so that the
+                 backslashes are not read as escape sequences (``\\t``).
+         """
+         self.browser = webdriver.Chrome(driver_path)
+         # Maximise the window.
+         self.browser.maximize_window()
+         self.browser.implicitly_wait(5)
+         self.domain = 'http://www.taobao.com/'
+         self.action_chains = ActionChains(self.browser)
+
+     def login(self, username, password):
+         """Log in with ``username``/``password``, retrying until a nickname is shown."""
+         while True:
+             self.browser.get(self.domain)
+             time.sleep(1)
+
+             # These steps can also be written with class-name / id lookups:
+             #self.browser.find_element_by_class_name('h').click()
+             #self.browser.find_element_by_id('fm-login-id').send_keys(username)
+             #self.browser.find_element_by_id('fm-login-password').send_keys(password)
+             self.browser.find_element_by_xpath('//*[@id="J_SiteNavLogin"]/div[1]/div[1]/a[1]').click()
+             self.browser.find_element_by_xpath('//*[@id="fm-login-id"]').send_keys(username)
+             self.browser.find_element_by_xpath('//*[@id="fm-login-password"]').send_keys(password)
+             time.sleep(1)
+
+             try:
+                 # A slider captcha may appear; drag it across.
+                 slider = self.browser.find_element_by_xpath("//span[contains(@class, 'btn_slide')]")
+                 if slider.is_displayed():
+                     # Drag the slider.
+                     self.action_chains.drag_and_drop_by_offset(slider, 258, 0).perform()
+                     time.sleep(0.5)
+                     # Release the slider, i.e. let go of the mouse after the drag.
+                     self.action_chains.release().perform()
+             except (NoSuchElementException, WebDriverException):
+                 logger.info('未出现登录验证码')
+
+             # Clicking the login button through Selenium does not log in,
+             # so the click is performed with pyautogui instead:
+             #self.browser.find_element_by_class_name('password-login').click()
+             #self.browser.find_element_by_xpath('//*[@id="login-form"]/div[4]/button').click()
+             # Screenshot of the login button to look for on screen.
+             coords = pyautogui.locateOnScreen('1.png')
+             if coords is None:
+                 # Nothing matched on screen; skip the click and let the
+                 # nickname check below trigger a retry.
+                 logger.warning('未在屏幕上找到登录按钮图片 1.png')
+             else:
+                 x, y = pyautogui.center(coords)
+                 pyautogui.leftClick(x, y)
+
+             nickname = self.get_nickname()
+             if nickname:
+                 logger.info('登录成功,呢称为:' + nickname)
+                 break
+             # Logged as a warning: the module configures logging at INFO, so
+             # a DEBUG message here would never be shown.
+             logger.warning('登录出错,5s后继续登录')
+             time.sleep(5)
+
+     def get_nickname(self):
+         """Return the text of the ``site-nav-user`` element, or ``''`` if it is absent."""
+         self.browser.get(self.domain)
+         time.sleep(0.5)
+         try:
+             return self.browser.find_element_by_class_name('site-nav-user').text
+         except NoSuchElementException:
+             return ''
+
+     def clear_cart(self):
+         """Open the cart, select everything, go to checkout and submit the order."""
+         cart = self.browser.find_element_by_xpath('//*[@id="J_MiniCart"]')
+         if cart.is_displayed():
+             cart.click()
+         select = self.browser.find_element_by_xpath('//*[@id="J_SelectAll1"]/div/label')
+         if select.is_displayed():
+             select.click()
+         time.sleep(0.5)
+         go = self.browser.find_element_by_xpath('//*[@id="J_Go"]')
+         if go.is_displayed():
+             go.click()
+         submit = self.browser.find_element_by_xpath('//*[@id="submitOrderPC_1"]/div/a[2]')
+         if submit.is_displayed():
+             submit.click()
+
+
+ if __name__ == '__main__':
+ # Script entry point: log in with the credentials below.
+ # Fill in your own username and password.
+ username = 'username'
+ password = 'password'
+ tb = taobao()
+ tb.login(username, password)
+ #tb.clear_cart()
diff --git a/2020/xbqg/xbqg_spider.py b/2020/xbqg/xbqg_spider.py
new file mode 100644
index 00000000..5dcd10b7
--- /dev/null
+++ b/2020/xbqg/xbqg_spider.py
@@ -0,0 +1,40 @@
+import requests
+import time
+from tqdm import tqdm
+from bs4 import BeautifulSoup
+
+"""
+ Author:
+ Jack Cui
+ Wechat:
+ https://mp.weixin.qq.com/s/OCWwRVDFNslIuKyiCVUoTA
+"""
+
+ def get_content(target):
+     """Download the chapter page at ``target`` and return its text as a list.
+
+     The response is decoded as UTF-8, the ``<div id="content">`` text is
+     stripped and then split on runs of four non-breaking spaces.
+     """
+     response = requests.get(url = target)
+     response.encoding = 'utf-8'
+     soup = BeautifulSoup(response.text, 'lxml')
+     body = soup.find('div', id='content')
+     separator = '\xa0'*4
+     return body.text.strip().split(separator)
+
+ if __name__ == '__main__':
+     # Script entry point: fetch the chapter index, then append every
+     # chapter (title followed by its text) to one UTF-8 text file.
+     # Imported here because only this entry point needs it.
+     from urllib.parse import urljoin
+
+     server = 'https://www.xsbiquge.com/'
+     book_name = '诡秘之主.txt'
+     target = 'https://www.xsbiquge.com/15_15338/'
+     req = requests.get(url=target)
+     req.encoding = 'utf-8'
+     html = req.text
+     chapter_bs = BeautifulSoup(html, 'lxml')
+     chapters = chapter_bs.find('div', id='list')
+     chapters = chapters.find_all('a')
+     # Open the output once instead of once per chapter.
+     with open(book_name, 'a', encoding='utf-8') as f:
+         for chapter in tqdm(chapters):
+             # get_text() always yields a str; .string is None when the link
+             # contains nested tags, which would make f.write() fail.
+             chapter_name = chapter.get_text()
+             # urljoin copes with an href that does or does not start with '/'.
+             url = urljoin(server, chapter.get('href'))
+             content = get_content(url)
+             f.write(chapter_name)
+             f.write('\n')
+             f.write('\n'.join(content))
+             f.write('\n')
\ No newline at end of file
diff --git a/2020/zycjw/video_download.py b/2020/zycjw/video_download.py
new file mode 100644
index 00000000..89914ab7
--- /dev/null
+++ b/2020/zycjw/video_download.py
@@ -0,0 +1,64 @@
+import os
+import ffmpy3
+import requests
+from bs4 import BeautifulSoup
+from multiprocessing.dummy import Pool as ThreadPool
+
+ # Search the video site for the keyword, then collect the m3u8 stream URLs
+ # found on the result's detail page into serach_res (URL -> episode number).
+ # NOTE(review): the URL literals below start with "http://", which
+ # looks like an extraction artefact rather than a valid URL -- confirm
+ # against the original file.
+ # NOTE(review): indentation was lost in this patch text, so the nesting of
+ # the loops and ifs below cannot be read off here -- confirm before editing.
+ search_keyword = '越狱第一季'
+ search_url = 'http://www.jisudhw.com/index.php'
+ serach_params = {
+ 'm': 'vod-search'
+ }
+ serach_headers = {
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
+ 'Referer': 'http://www.jisudhw.com/',
+ 'Origin': 'http://www.jisudhw.com/',
+ 'Host': 'www.jisudhw.com'
+ }
+ serach_datas = {
+ 'wd': search_keyword,
+ 'submit': 'search'
+ }
+
+
+ # Directory the episodes are saved into; set to the title of a search result below.
+ video_dir = ''
+
+ # POST the search form and parse the result list.
+ r = requests.post(url=search_url, params=serach_params, headers=serach_headers, data=serach_datas)
+ r.encoding = 'utf-8'
+ server = 'http://www.jisudhw.com/'
+ search_html = BeautifulSoup(r.text, 'lxml')
+ search_spans = search_html.find_all('span', class_='xing_vb4')
+ for span in search_spans:
+ url = server + span.a.get('href')
+ name = span.a.string
+ print(name)
+ print(url)
+ video_dir = name
+ # Create a directory named after the result if it does not exist yet.
+ if name not in os.listdir('./'):
+ os.mkdir(name)
+
+ # Fetch the detail page and scan its <input> elements for stream URLs.
+ detail_url = url
+ r = requests.get(url = detail_url)
+ r.encoding = 'utf-8'
+ detail_bf = BeautifulSoup(r.text, 'lxml')
+ num = 1
+ serach_res = {}
+ for each_url in detail_bf.find_all('input'):
+ # NOTE(review): get('value') returns None for an <input> without a value
+ # attribute, which would make this membership test raise TypeError.
+ if 'm3u8' in each_url.get('value'):
+ url = each_url.get('value')
+ if url not in serach_res.keys():
+ serach_res[url] = num
+ print('第%03d集:' % num)
+ print(url)
+ num += 1
+
+ def downVideo(url):
+     """Download the stream at ``url`` with FFmpeg into ``video_dir``.
+
+     The output file is named after the episode number that ``serach_res``
+     holds for ``url``.
+     """
+     episode = serach_res[url]
+     file_name = '第%03d集.mp4' % episode
+     target_path = os.path.join(video_dir, file_name)
+     job = ffmpy3.FFmpeg(inputs={url: None}, outputs={target_path: None})
+     job.run()
+
+ # Download all collected episodes with a pool of 8 worker threads.
+ pool = ThreadPool(8)
+ results = pool.map(downVideo, serach_res.keys())
+ # Stop accepting work and wait for the workers to finish.
+ pool.close()
+ pool.join()
\ No newline at end of file
diff --git a/README.md b/README.md
index 7158f4cf..1d0f06b8 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,28 @@
-# Python Spider
+# 注:2020年最新连载教程请移步:[Python Spider 2020](https://github.com/Jack-Cherish/python-spider/tree/master/2020 "Python Spider 2020")
-* 贵有恒,何必三更起五更睡;最无益,只怕一日暴十寒。
-* Python3爬虫实战:实战源码+博客讲解
-* [个人网站](http://cuijiahua.com "悬停显示")
-* [CSDN博客](http://blog.csdn.net/c406495762 "悬停显示")
-* [CSDN爬虫专栏](http://blog.csdn.net/column/details/15321.html "悬停显示")
-* 学习交流群【328127489】
-* 分享技术,乐享生活:Jack Cui公众号每周五推送“程序员欢乐送”系列资讯类文章,欢迎您的关注!
-
diff --git a/bilibili_luckyman/README.md b/bilibili_luckyman/README.md
new file mode 100644
index 00000000..776424e5
--- /dev/null
+++ b/bilibili_luckyman/README.md
@@ -0,0 +1,7 @@
+## 说明
+
+B 站 30 万粉丝抽奖,自己写了一个转发抽奖助手。
+
+上次活动:
+
+https://t.bilibili.com/675922191916728342
diff --git a/bilibili_luckyman/bilibili_luckyman.py b/bilibili_luckyman/bilibili_luckyman.py
new file mode 100644
index 00000000..7c254360
--- /dev/null
+++ b/bilibili_luckyman/bilibili_luckyman.py
@@ -0,0 +1,89 @@
+# -*- coding:utf-8 -*-
+import requests
+import json
+import re
+import random
+import time
+
+ def get_dynamic_id(url):
+     """Return every run of digits found in ``url`` as a list of strings."""
+     digit_runs = re.compile(r'\d+').findall(url)
+     return digit_runs
+
+ def get_data(detail_url, params):
+     """GET ``detail_url`` with ``params`` and return ``(offset, items)``.
+
+     Both values are read from the ``data`` object of the JSON response.
+     """
+     response = requests.get(url = detail_url, params = params)
+     payload = json.loads(response.text)['data']
+     return payload['offset'], payload['items']
+
+ def get_uses(dynamic_id, fetch_page=None):
+     """Collect the users who forwarded the dynamic ``dynamic_id``.
+
+     Pages are requested until one comes back with an empty ``offset``.
+     The items of that final page are collected as well; previously they
+     were dropped, so a dynamic whose forwards fit on one page yielded
+     nothing.
+
+     Args:
+         dynamic_id: value sent as the ``id`` query parameter.
+         fetch_page: optional callable ``(url, params) -> (offset, items)``
+             used to load one page; defaults to ``get_data``.
+
+     Returns:
+         Three parallel lists: user names, user mids and forward texts.
+     """
+     if fetch_page is None:
+         fetch_page = get_data
+     detail_url = "https://api.bilibili.com/x/polymer/web-dynamic/v1/detail/forward"
+     params = {'id': dynamic_id}
+
+     all_user_name = []
+     all_user_text = []
+     all_user_mid = []
+
+     while True:
+         offset, items = fetch_page(detail_url, params)
+         # "items or ()" tolerates a page whose item list is missing (None).
+         for item in items or ():
+             all_user_name.append(item['user']['name'])
+             all_user_mid.append(item['user']['mid'])
+             all_user_text.append(item['desc']['text'])
+         if offset == "":
+             # An empty offset marks the last page.
+             break
+         params = {
+             'id': dynamic_id,
+             'offset': offset
+         }
+
+     return all_user_name, all_user_mid, all_user_text
+
+ def get_lucky_man(num, lucky_num):
+     """Return the first ``lucky_num`` indices of a shuffled ``range(num)``.
+
+     The shuffle uses the module-level ``random`` state, so seeding it
+     beforehand makes the draw repeatable.
+     """
+     candidates = list(range(num))
+     random.shuffle(candidates)
+     return candidates[:lucky_num]
+
+ def get_local_time():
+     """Return the current local time formatted as ``[HH:MM:SS]``."""
+     clock = time.strftime('%H:%M:%S', time.localtime())
+     return "[{}]".format(clock)
+
+ if __name__ == "__main__":
+     # Script entry point: fetch everyone who forwarded the dynamic, draw the
+     # winners and print each winner with the prize assigned to them.
+     print ("+----------------------------------------+")
+     print (" |动态转发抽奖助手 by Jack Cui|")
+     print ("+----------------------------------------+")
+     # Link of the dynamic; change it to your own.
+     url = "https://t.bilibili.com/675922191916728342"
+     print (get_local_time() + " 正在获取转发数据中......")
+
+     awards = [
+         "动手深度学习",
+         "机器学习公式详解",
+         "Easy RL 强化学习教程",
+         "数学之美",
+         "浪潮之巅 第四版",
+         "C Primer Plus(第6版)中文版"
+     ] * 5
+
+     # Seed the generator so the draw is reproducible; the seed is
+     # forwards + comments + likes.
+     random.seed(1462 + 213 + 399)
+     random.shuffle(awards)
+
+     dynamic_id = get_dynamic_id(url)
+     all_user_name, all_user_mid, all_user_text = get_uses(dynamic_id)
+
+     # One winner per prize, so the two counts cannot drift apart.
+     top30_shuffle_id = get_lucky_man(len(all_user_name), len(awards))
+     print (get_local_time() + " 中奖用户信息:\n")
+     for idx, id_ in enumerate(top30_shuffle_id):
+         print("用户名:{}".format(all_user_name[id_]))
+         print("用户主页:{}".format("https://space.bilibili.com/" + str(all_user_mid[id_])))
+         print("转发内容:{}".format(all_user_text[id_]))
+         print("获得奖品:{}".format(awards[idx]))
+         print("*" * 50)
diff --git a/douyin/README.md b/douyin/README.md
index 4e26f5ba..56545fb2 100644
--- a/douyin/README.md
+++ b/douyin/README.md
@@ -14,4 +14,6 @@
python douyin.py
-关于重新链接次数: 用户视频通常重新链接30次以内会成功,而收藏视频目前链接成功机率极低,当然有耐心也能等他成功为止。。
+签名服务来源:https://github.com/coder-fly/douyin-signature(.+?)<\/p>') nickname = _nickname_re.search(share_user.text).group(1) - urllib.request.urlretrieve('https://raw.githubusercontent.com/Jack-Cherish/python-spider/master/douyin/fuck-byted-acrawler.js', 'fuck-byted-acrawler.js') - try: - Popen(['node', '-v'], stdout=PIPE, stderr=PIPE).communicate() - except (OSError, IOError) as err: - print('请先安装 node.js: https://nodejs.org/') - sys.exit() - user_url_prefix = 'https://www.douyin.com/aweme/v1/aweme/favorite' if type_flag == 'f' else 'https://www.douyin.com/aweme/v1/aweme/post' + data = { + 'tac': tac.split('|')[0], + 'user_id': user_id, + } + req = requests.post(sign_api, data=data) + while req.status_code != 200: + req = requests.post(sign_api, data=data) + sign = req.json().get('signature') + user_url_prefix = 'https://www.iesdouyin.com/web/api/v2/aweme/like' if type_flag == 'f' else 'https://www.iesdouyin.com/web/api/v2/aweme/post' print('解析视频链接中') while has_more != 0: - process = Popen(['node', 'fuck-byted-acrawler.js', str(user_id)], stdout=PIPE, stderr=PIPE) - _sign = process.communicate()[0].decode().strip('\n').strip('\r') - user_url = user_url_prefix + '/?user_id=%s&max_cursor=%s&count=21&aid=1128&_signature=%s&dytk=%s' % (user_id, max_cursor, _sign, dytk) + user_url = user_url_prefix + '/?user_id=%s&sec_uid=&count=21&max_cursor=%s&aid=1128&_signature=%s&dytk=%s' % (user_id, max_cursor, sign, dytk) req = requests.get(user_url, headers=self.headers) while req.status_code != 200: req = requests.get(user_url, headers=self.headers) html = json.loads(req.text) - try: - while html['aweme_list'] == []: - i = i + 1 - sys.stdout.write('已重新链接' + str(i) + '次 (若超过100次,请ctrl+c强制停止再重来)' + '\r') - sys.stdout.flush() - process = Popen(['node', 'fuck-byted-acrawler.js', str(user_id)], stdout=PIPE, stderr=PIPE) - _sign = process.communicate()[0].decode().strip('\n').strip('\r') - user_url = user_url_prefix + 
'/?user_id=%s&max_cursor=%s&count=21&aid=1128&_signature=%s&dytk=%s' % (user_id, max_cursor, _sign, dytk) - req = requests.get(user_url, headers=self.headers) - while req.status_code != 200: - req = requests.get(user_url, headers=self.headers) - html = json.loads(req.text) - except: - pass - i = 0 for each in html['aweme_list']: try: - url = 'https://aweme.snssdk.com/aweme/v1/play/?video_id=%s&line=0&ratio=720p&media_type=4&vr_type=0&test_cdn=None&improve_bitrate=0' - uri = each['video']['play_addr']['uri'] - video_url = url % uri + url = 'https://aweme.snssdk.com/aweme/v1/play/?video_id=%s&line=0&ratio=720p&media_type=4&vr_type=0&improve_bitrate=0&is_play_url=1&is_support_h265=0&source=PackSourceEnum_PUBLISH' + vid = each['video']['vid'] + video_url = url % vid except: continue - share_desc = each['share_info']['share_desc'] + share_desc = each['desc'] if os.name == 'nt': for c in r'\/:*?"<>|': nickname = nickname.replace(c, '').strip().strip('\.') @@ -97,7 +79,8 @@ def get_video_urls(self, user_id, type_flag='f'): video_names.append(share_id + '.mp4') else: video_names.append(share_id + '-' + share_desc + '.mp4') - share_urls.append(each['share_info']['share_url']) + share_url = 'https://www.iesdouyin.com/share/video/%s' % share_id + share_urls.append(share_url) video_urls.append(video_url) max_cursor = html['max_cursor'] has_more = html['has_more'] @@ -133,7 +116,7 @@ def video_downloader(self, video_url, video_name, watermark_flag=False): """ size = 0 video_url = self.get_download_url(https://github.com/video_url,%20watermark_flag=watermark_flag) - with closing(requests.get(video_url, headers=self.headers, stream=True)) as response: + with closing(requests.get(video_url, headers=self.headers1, stream=True)) as response: chunk_size = 1024 content_size = int(response.headers['content-length']) if response.status_code == 200: @@ -157,9 +140,9 @@ def run(self): None """ self.hello() - 
print('搜索api需要登录,暂时使用UID下载\n分享用户页面,用浏览器打开短链接,原始链接中/share/user/后的数字即是UID') - user_id = input('请输入ID (例如95006183):') - user_id = user_id if user_id else '95006183' + print('UID取得方式:\n分享用户页面,用浏览器打开短链接,原始链接中/share/user/后的数字即是UID') + user_id = input('请输入UID (例如60388937600):') + user_id = user_id if user_id else '60388937600' watermark_flag = input('是否下载带水印的视频 (0-否(默认), 1-是):') watermark_flag = watermark_flag if watermark_flag!='' else '0' watermark_flag = bool(int(watermark_flag)) diff --git a/geetest.py b/geetest.py index 929063c6..e78fc867 100644 --- a/geetest.py +++ b/geetest.py @@ -75,7 +75,7 @@ def save_full_bg(driver, full_bg_path="fbg.png", full_bg_class="geetest_canvas_f class Crack(): def __init__(self,keyword): - self.url = 'http://bj.gsxt.gov.cn/sydq/loginSydqAction!sydq.dhtml' + self.url = '*' self.browser = webdriver.Chrome('D:\\chromedriver.exe') self.wait = WebDriverWait(self.browser, 100) self.keyword = keyword diff --git a/zhengfang_system_spider/README.md b/zhengfang_system_spider/README.md index 36d9d187..29eb71aa 100644 --- a/zhengfang_system_spider/README.md +++ b/zhengfang_system_spider/README.md @@ -1,5 +1,5 @@ # ZhengFang_System_Spider -对正方教务管理系统个人课表,学生成绩,绩点等简单爬取 +对正方教务管理系统的个人课表,个人学生成绩,绩点等简单爬取 ## 依赖环境 python 3.6 diff --git a/zhengfang_system_spider/requirements.txt b/zhengfang_system_spider/requirements.txt index b136a831..522810d0 100644 --- a/zhengfang_system_spider/requirements.txt +++ b/zhengfang_system_spider/requirements.txt @@ -1,4 +1,4 @@ -lxml==4.2.1 +lxml==4.6.3 requests==2.20.0 -Pillow==5.2.0 +Pillow>=6.2.2 beautifulsoup4==4.6.0