多线程下载sis001的网友自拍贴图版面的图片-CSDN博客

本程序使用多线程下载 sis001 网友自拍贴图版面的图片。它不会下载整个版面，而是只下载第二页的内容，因为第二页是最新的内容。你需要一个 sis001 的 5 级以上账号（用户名和密码），否则无法访问此版面。

#!/usr/bin/python
# -*- coding: cp936 -*-
#coding utf-8


import urllib
import urllib2
import re
import cookielib
import Queue
import threading


def downPic(tiezi_url):
    """Fetch one thread page from the global queue ``q`` and save its JPEGs.

    The page URL is taken from ``q`` rather than from ``tiezi_url``: the
    caller in this file passes the same value to every thread, so the
    parameter is kept only for interface compatibility.

    Images are written to the current directory under the last path
    component of their URL. A failed image download is reported on stdout
    and the remaining images are still attempted.
    """
    try:
        # Non-blocking get: a surplus worker must exit instead of waiting
        # forever on an empty queue (which would keep the process alive).
        page_url = q.get(False)
    except Queue.Empty:
        return
    req = urllib2.Request(page_url, None, headers)
    tiezi_html = opener.open(req).read()
    # Either an absolute http:// link or a forum-relative attachments/ path.
    re_img = re.compile(r'\<img src\=\"(http\:\/\/.*?\.jpg|attachments\/.*?\.jpg)\"')
    for i in re_img.findall(tiezi_html):
        if re.match("http", i):
            img_url = i
            print("%s downloading..." % i)
        else:
            # Relative attachment paths are resolved against the forum root.
            img_url = "http://38.103.161.185/forum/%s" % i
            print("inner link %s" % img_url)
        filename = re.split(r'/', img_url)
        try:
            req = urllib2.Request(img_url, None, headers)
            res = opener.open(req).read()
            with open(filename[-1], 'wb') as out:
                out.write(res)
        except Exception as err:
            # Best effort: report the failure and carry on with the next image.
            print("failed %s: %s" % (img_url, err))



# Cookie-aware opener so the login session is reused by every later request.
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
#urllib2.install_opener(opener)
headers ={"User-agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"}
# Login form fields. The two hashed keys are presumably the username and
# password fields -- replace the "xxxx" placeholders with real values.
# NOTE(review): formhash is hard-coded; confirm the forum accepts a fixed value.
data =  {
"formhash":"3fec4925",
"referer":"index.php",
"loginfield":"username",
"240aa46b3893fb57c436c0a3785b61e7":"xxxx",
"ea32b1cadbde4b66ca614e0bb593d1c9":"xxxx",
"questionid":"0",
"answer":"",
"cookietime":"2592000",
"loginmode":"",
"styleid":"",
"loginsubmit":"true"}
post_data = urllib.urlencode(data)
req = urllib2.Request("http://38.103.161.185/forum/logging.php?action=login&",post_data,headers)
content=opener.open(req)
#print content.read()
# Fetch page 2 of board 62 and collect the thread links it contains.
req2 = urllib2.Request("http://38.103.161.185/forum/forum-62-2.html",None,headers)
board_html = opener.open(req2).read()
#print board_html
re_link = re.compile(r'\<a href\=\"(thread-\d{7}-1-2\.html)')
title_list = re_link.findall(board_html)
#http://38.103.161.185/forum/thread-(4917300-1-1.html)
#http://38.103.161.185/forum/forum-62-(2).html
#[\u4e00-\u9fa5]
#print title_list
# Unbounded queue: every URL is enqueued before any consumer starts, so a
# size limit could block put() forever.
q = Queue.Queue(0)
thread_urls = []
for i in title_list:
    tiezi_url = "http://38.103.161.185/forum/%s"%i
    #print tiezi_url
    q.put(tiezi_url)
    thread_urls.append(tiezi_url)
print("total title:%s" % q.qsize())
# Start exactly one worker per queued URL (polling q.qsize() in a loop would
# spawn threads without bound), then wait for all of them to finish.
workers = []
for tiezi_url in thread_urls:
    th = threading.Thread(target=downPic,args=(tiezi_url,) )
    th.start()
    workers.append(th)
for th in workers:
    th.join()

=====

更高效的版本：

#!/usr/bin/python
# -*- coding: cp936 -*-
#coding utf-8


import urllib
import urllib2
import re
import cookielib
import Queue
import threading
import socket
import time
import sys
import random

def log(message):
    """Append ``message`` to log.txt, prefixed with the current time.

    The file is opened in append mode in the current working directory and
    is closed even if the write fails.
    """
    with open("log.txt", "a") as log_file:
        log_file.write(time.ctime() + " " + message + "\n")
	
def getPic():
    """Download one image reference taken from the global queue ``q``.

    An entry starting with "http" is fetched as is; anything else is treated
    as a path relative to the forum root. The data is saved in the img
    folder under the original file name followed by a random number and
    ".jpg", so equal names from different posts do not overwrite each other.
    Any failure is written to the log via ``log`` and otherwise ignored.
    """
    try:
        # Non-blocking get: a surplus worker must exit instead of waiting
        # forever on an empty queue (which would keep the process alive).
        i = q.get(False)
    except Queue.Empty:
        return
    if re.match("http", i):
        img_url = i
    else:
        img_url = "http://38.103.161.185/forum/%s" % i
    filename = re.split(r'/', img_url)
    try:
        req = urllib2.Request(img_url, None, headers)
        res = opener.open(req).read()
        savefile = '.\\img\\'+filename[-1]+ str(int(random.random()*100000000))+'.jpg'
        with open(savefile, 'wb') as out:
            out.write(res)
    except Exception as err:
        # Best effort: record which entry failed and why, then give up on it.
        errormsg = i + "||" + str(type(err)) + "||" + str(err)
        log(errormsg)
	

def downPic(tiezi_url,q):
    """Download every JPEG referenced by one thread page.

    The page at ``tiezi_url`` is fetched, its distinct image references are
    put on ``q``, and one ``getPic`` thread is started per queued entry.
    The call returns once all of those threads have finished.

    ``getPic`` reads the module-level ``q``, so the caller is expected to
    pass that same queue object here.
    """
    req = urllib2.Request(tiezi_url, None, headers)
    tiezi_html = opener.open(req).read()
    #print tiezi_html
    # Either an absolute http:// link or a forum-relative attachments/ path.
    re_img = re.compile(r'\<img src\=\"(http\:\/\/.*?\.jpg|attachments\/.*?\.jpg)\"')
    # set() drops references that appear more than once on the page.
    for i in set(re_img.findall(tiezi_html)):
        q.put(i)
    # Start exactly one worker per queued entry. Polling q.qsize() in a loop
    # would spawn threads without bound, and the surplus ones never finish.
    workers = []
    for _ in range(q.qsize()):
        th = threading.Thread(target=getPic)
        th.start()
        workers.append(th)
    # Wait so the caller only continues once this page is fully handled.
    for th in workers:
        th.join()


headers ={"User-agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"}
# Applies to every socket opened afterwards, so a stalled download gives up
# after 30 seconds instead of blocking its thread indefinitely.
socket.setdefaulttimeout(30)
# Cookie-aware opener so the login session is reused by every later request.
cj = cookielib.CookieJar()
#proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8087'})
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
#urllib2.install_opener(opener)
# Login form fields. The two hashed keys are presumably the username and
# password fields -- replace the "xxx" placeholders with real values.
# NOTE(review): formhash is hard-coded; confirm the forum accepts a fixed value.
data =  {
    "formhash":"3fec4925",
    "referer":"index.php",
    "loginfield":"username",
    "240aa46b3893fb57c436c0a3785b61e7":"xxx",
    "ea32b1cadbde4b66ca614e0bb593d1c9":"xxx",
    "questionid":"0",
    "answer":"",
    "cookietime":"2592000",
    "loginmode":"",
    "styleid":"",
    "loginsubmit":"true"}
post_data = urllib.urlencode(data)
req = urllib2.Request("http://38.103.161.185/forum/logging.php?action=login&",post_data,headers)
content=opener.open(req)
#print content.read()
# Fetch page 1 of board 62 and collect the thread links it contains.
req2 = urllib2.Request("http://38.103.161.185/forum/forum-62-1.html",None,headers)
board_html = opener.open(req2).read()
#print board_html
re_link = re.compile(r'\<a href\=\"(thread-\d{7}-1-\d{1}\.html)')
title_list = re_link.findall(board_html)
title_list = list(set(title_list)) # remove duplicate entries from the list
#http://38.103.161.185/forum/thread-(4917300)-1-(1).html
#http://38.103.161.185/forum/forum-62-(2).html
#[\u4e00-\u9fa5]
#print title_list

# One queue for the whole run: getPic threads read the global name q, so it
# should not be rebound while workers may still be running.
q = Queue.Queue(0)
for i in title_list:
    tiezi_url = "http://38.103.161.185/forum/%s"%i
    print(tiezi_url)
    downPic(tiezi_url,q)

print('All threads terminate!')