
Commit b07ea4c

Author: linjun
Commit message: Finished batch-checking domain-name validity using the proxy pool.
1 parent e442974 commit b07ea4c

File tree

4 files changed, +232 −29 lines changed


.gitignore

Lines changed: 2 additions & 1 deletion
@@ -1 +1,2 @@
-.idea
+.idea
+*.pyc

domainChecker.py

Lines changed: 96 additions & 14 deletions
@@ -4,20 +4,75 @@
 import sys
 import array
 
+import msvcrt
+
+from utils.proxyManager import ProxyManager
 from utils.utils import *
 from multiprocessing.dummy import Pool as ThreadPool
 
-DOMAIN_NAMES_FILE = "DomainNamesFile.txt"
-DomainCheckResult = "DomainCheckResult.txt"
-
+man = ProxyManager()
+class config:
+    MAX_THREAD_COUNT = 30
+    FailedRetryTimes = 20
+    DomainNamesList = "DomainNamesList.txt"
+    resultFilename = "DomainCheck_result.txt"
 
-def checkDomain(name):
+def checkWithBaiduWhois(name):
+    # http://whois.bj.baidubce.com/whois?format=javascript&domain=whatthesdfksdjf.com
+    domainName = name + ".com"
+    url = r"http://whois.bj.baidubce.com/whois?format=javascript&domain=" + domainName
+    # url = r"https://checkapi.aliyun.com/check/checkdomain?callback=result&domain=azzzz.shop&token=check-web-hichina-com%3A7s88yvbgwterj87mk5llo6k6owagkw4u"
+    return GetUrlContent2(url, proxies=man.popProxies())
+def checkWithBaiduApi(name):
+    # https://cloud.baidu.com/api/bcd/search/status
+    # {"domainNames":[{"label":"sz","tld":"com"}]}
+    domainName = name
+    url = r"https://cloud.baidu.com/api/bcd/search/status"
+
+    header = {
+        'Connection': 'keep-alive',
+        'Content-Length': '44',
+        'Pragma': 'no-cache',
+        'Cache-Control': 'no-cache',
+        'Accept': 'application/json, text/javascript, */*; q=0.01',
+        'Origin': 'https://cloud.baidu.com',
+        'X-Requested-With': 'XMLHttpRequest',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
+        'Content-Type': 'application/json',
+        'Referer': 'https://cloud.baidu.com/product/bcd/search.html?keyword=sz',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
+        'Cookie': 'BAIDUID=ABDF12D91049CBD5AA0895DAFCA1144D:FG=1; PSTM=1491968808; BIDUPSID=D0C77823B4F11112E550D605C212D8D9; BDUSS=DVpUlk5eUZnWmdkM09WUHE3VmJQNVhCekY2bE9Wd0tRNmFBMnpCR3RKQ202d3RhTVFBQUFBJCQAAAAAAAAAAAEAAADG3h0rY29kZXJsaW4AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKZe5FmmXuRZW; MCITY=-340%3A; BCLID=10955523308864486684; BDSFRCVID=5G_sJeC62Ra1LkRAhCCwhM3RN2K2PtoTH6aopK_Jf9Dcn3ODE-HlEG0PJU8g0KubVI2-ogKK3gOTH4nP; H_BDCLCKID_SF=tbAjVIPaf-3bfTrTKb55-P_3qxby26nfMmJeaJ5nJD_BSl6q5TJNqJ8Vjf7uJbkjWgcXKqTOQpP-HqTYLp3b2h-ghJoKah5eKjREKl0MLPbYbb0xyn_VKP_bDfnMBMPe52OnaIb8LIF-MK8xejK2en-W5gT0tC62aKDX3buQJlIMqpcNLTDK2Mty2R393CrWBmr32COytn5ZjnTIylO1j4_e3bjw54vmWmO0bRIEtfTbJh5jDh3Ub6ksD-Rte4on-6Ry0hvctb3cShPmhl00Djb-jN0qJ6FsKKJ03bk8KRREJt5kq4bohjn0QnneBtQmJJrN3Cj12MoNjhOJ5P7YDpDND44HQn3yQg-q3R7MWM7keCTe-PI5XU0kqt7Q0x-jLgQPVn0MW-5DSlI4qtnJyUnybPnnBT3XLnQ2QJ8BJDtKMCQP; PSINO=6; H_PS_PSSID=25245_1423_13289_21097_20697_25439_25178_20719; CAMPAIGN_TRACK=cp%3Aaladdin%7Ckw%3A139; CAMPAIGN_TRACK_TIME=2017-12-23+18%3A15%3A11; Hm_lvt_28a17f66627d87f1d046eae152a1c93d=1513070462,1514024132; Hm_lpvt_28a17f66627d87f1d046eae152a1c93d=1514024141',
+    }
+    data = "{\"domainNames\":[{\"label\":\"" + domainName + "\",\"tld\":\"com\"}]}"
+    return PostUrlWithDataAndHeader(url, data, header)
+
+
+def checkWithAliyunApi(name):
     domainName = name + ".com"
-    reprint("checking " + domainName + " ...")
     url = r"https://checkapi.aliyun.com/check/checkdomain?callback=result&domain=" + domainName + r"&token=check-web-hichina-com%3A7s88yvbgwterj87mk5llo6k6owagkw4u"
     # url = r"https://checkapi.aliyun.com/check/checkdomain?callback=result&domain=azzzz.shop&token=check-web-hichina-com%3A7s88yvbgwterj87mk5llo6k6owagkw4u"
-    response = GetUrlContent2(url)
-    appendStrToFile(response + "\n", DomainCheckResult)
+    return GetUrlContent2(url)
+
+
+def checkDomain(name):
+    reprint("checking " + name + " ...")
+    retryCount = 0
+    while True:
+        try:
+            # response = checkWithAliyunApi(name)
+            # response = checkWithBaiduApi(name)
+            response = checkWithBaiduWhois(name)
+            if "\"status\":0" in response:
+                break
+        except BaseException as e:
+            pass
+        retryCount += 1
+        if retryCount > config.FailedRetryTimes:
+            response = "Failed:" + e.message
+            break
+        reprint("checking %s ... retry #%d" % (name, retryCount))
+    loglToFile("%s\t%d\t--->\t" % (name, retryCount) + response, config.resultFilename)
     # logl(response)
     return "ok"
 
@@ -38,8 +93,8 @@ def strPlusOne(t):
 
 
 def BuildDomainList(fileName):
-    startName = 'aaaaa'
-    endName = 'zzzzz'
+    startName = 'aa'
+    endName = 'zz'
     # nameList = []
     logl("Building name list:%s to %s..." % (startName, endName))
     while startName != endName:

@@ -61,17 +116,42 @@ def LoadDomainList(fileName):
 
 
 def CheckAllDomains(nameList):
-    pool = ThreadPool()
+    pool = ThreadPool(config.MAX_THREAD_COUNT)
+    logl(pool._processes)
     pool.map(checkDomain, nameList)
     pool.close()
     pool.join()
     logl('All done.')
 
 
+def usage():
+    logl("usage:%s DOMAIN_NAMES_FILE")
+
+
+def raise_test():
+    raise BaseException("test")
+
+
+def test():
+    try:
+        raise_test()
+    except BaseException as a:
+        logl("Exception:" + a.message)
+
+
 def main():
-    logl("Loading domain names from %s .." % DOMAIN_NAMES_FILE)
-    nameList = LoadDomainList(DOMAIN_NAMES_FILE)
-    logl("%d names loaded."%len(nameList))
+    # test()
+    # return
+    logl("test end.")
+    if len(sys.argv) == 2:
+        config.DomainNamesListFile = sys.argv[1]
+        config.resultFilename = "result_" + config.DomainNamesListFile
+    else:
+        usage()
+        return
+    logl("Loading domain names from %s .." % config.DomainNamesListFile)
+    nameList = LoadDomainList(config.DomainNamesListFile)
+    logl("%d names loaded." % len(nameList))
     logl("Checking...")
     CheckAllDomains(nameList)
     logl("All done.")

@@ -80,4 +160,6 @@ def main():
 if __name__ == '__main__':
     # BuildDomainList(DOMAIN_NAMES_FILE)
     main()
-    # checkDomain("aaaaa")
+    logl("<<")
+    msvcrt.getch()
+    # checkDomain("as")
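
Note on the new retry flow in checkDomain(): it keeps calling the Baidu whois endpoint through a fresh proxy from ProxyManager until the response contains "status":0 or config.FailedRetryTimes is exceeded. A minimal standalone sketch of that pattern, with illustrative names (retry_check, check_once, max_retries) that are not part of this commit:

def retry_check(name, check_once, max_retries=20):
    # check_once would be something like checkWithBaiduWhois; each call is
    # expected to go out through a different proxy and may raise on failure.
    last_error = None
    for attempt in range(max_retries + 1):
        try:
            response = check_once(name)
            if '"status":0' in response:   # same success test used by checkDomain()
                return response
        except BaseException as e:         # bad proxy or network error: try again
            last_error = e
    return "Failed:" + str(last_error)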

utils/proxyManager.py

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+import json
+import threading
+from utils import *
+
+SingletonLock = threading.Lock()
+
+
+class Singleton(object):
+    # define the static instance variable
+    __instance = None
+
+    def __init__(self):
+        pass
+
+    def __new__(cls, *args, **kwargs):
+        if not cls.__instance:
+            try:
+                SingletonLock.acquire()
+                # double check
+                if not cls.__instance:
+                    cls.__instance = super(Singleton, cls).__new__(cls, *args, **kwargs)
+            finally:
+                SingletonLock.release()
+        return cls.__instance
+
+
+class ProxyManager(Singleton):
+    __first_init = True
+
+    def __init__(self):
+        if self.__first_init:
+            self.__first_init = False
+            self.proxyLock = threading.Lock()
+            self.proxies = []
+            self.proxy_pool_url = "http://127.0.0.1:8000"
+            self.refresh()
+            logl("init..")
+            self.count = 0
+
+    def refresh(self):
+        response = GetUrlContent2(self.proxy_pool_url)
+        self.proxies = json.loads(response)
+
+    def pop(self):
+        proxy = None
+        try:
+            self.proxyLock.acquire()
+            if len(self.proxies) < 1:
+                self.refresh()
+            proxy = self.proxies.pop(0)
+            self.proxies.append(proxy)
+            # self.count += 1
+            # logl(self.count)
+
+        finally:
+            self.proxyLock.release()
+        return proxy
+
+    def popProxies(self):
+        proxy = self.pop()
+        proxies = {"http": "http://%s:%s" % (proxy[0], proxy[1]), "https": "http://%s:%s" % (proxy[0], proxy[1]), }
+        return proxies
+
+
+man = ProxyManager()
+
+
+def test_singleton_in_thread():
+    # logl(id(man))
+    tp = man.popProxies()
+    logl(str(tp))
+    try:
+        response = GetUrlContent2("http://pv.sohu.com/cityjson", proxies=tp)
+    except BaseException as e:
+        logl(e.message)
+        response = str(e.message)
+    logl("proxies:%s result:%s" % (str(tp), response))
+
+
+if __name__ == "__main__":
+    man = ProxyManager()
+    threadList = []
+    for i in range(100):
+        t = threading.Thread(target=test_singleton_in_thread, args=[])
+        t.setDaemon(True)
+        t.start()
+        threadList.append(t)
+    for th in threadList:
+        th.join()
+    logl("all threads end.")

utils/utils.py

Lines changed: 43 additions & 14 deletions
@@ -8,6 +8,8 @@
 
 import chardet
 import sys
+reload(sys)
+sys.setdefaultencoding('utf8')
 
 import requests
 from requests import ConnectionError

@@ -49,6 +51,7 @@
     "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
 ]
 
+
 def appendStrToFile(sstr, filepath):
     f = open(filepath, "a+")
     try:

@@ -62,7 +65,7 @@ def appendStrToFile(sstr, filepath):
 def logToFile(msg, filepath):
     prefix = "[" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "]"
     logtext = prefix + "\t" + msg
-    appendStrToFile(msg, filepath)
+    appendStrToFile(logtext, filepath)
 
 
 def loglToFile(msg, filepath):

@@ -97,13 +100,10 @@ def logldebug(msg):
 
 
 def logl(msg):
-    msg=str(msg)
+    msg = str(msg)
     log("[.] " + msg + "\n")
 
 
-
-
-
 def GetEncoding(data):
     chardit1 = chardet.detect(data)
     encoding = chardit1['encoding']

@@ -201,27 +201,31 @@ def writeFile(filename, data):
     f.write(data)
     f.close()
 
+
 def getFileMd5(filepath):
     return DoCmd("md5sum " + filepath + "|grep -oE '[a-zA-Z0-9]{32}'").strip()
     pass
 
+
 def reprint(msg):
-    sys.stdout.write("\r" + msg )
+    sys.stdout.write("\r" + msg)
 
 
-def urlencode( val):
+def urlencode(val):
     if isinstance(val, unicode):
         return urllib.quote(str(val), safe='/:?=')
     return urllib.quote(val, safe='/:?=')
 
-def GetUrlContent( url):
+
+def GetUrlContent(url):
     request = urllib2.Request(urlencode(url))
     response = urllib2.urlopen(request)
     page = response.read()
     logl(page)
     encoding = GetEncoding(page)
     return page.decode(encoding)
 
+
 def get_header():
     return {
         'User-Agent': random.choice(USER_AGENTS),

@@ -231,12 +235,37 @@ def get_header():
         'Accept-Encoding': 'gzip, deflate',
     }
 
+
 TIMEOUT = 5
-def GetUrlContent2(url):
-    r = requests.get(url=url, headers=get_header(), timeout=TIMEOUT)
-    r.encoding = chardet.detect(r.content)['encoding']
+
+
+def GetUrlContent2(url, proxies=None):
+    try:
+        if proxies is None:
+            r = requests.get(url=url, headers=get_header(), timeout=TIMEOUT)
+        else:
+            r = requests.get(url=url, headers=get_header(), timeout=TIMEOUT, proxies=proxies)
+        r.encoding = chardet.detect(r.content)['encoding']
+    except Exception as e:
+        raise BaseException("#ConnectionException:" + str(e.message))
+    if not r.ok:
+        # raise ConnectionError
+        raise BaseException("#ConnectionError:" + str(r.reason))
+    else:
+        return r.text
+
+
+def PostUrlWithDataAndHeader(url, data, header, proxies=None):
+    try:
+        if proxies is None:
+            r = requests.post(url=url, data=data, headers=header, timeout=TIMEOUT)
+        else:
+            r = requests.post(url=url, data=data, headers=header, timeout=TIMEOUT, proxies=proxies)
+        r.encoding = chardet.detect(r.content)['encoding']
+    except Exception as e:
+        raise BaseException("#ConnectionException:" + e.message)
     if not r.ok:
-        #raise ConnectionError
-        return "#ConnectionError"
+        # raise ConnectionError
+        raise BaseException("#ConnectionError" + str(r.reason))
     else:
-        return r.text
+        return r.text
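
GetUrlContent2() and PostUrlWithDataAndHeader() now raise BaseException for both transport failures and non-OK HTTP responses instead of returning an error string, so callers are expected to wrap them in try/except. A short usage sketch under that assumption (the URL is illustrative, not from the repository):

from utils.utils import GetUrlContent2
from utils.proxyManager import ProxyManager

man = ProxyManager()
try:
    body = GetUrlContent2("http://example.com/", proxies=man.popProxies())
    print(body[:80])
except BaseException as e:  # raised on connection errors and non-2xx replies alike
    print("request failed: %s" % e)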
