Commit dacd344

lots of fixes
1 parent 6fdbc75 commit dacd344

3 files changed: +61 -45 lines


PyCrawler.py

Lines changed: 35 additions & 22 deletions
@@ -1,7 +1,7 @@
 from query import CrawlerDb
 from content_processor import ContentProcessor
 from settings import VERBOSE
-import sys, urlparse, urllib2
+import sys, urlparse, urllib2, robotparser

 # ===== Init stuff =====

@@ -12,6 +12,9 @@
 # content processor init
 processor = ContentProcessor(None, None, None)

+# robot parser init
+robot = robotparser.RobotFileParser()
+
 if len(sys.argv) < 2:
     print "Error: No start url was passed"
     sys.exit()
@@ -22,52 +25,62 @@

 def crawl():
     print "starting..."
-    queue_empty = False
     while True:
         url = cdb.dequeue()
-        print url
+        u = urlparse.urlparse(url)
+        robot.set_url('http://'+u[1]+"/robots.txt")
+        if not robot.can_fetch('PyCrawler', url):
+            print "Url disallowed by robots.txt: %s " % url
+            continue
+        if not url.startswith('http'):
+            print "Unfollowable link found at %s " % url
+            continue
+
         if cdb.checkCrawled(url):
             continue
         if url is False:
-            queue_empty = True
-
-        # Get HTTPConnection
-        #connection = httplib.HTTPConnection(parsed_url.netloc)
-        # Make the request
-        #connection.request("GET", parsed_url.path)
-        # Get response
-        #response = connection.getresponse()
-        #data = response.read()
+            break
         status = 0
         request = None
         try:
             request = urllib2.urlopen(str(url))
         except urllib2.URLError, e:
-            print e.reason
+            print e
+            print "Exception at url: %s" % url
+            continue
         except urllib2.HTTPError, e:
             status = e.code
         if status == 0:
             status = 200
         data = request.read()
+        processor.setInfo(str(url), status, data)
+        ret = processor.process()
+        if status != 200:
+            continue
+        add_queue = []
+        for q in ret:
+            if not cdb.checkCrawled(q):
+                add_queue.append(q)

+        l = len(add_queue)
         if VERBOSE:
             print "Got %s status from %s" % (status, url)
-        processor.setInfo(str(url), status, data)
-        add_queue = processor.process()
-        l = len(add_queue)
-        print "Found %i links" % l
+            print "Found %i links" % l
         if l > 0:
-            if queue_empty == True:
-                queue_empty = False
             cdb.enqueue(add_queue)
         cdb.addPage(processor.getDataDict())
         processor.reset()
-        if queue_empty:
-            break

     print "finishing..."
     cdb.close()
     print "done! goodbye!"

 if __name__ == "__main__":
-    crawl()
+    try:
+        crawl()
+    except KeyboardInterrupt:
+        print "Stopping"
+        sys.exit()
+    except Exception, e:
+        print "EXCEPTION: %s " % e
+

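Note on the robots.txt check added above: it relies on Python 2's robotparser module. As a point of reference, here is a minimal standalone sketch of that flow; the URL and the 'PyCrawler' user-agent string are placeholders, and it includes the read() call that downloads and parses the rules, which can_fetch() consults.

import urlparse, robotparser

def allowed(url, agent='PyCrawler'):
    # Point the parser at the site's robots.txt, derived from the URL's host
    parts = urlparse.urlparse(url)
    rp = robotparser.RobotFileParser()
    rp.set_url(parts.scheme + '://' + parts.netloc + '/robots.txt')
    rp.read()  # fetch and parse the rules before asking can_fetch()
    return rp.can_fetch(agent, url)

if __name__ == '__main__':
    print allowed('http://example.com/some/page')  # placeholder URL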
content_processor.py

Lines changed: 23 additions & 19 deletions
@@ -12,16 +12,17 @@ def rankKeywords(text):
         if t in invalid_keywords:
             continue
         if not ranks.has_key(t):
-            print "adding %s" % t
+            #print "adding %s" % t
             ranks[t] = 1
         else:
             ranks[t] += 1
-            print "setting %s to %i" % (t, ranks[t])
+            #print "setting %s to %i" % (t, ranks[t])
     return ranks

 def stripPunctuation(text):
     pattern = re.compile(r'[^\w\s]')
     return pattern.sub('', text)
+
 class ContentProcessor:

     def __init__(self, url, status, text):
@@ -67,38 +68,38 @@ def combineKeywordLists(self):
         for k,v in l.items():
             if self.keywords.has_key(k):
                 self.keywords[k] += v
-                print "setting %s to %i" %(k,self.keywords[k])
+                #print "setting %s to %i" %(k,self.keywords[k])
             else:
                 self.keywords[k] = v
-                print "setting %s to %i" %(k,v)
+                #print "setting %s to %i" %(k,v)

     # returns links to queue
     def processBody(self):
         queue = ready_queue(self.url, self.body)
-        print "found %i links to queue" % len(queue)
+        #print "found %i links to queue" % len(queue)
         self.text = stripPunctuation(self.remove_html_tags(self.body))
         if len(self.text) > 5000:
             offset = 0
             i = 0
             l = []
-            print "splitting text"
+            #print "splitting text"
             while True:
                 j = self.findnth(self.text[i:],' ',500)
                 offset += j
-                print "SPLIT: 500th space at %i" % j
+                #print "SPLIT: 500th space at %i" % j
                 if j == -1:
-                    print "appending from %i on" % i
+                    #print "appending from %i on" % i
                     l.append(self.text[i:])
                     break
-                print "appending from %i to %i" % (i,j)
+                #print "appending from %i to %i" % (i,j)
                 l.append(self.text[i:j])
                 i = offset + j+1
-            print "processing with %i threads" % len(l)
+            #print "processing with %i threads" % len(l)
             pool = Pool(processes=(len(l)))
             self.keyword_dicts = pool.map(rankKeywords, l)
-            print "processed, returned %i dicts" % len(self.keyword_dicts)
+            #print "processed, returned %i dicts" % len(self.keyword_dicts)
         else:
-            self.keyword_dicts.append(self.rankKeywords(self.text))
+            self.keyword_dicts.append(rankKeywords(self.text))
         return queue

     def processHead(self):
@@ -117,20 +118,23 @@ def findnth(self, haystack, needle, n):
     # returns the queue from processBody
     def process(self):
         text_lower = self.text.lower()
-        print "Finding title"
+        #print "Finding title"
         self.title = self.text[text_lower.find('<title')+6:text_lower.find('</title>')]
-        print "Found title: %s" % self.title
-        print "Finding head"
+        #print "Found title: %s" % self.title
+        #print "Finding head"
         self.head = self.text[text_lower.find('<head')+5:text_lower.find('</head>')]
-        print "Found head of length %i" % len(self.head)
+        #print "Found head of length %i" % len(self.head)
         self.processHead()
-        print "Finding body"
+        #print "Finding body"
         self.body = self.text[text_lower.find('<body'):text_lower.find('</body>')]
-        print "Found body of length %i" % len(self.body)
+        #print "Found body of length %i" % len(self.body)
         queue = self.processBody()
-        print "combining keyword lists"
+        #print "combining keyword lists"
         self.combineKeywordLists()
         return queue

     def getDataDict(self):
+        for k,v in self.keywords.items():
+            if v < 3:
+                del self.keywords[k]
         return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}

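Two of the changes above are worth a note: the else branch now calls the module-level rankKeywords() directly, matching the pool.map() branch, and getDataDict() now drops keywords seen fewer than three times before the page is stored. A small isolated sketch of that pruning step, with made-up sample counts; under Python 2, dict.items() returns a list copy, so deleting entries inside the loop is safe.

keywords = {'python': 7, 'crawler': 4, 'the': 1, 'and': 2}  # sample counts

# Drop low-frequency keywords, mirroring the getDataDict() change
for k, v in keywords.items():
    if v < 3:
        del keywords[k]

print keywords  # only 'python' and 'crawler' remain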
query.py

Lines changed: 3 additions & 4 deletions
@@ -56,7 +56,7 @@ def enqueue(self, urls):
             return False
         if len(urls) == 0:
             return True
-        args = [{'address':u} for u in urls]
+        args = [{'address':unicode(u)} for u in urls]
         result = self.connection.execute(self.queue_table.insert(), args)
         if result:
             return True
@@ -77,12 +77,11 @@ def dequeue(self):
             if not delres:
                 return False
             # Return the row
-            print result
             return result[0][1]
         return False

     def checkCrawled(self, url):
-        s = select([self.crawl_table]).where(self.crawl_table.c.address == url)
+        s = select([self.crawl_table]).where(self.crawl_table.c.address == unicode(url))
         result = self.connection.execute(s)
         if len(result.fetchall()) > 0:
             result.close()
@@ -101,7 +100,7 @@ def addPage(self, data):
         if not self.connected:
             return False
         # Add the page to the crawl table
-        result = self.connection.execute(self.crawl_table.insert().values(address=data['address'],http_status=data['status'],title=data['title'],size=data['size']))
+        result = self.connection.execute(self.crawl_table.insert().values(address=unicode(data['address']),http_status=data['status'],title=unicode(data['title']),size=data['size']))
         if not result:
             return False
         # generate list of argument dictionaries for the insert many statement

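The query.py changes all coerce values to unicode before they are bound, which is what SQLAlchemy expects for Unicode-typed columns under Python 2; plain byte strings typically trigger an SAWarning or a codec error, depending on the dialect. A minimal sketch of the same idea, assuming a hypothetical in-memory table whose name and schema are not the project's actual ones:

from sqlalchemy import create_engine, MetaData, Table, Column, Integer, Unicode

engine = create_engine('sqlite:///:memory:')  # throwaway database for the sketch
metadata = MetaData()
queue_table = Table('queue_example', metadata,
                    Column('id', Integer, primary_key=True),
                    Column('address', Unicode(512)))
metadata.create_all(engine)

connection = engine.connect()
urls = ['http://example.com/', 'http://example.org/a']  # placeholder URLs
# Coerce each byte string to unicode before the executemany-style insert,
# as the commit does in enqueue()
connection.execute(queue_table.insert(), [{'address': unicode(u)} for u in urls])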