Commit dacd344

lots of fixes
1 parent 6fdbc75 commit dacd344

3 files changed: +61 -45 lines


PyCrawler.py

Lines changed: 35 additions & 22 deletions
@@ -1,7 +1,7 @@
 from query import CrawlerDb
 from content_processor import ContentProcessor
 from settings import VERBOSE
-import sys, urlparse, urllib2
+import sys, urlparse, urllib2, robotparser

 # ===== Init stuff =====

@@ -12,6 +12,9 @@
 # content processor init
 processor = ContentProcessor(None, None, None)

+# robot parser init
+robot = robotparser.RobotFileParser()
+
 if len(sys.argv) < 2:
     print "Error: No start url was passed"
     sys.exit()
@@ -22,52 +25,62 @@

 def crawl():
     print "starting..."
-    queue_empty = False
     while True:
         url = cdb.dequeue()
-        print url
+        u = urlparse.urlparse(url)
+        robot.set_url('http://'+u[1]+"/robots.txt")
+        if not robot.can_fetch('PyCrawler', url):
+            print "Url disallowed by robots.txt: %s " % url
+            continue
+        if not url.startswith('http'):
+            print "Unfollowable link found at %s " % url
+            continue
+
         if cdb.checkCrawled(url):
             continue
         if url is False:
-            queue_empty = True
-
-        # Get HTTPConnection
-        #connection = httplib.HTTPConnection(parsed_url.netloc)
-        # Make the request
-        #connection.request("GET", parsed_url.path)
-        # Get response
-        #response = connection.getresponse()
-        #data = response.read()
+            break
         status = 0
         request = None
         try:
             request = urllib2.urlopen(str(url))
         except urllib2.URLError, e:
-            print e.reason
+            print e
+            print "Exception at url: %s" % url
+            continue
         except urllib2.HTTPError, e:
             status = e.code
         if status == 0:
             status = 200
         data = request.read()
+        processor.setInfo(str(url), status, data)
+        ret = processor.process()
+        if status != 200:
+            continue
+        add_queue = []
+        for q in ret:
+            if not cdb.checkCrawled(q):
+                add_queue.append(q)

+        l = len(add_queue)
         if VERBOSE:
             print "Got %s status from %s" % (status, url)
-        processor.setInfo(str(url), status, data)
-        add_queue = processor.process()
-        l = len(add_queue)
-        print "Found %i links" % l
+            print "Found %i links" % l
         if l > 0:
-            if queue_empty == True:
-                queue_empty = False
             cdb.enqueue(add_queue)
         cdb.addPage(processor.getDataDict())
         processor.reset()
-        if queue_empty:
-            break

     print "finishing..."
     cdb.close()
     print "done! goodbye!"

 if __name__ == "__main__":
-    crawl()
+    try:
+        crawl()
+    except KeyboardInterrupt:
+        print "Stopping"
+        sys.exit()
+    except Exception, e:
+        print "EXCEPTION: %s " % e
+

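Note on the robots.txt check added above: it relies on Python 2's robotparser module. As a point of reference, here is a minimal standalone sketch of that flow; the URL and the 'PyCrawler' user-agent string are placeholders, and it includes the read() call that downloads and parses the rules, which can_fetch() consults.

import urlparse, robotparser

def allowed(url, agent='PyCrawler'):
    # Point the parser at the site's robots.txt, derived from the URL's host
    parts = urlparse.urlparse(url)
    rp = robotparser.RobotFileParser()
    rp.set_url(parts.scheme + '://' + parts.netloc + '/robots.txt')
    rp.read()  # fetch and parse the rules before asking can_fetch()
    return rp.can_fetch(agent, url)

if __name__ == '__main__':
    print allowed('http://example.com/some/page')  # placeholder URL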
content_processor.py

Lines changed: 23 additions & 19 deletions
@@ -12,16 +12,17 @@ def rankKeywords(text):
         if t in invalid_keywords:
             continue
         if not ranks.has_key(t):
-            print "adding %s" % t
+            #print "adding %s" % t
             ranks[t] = 1
         else:
             ranks[t] += 1
-            print "setting %s to %i" % (t, ranks[t])
+            #print "setting %s to %i" % (t, ranks[t])
     return ranks

 def stripPunctuation(text):
     pattern = re.compile(r'[^\w\s]')
     return pattern.sub('', text)
+
 class ContentProcessor:

     def __init__(self, url, status, text):
@@ -67,38 +68,38 @@ def combineKeywordLists(self):
         for k,v in l.items():
             if self.keywords.has_key(k):
                 self.keywords[k] += v
-                print "setting %s to %i" %(k,self.keywords[k])
+                #print "setting %s to %i" %(k,self.keywords[k])
             else:
                 self.keywords[k] = v
-                print "setting %s to %i" %(k,v)
+                #print "setting %s to %i" %(k,v)

     # returns links to queue
     def processBody(self):
         queue = ready_queue(self.url, self.body)
-        print "found %i links to queue" % len(queue)
+        #print "found %i links to queue" % len(queue)
         self.text = stripPunctuation(self.remove_html_tags(self.body))
         if len(self.text) > 5000:
             offset = 0
             i = 0
             l = []
-            print "splitting text"
+            #print "splitting text"
             while True:
                 j = self.findnth(self.text[i:],' ',500)
                 offset += j
-                print "SPLIT: 500th space at %i" % j
+                #print "SPLIT: 500th space at %i" % j
                 if j == -1:
-                    print "appending from %i on" % i
+                    #print "appending from %i on" % i
                     l.append(self.text[i:])
                     break
-                print "appending from %i to %i" % (i,j)
+                #print "appending from %i to %i" % (i,j)
                 l.append(self.text[i:j])
                 i = offset + j+1
-            print "processing with %i threads" % len(l)
+            #print "processing with %i threads" % len(l)
             pool = Pool(processes=(len(l)))
             self.keyword_dicts = pool.map(rankKeywords, l)
-            print "processed, returned %i dicts" % len(self.keyword_dicts)
+            #print "processed, returned %i dicts" % len(self.keyword_dicts)
         else:
-            self.keyword_dicts.append(self.rankKeywords(self.text))
+            self.keyword_dicts.append(rankKeywords(self.text))
         return queue

     def processHead(self):
@@ -117,20 +118,23 @@ def findnth(self, haystack, needle, n):
     # returns the queue from processBody
     def process(self):
         text_lower = self.text.lower()
-        print "Finding title"
+        #print "Finding title"
         self.title = self.text[text_lower.find('<title')+6:text_lower.find('</title>')]
-        print "Found title: %s" % self.title
-        print "Finding head"
+        #print "Found title: %s" % self.title
+        #print "Finding head"
         self.head = self.text[text_lower.find('<head')+5:text_lower.find('</head>')]
-        print "Found head of length %i" % len(self.head)
+        #print "Found head of length %i" % len(self.head)
         self.processHead()
-        print "Finding body"
+        #print "Finding body"
         self.body = self.text[text_lower.find('<body'):text_lower.find('</body>')]
-        print "Found body of length %i" % len(self.body)
+        #print "Found body of length %i" % len(self.body)
         queue = self.processBody()
-        print "combining keyword lists"
+        #print "combining keyword lists"
         self.combineKeywordLists()
         return queue

     def getDataDict(self):
+        for k,v in self.keywords.items():
+            if v < 3:
+                del self.keywords[k]
         return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}

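Two of the changes above are worth a note: the else branch now calls the module-level rankKeywords() directly, matching the pool.map() branch, and getDataDict() now drops keywords seen fewer than three times before the page is stored. A small isolated sketch of that pruning step, with made-up sample counts; under Python 2, dict.items() returns a list copy, so deleting entries inside the loop is safe.

keywords = {'python': 7, 'crawler': 4, 'the': 1, 'and': 2}  # sample counts

# Drop low-frequency keywords, mirroring the getDataDict() change
for k, v in keywords.items():
    if v < 3:
        del keywords[k]

print keywords  # only 'python' and 'crawler' remain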
query.py

Lines changed: 3 additions & 4 deletions
@@ -56,7 +56,7 @@ def enqueue(self, urls):
             return False
         if len(urls) == 0:
             return True
-        args = [{'address':u} for u in urls]
+        args = [{'address':unicode(u)} for u in urls]
         result = self.connection.execute(self.queue_table.insert(), args)
         if result:
             return True
@@ -77,12 +77,11 @@ def dequeue(self):
             if not delres:
                 return False
             # Return the row
-            print result
             return result[0][1]
         return False

     def checkCrawled(self, url):
-        s = select([self.crawl_table]).where(self.crawl_table.c.address == url)
+        s = select([self.crawl_table]).where(self.crawl_table.c.address == unicode(url))
         result = self.connection.execute(s)
         if len(result.fetchall()) > 0:
             result.close()
@@ -101,7 +100,7 @@ def addPage(self, data):
         if not self.connected:
             return False
         # Add the page to the crawl table
-        result = self.connection.execute(self.crawl_table.insert().values(address=data['address'],http_status=data['status'],title=data['title'],size=data['size']))
+        result = self.connection.execute(self.crawl_table.insert().values(address=unicode(data['address']),http_status=data['status'],title=unicode(data['title']),size=data['size']))
         if not result:
             return False
         # generate list of argument dictionaries for the insert many statement

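The query.py changes all coerce values to unicode before they are bound, which is what SQLAlchemy expects for Unicode-typed columns under Python 2; plain byte strings typically trigger an SAWarning or a codec error, depending on the dialect. A minimal sketch of the same idea, assuming a hypothetical in-memory table whose name and schema are not the project's actual ones:

from sqlalchemy import create_engine, MetaData, Table, Column, Integer, Unicode

engine = create_engine('sqlite:///:memory:')  # throwaway database for the sketch
metadata = MetaData()
queue_table = Table('queue_example', metadata,
                    Column('id', Integer, primary_key=True),
                    Column('address', Unicode(512)))
metadata.create_all(engine)

connection = engine.connect()
urls = ['http://example.com/', 'http://example.org/a']  # placeholder URLs
# Coerce each byte string to unicode before the executemany-style insert,
# as the commit does in enqueue()
connection.execute(queue_table.insert(), [{'address': unicode(u)} for u in urls])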