
Commit 5db8daf (2 parents: ad55cd8 + ca6d7a8)

Merge branch 'master' of https://github.com/theanti9/PyCrawler

Conflicts: content_processor.py

8 files changed: +158 −73 lines

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
 *.pyc
 .DS_Store
-*.db
+*.db
+*.log

ColorStreamHandler.py

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+import logging
+import curses
+
+class ColorStreamHandler(logging.Handler):
+
+    def __init__(self, use_colors):
+        logging.Handler.__init__(self)
+        self.use_colors = use_colors
+
+        # Initialize environment
+        curses.setupterm()
+
+        # Get the foreground color attribute for this environment
+        self.fcap = curses.tigetstr('setaf')
+
+        # Get the normal attribute
+        self.COLOR_NORMAL = curses.tigetstr('sgr0')
+
+        # Get + Save the color sequences
+        self.COLOR_INFO = curses.tparm(self.fcap, curses.COLOR_GREEN)
+        self.COLOR_ERROR = curses.tparm(self.fcap, curses.COLOR_RED)
+        self.COLOR_WARNING = curses.tparm(self.fcap, curses.COLOR_YELLOW)
+        self.COLOR_DEBUG = curses.tparm(self.fcap, curses.COLOR_BLUE)
+
+    def color(self, msg, level):
+        if level == "INFO":
+            return "%s%s%s" % (self.COLOR_INFO, msg, self.COLOR_NORMAL)
+        elif level == "WARNING":
+            return "%s%s%s" % (self.COLOR_WARNING, msg, self.COLOR_NORMAL)
+        elif level == "ERROR":
+            return "%s%s%s" % (self.COLOR_ERROR, msg, self.COLOR_NORMAL)
+        elif level == "DEBUG":
+            return "%s%s%s" % (self.COLOR_DEBUG, msg, self.COLOR_NORMAL)
+        else:
+            return msg
+
+    def emit(self, record):
+        record.msg = record.msg.encode('utf-8', 'ignore')
+        msg = self.format(record)
+
+        # This just removes the date and milliseconds from asctime
+        temp = msg.split(']')
+        msg = '[' + temp[0].split(' ')[1].split(',')[0] + ']' + temp[1]
+
+        if self.use_colors:
+            msg = self.color(msg, record.levelname)
+        print msg
+
+    # 'record' has the following attributes:
+    # threadName
+    # name
+    # thread
+    # created
+    # process
+    # processName
+    # args
+    # module
+    # filename
+    # levelno
+    # exc_text
+    # pathname
+    # lineno
+    # msg
+    # exc_info
+    # funcName
+    # relativeCreated
+    # levelname
+    # msecs
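For context, a minimal sketch of how a handler like this can be attached by hand, outside the dictConfig wiring used elsewhere in this commit. The logger name and messages are illustrative only, a curses-capable terminal is assumed, and the formatter mirrors the 'console' format string from settings.py (emit() expects the '[asctime] ...' shape so it can strip the date):

import logging
from ColorStreamHandler import ColorStreamHandler

# Illustrative wiring only; ColorStreamHandler.py must be on the import path.
handler = ColorStreamHandler(use_colors=True)
handler.setFormatter(logging.Formatter('[%(asctime)s] %(levelname)s::%(module)s - %(message)s'))

log = logging.getLogger("demo")    # illustrative logger name
log.setLevel(logging.DEBUG)
log.addHandler(handler)
log.info("prints in green")        # emit() trims asctime down to [HH:MM:SS]
log.error("prints in red")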

PyCrawler.db.1

−220 KB (binary file not shown)

PyCrawler.py

Lines changed: 22 additions & 26 deletions
@@ -1,8 +1,9 @@
 from query import CrawlerDb
 from content_processor import ContentProcessor
-from settings import VERBOSE, USE_COLORS, DATABASE_ENGINE, DATABASE_NAME, SQLITE_ROTATE_DATABASE_ON_STARTUP
+from settings import LOGGING
 import sys, urlparse, urllib2, shutil, glob, robotparser
-import cPrinter
+import logging, logging.config
+import traceback
 
 # ===== Init stuff =====
 
@@ -13,45 +14,47 @@
 # content processor init
 processor = ContentProcessor(None, None, None)
 
-# get cprinter
-printer = cPrinter.Printer(USE_COLORS)
+# logging setup
+logging.config.dictConfig(LOGGING)
+logger = logging.getLogger("crawler_logger")
 
 # robot parser init
 robot = robotparser.RobotFileParser()
 
 if len(sys.argv) < 2:
-    printer.p("Error: No start url was passed", printer.other)
+    logger.info("Error: No start url was passed")
     sys.exit()
 
 l = sys.argv[1:]
 
 cdb.enqueue(l)
 
 def crawl():
-    printer.p("starting...", printer.other)
+    logger.info("Starting (%s)..." % sys.argv[1])
    while True:
        url = cdb.dequeue()
        u = urlparse.urlparse(url)
        robot.set_url('http://'+u[1]+"/robots.txt")
-       if not robot.can_fetch('PyCrawler', url):
-           printer.p("Url disallowed by robots.txt: %s " % url, printer.other)
+       if not robot.can_fetch('PyCrawler', url.encode('ascii', 'replace')):
+           logger.warning("Url disallowed by robots.txt: %s " % url)
            continue
        if not url.startswith('http'):
-           printer.p("Unfollowable link found at %s " % url, printer.other)
+           logger.warning("Unfollowable link found at %s " % url)
            continue
 
        if cdb.checkCrawled(url):
            continue
        if url is False:
            break
        status = 0
+       req = urllib2.Request(str(url))
+       req.add_header('User-Agent', 'PyCrawler 0.2.0')
        request = None
+
        try:
-           request = urllib2.urlopen(str(url))
+           request = urllib2.urlopen(req)
        except urllib2.URLError, e:
-           printer.p(e.reason, printer.error)
-           printer.p("Exception at url: %s" % url, printer.error)
-
+           logger.error("Exception at url: %s\n%s" % (url, e))
            continue
        except urllib2.HTTPError, e:
            status = e.code
@@ -70,30 +73,23 @@ def crawl():
        processor.setInfo(str(url), status, data)
        add_queue = processor.process()
        l = len(add_queue)
-       if VERBOSE:
-           printer.p("Got %s status from %s" % (status, url), printer.success)
-           printer.p("Found %i links" % l, printer.success)
+       logger.info("Got %s status from %s (Found %i links)" % (status, url, l))
        if l > 0:
            cdb.enqueue(add_queue)
            cdb.addPage(processor.getDataDict())
        processor.reset()
 
-   printer.p("finishing...", printer.other)
+   logger.info("Finishing...")
    cdb.close()
-   printer.p("done! goodbye!", printer.success)
+   logger.info("Done! Goodbye!")
 
 if __name__ == "__main__":
-   if DATABASE_ENGINE == "sqlite" and SQLITE_ROTATE_DATABASE_ON_STARTUP:
-       dbs = glob.glob("*.db*")
-       index = 1;
-       while("%s.db.%s" % (DATABASE_NAME, index) in dbs):
-           index += 1
-       shutil.copy2(dbs[len(dbs)-1], "%s.db.%s" % (DATABASE_NAME, index))
    try:
        crawl()
    except KeyboardInterrupt:
-       printer.p("Stopping", printer.error)
+       logger.error("Stopping (KeyboardInterrupt)")
        sys.exit()
    except Exception, e:
-       printer.p("EXCEPTION: %s " % e, printer.error)
+       logger.error("EXCEPTION: %s " % e)
+       traceback.print_exc()
 
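As a standalone illustration of the new fetch pattern above (a Request object carrying an explicit User-Agent instead of a bare urlopen call), here is a minimal Python 2 sketch; the URL and the printed output are illustrative only:

import urllib2

url = "http://example.com/"                        # illustrative URL
req = urllib2.Request(url)
req.add_header('User-Agent', 'PyCrawler 0.2.0')    # identify the crawler to servers

try:
    response = urllib2.urlopen(req)
    print response.getcode(), "-", len(response.read()), "bytes"
except urllib2.HTTPError, e:                       # HTTPError first: it subclasses URLError
    print "HTTP error:", e.code
except urllib2.URLError, e:                        # network-level failure (DNS, refused, ...)
    print "URL error:", e.reason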

cPrinter.py

Lines changed: 0 additions & 36 deletions
This file was deleted.

content_processor.py

Lines changed: 18 additions & 5 deletions
@@ -1,8 +1,9 @@
-from ready_queue import ready_queue
-
 from multiprocessing import Pool
+import re, sys, logging
+
+from ready_queue import ready_queue
 
-import re, sys
+logger = logging.getLogger("crawler_logger")
 
 def rankKeywords(text):
     invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]
@@ -84,8 +85,20 @@ def processBody(self):
                    break
                l.append(self.text[i:j])
                i = offset + j+1
-           pool = Pool(processes=(len(l)))
-           self.keyword_dicts = pool.map(rankKeywords, l)
+           logger.debug("processing with %i threads" % len(l))
+           try:
+               if len(l) == 0:
+                   return []
+               pool = Pool(processes=(len(l)))
+               self.keyword_dicts = pool.map(rankKeywords, l)
+           except KeyboardInterrupt:
+               pool.terminate()
+               pool.join()
+               sys.exit()
+           else:
+               pool.close()
+               pool.join()
+           logger.debug("processed, returned %i dicts" % len(self.keyword_dicts))
        else:
            self.keyword_dicts.append(rankKeywords(self.text))
        return queue
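The hunk above guards the worker pool against an empty chunk list and tears it down cleanly on Ctrl-C. A self-contained sketch of that pattern, with a toy word-counting function standing in for rankKeywords (all names below are illustrative):

from multiprocessing import Pool
import sys

def count_words(chunk):              # toy stand-in for rankKeywords
    return len(chunk.split())

def process_chunks(chunks):
    if len(chunks) == 0:             # Pool(processes=0) would raise, so bail out early
        return []
    pool = Pool(processes=len(chunks))
    try:
        results = pool.map(count_words, chunks)
    except KeyboardInterrupt:        # Ctrl-C: kill the workers instead of hanging
        pool.terminate()
        pool.join()
        sys.exit()
    else:                            # normal path: let the workers drain and exit
        pool.close()
        pool.join()
    return results

if __name__ == "__main__":
    print process_chunks(["one two", "three four five"])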

query.py

Lines changed: 4 additions & 1 deletion
@@ -100,7 +100,10 @@ def addPage(self, data):
        if not self.connected:
            return False
        # Add the page to the crawl table
-       result = self.connection.execute(self.crawl_table.insert().values(address=unicode(data['address']),http_status=data['status'],title=unicode(data['title']),size=data['size']))
+       try:
+           result = self.connection.execute(self.crawl_table.insert().values(address=unicode(data['address']),http_status=data['status'],title=unicode(data['title']),size=data['size']))
+       except UnicodeDecodeError:
+           return False
        if not result:
            return False
        # generate list of argument dictionaries for the insert many statement
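For context on why the new guard helps: in Python 2, calling unicode() on a byte string containing non-ASCII bytes raises UnicodeDecodeError, which is presumably what crashed addPage on some scraped titles before this change. A tiny illustration (the sample bytes and variable names are illustrative):

title = "Caf\xc3\xa9"               # UTF-8 bytes, as a scraped title might arrive
try:
    value = unicode(title)          # implicit ASCII decode fails on \xc3\xa9
except UnicodeDecodeError:
    value = None                    # the commit instead returns False and skips the page
print repr(value)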

settings.py

Lines changed: 44 additions & 4 deletions
@@ -1,4 +1,4 @@
-# settings.py
+import logging
 
 DATABASE_ENGINE = "sqlite" # sqlite or mysql
 DATABASE_NAME = "PyCrawler" # Database name
@@ -7,8 +7,48 @@
 DATABASE_USER = "" # Not used with sqlite
 DATABASE_PASS = "" # Not used with sqlite
 
-SQLITE_ROTATE_DATABASE_ON_STARTUP = True # Rotate the database to a new one on startup
+DEBUG = True # Whether or not to show DEBUG level messages
+USE_COLORS = True # Whether or not colors should be used when outputting text
 
-VERBOSE = True
+LOGGING = { # dictConfig for output stream and file logging
+    'version': 1,
+    'disable_existing_loggers': False,
 
-USE_COLORS = True # Whether or not colors should be used when printing text
+    'formatters': {
+        'console': {
+            'format': '[%(asctime)s] %(levelname)s::%(module)s - %(message)s',
+        },
+        'file': {
+            'format': '[%(asctime)s] %(levelname)s::(P:%(process)d T:%(thread)d)::%(module)s - %(message)s',
+        },
+    },
+
+    'handlers': {
+        'console': {
+            'class': 'ColorStreamHandler.ColorStreamHandler',
+            'formatter':'console',
+            'level': 'DEBUG',
+            'use_colors': USE_COLORS,
+        },
+        'file': {
+            'class': 'logging.handlers.TimedRotatingFileHandler',
+            'formatter':'file',
+            'level': 'INFO',
+            'when': 'midnight',
+            'filename': 'pycrawler.log',
+            'interval': 1,
+            'backupCount': 0,
+            'encoding': None,
+            'delay': False,
+            'utc': False,
+        },
+    },
+
+    'loggers': {
+        'crawler_logger': {
+            'handlers': ['console', 'file'],
+            'level': 'DEBUG' if DEBUG else 'INFO',
+            'propagate': True,
+        },
+    }
+}
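For context, a minimal sketch of how this LOGGING dict gets consumed, mirroring the wiring added to PyCrawler.py above. The demo messages are illustrative, and ColorStreamHandler.py must be importable so dictConfig can resolve the 'console' handler class (extra keys such as use_colors are passed to its constructor):

import logging, logging.config
from settings import LOGGING

logging.config.dictConfig(LOGGING)               # builds both handlers from the dict
logger = logging.getLogger("crawler_logger")     # the logger configured above

logger.debug("console only: the file handler is filtered to INFO and above")
logger.info("written to the console and appended to pycrawler.log")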

0 commit comments