
Commit 5db8daf (2 parents: ad55cd8 + ca6d7a8)

Merge branch 'master' of https://github.com/theanti9/PyCrawler

Conflicts: content_processor.py

8 files changed: +158 −73 lines

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
 *.pyc
 .DS_Store
-*.db
+*.db
+*.log

ColorStreamHandler.py

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+import logging
+import curses
+
+class ColorStreamHandler(logging.Handler):
+
+    def __init__(self, use_colors):
+        logging.Handler.__init__(self)
+        self.use_colors = use_colors
+
+        # Initialize environment
+        curses.setupterm()
+
+        # Get the foreground color attribute for this environment
+        self.fcap = curses.tigetstr('setaf')
+
+        # Get the normal attribute
+        self.COLOR_NORMAL = curses.tigetstr('sgr0')
+
+        # Get + Save the color sequences
+        self.COLOR_INFO = curses.tparm(self.fcap, curses.COLOR_GREEN)
+        self.COLOR_ERROR = curses.tparm(self.fcap, curses.COLOR_RED)
+        self.COLOR_WARNING = curses.tparm(self.fcap, curses.COLOR_YELLOW)
+        self.COLOR_DEBUG = curses.tparm(self.fcap, curses.COLOR_BLUE)
+
+    def color(self, msg, level):
+        if level == "INFO":
+            return "%s%s%s" % (self.COLOR_INFO, msg, self.COLOR_NORMAL)
+        elif level == "WARNING":
+            return "%s%s%s" % (self.COLOR_WARNING, msg, self.COLOR_NORMAL)
+        elif level == "ERROR":
+            return "%s%s%s" % (self.COLOR_ERROR, msg, self.COLOR_NORMAL)
+        elif level == "DEBUG":
+            return "%s%s%s" % (self.COLOR_DEBUG, msg, self.COLOR_NORMAL)
+        else:
+            return msg
+
+    def emit(self, record):
+        record.msg = record.msg.encode('utf-8', 'ignore')
+        msg = self.format(record)
+
+        # This just removes the date and milliseconds from asctime
+        temp = msg.split(']')
+        msg = '[' + temp[0].split(' ')[1].split(',')[0] + ']' + temp[1]
+
+        if self.use_colors:
+            msg = self.color(msg, record.levelname)
+        print msg
+
+    # 'record' has the following attributes:
+    # threadName
+    # name
+    # thread
+    # created
+    # process
+    # processName
+    # args
+    # module
+    # filename
+    # levelno
+    # exc_text
+    # pathname
+    # lineno
+    # msg
+    # exc_info
+    # funcName
+    # relativeCreated
+    # levelname
+    # msecs
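For context, a minimal sketch of how a handler like this can be attached by hand, outside the dictConfig wiring used elsewhere in this commit. The logger name and messages are illustrative only, a curses-capable terminal is assumed, and the formatter mirrors the 'console' format string from settings.py (emit() expects the '[asctime] ...' shape so it can strip the date):

import logging
from ColorStreamHandler import ColorStreamHandler

# Illustrative wiring only; ColorStreamHandler.py must be on the import path.
handler = ColorStreamHandler(use_colors=True)
handler.setFormatter(logging.Formatter('[%(asctime)s] %(levelname)s::%(module)s - %(message)s'))

log = logging.getLogger("demo")    # illustrative logger name
log.setLevel(logging.DEBUG)
log.addHandler(handler)
log.info("prints in green")        # emit() trims asctime down to [HH:MM:SS]
log.error("prints in red")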

PyCrawler.db.1

−220 KB (binary file not shown)

PyCrawler.py

Lines changed: 22 additions & 26 deletions
@@ -1,8 +1,9 @@
 from query import CrawlerDb
 from content_processor import ContentProcessor
-from settings import VERBOSE, USE_COLORS, DATABASE_ENGINE, DATABASE_NAME, SQLITE_ROTATE_DATABASE_ON_STARTUP
+from settings import LOGGING
 import sys, urlparse, urllib2, shutil, glob, robotparser
-import cPrinter
+import logging, logging.config
+import traceback
 
 # ===== Init stuff =====
 
@@ -13,45 +14,47 @@
 # content processor init
 processor = ContentProcessor(None, None, None)
 
-# get cprinter
-printer = cPrinter.Printer(USE_COLORS)
+# logging setup
+logging.config.dictConfig(LOGGING)
+logger = logging.getLogger("crawler_logger")
 
 # robot parser init
 robot = robotparser.RobotFileParser()
 
 if len(sys.argv) < 2:
-    printer.p("Error: No start url was passed", printer.other)
+    logger.info("Error: No start url was passed")
     sys.exit()
 
 l = sys.argv[1:]
 
 cdb.enqueue(l)
 
 def crawl():
-    printer.p("starting...", printer.other)
+    logger.info("Starting (%s)..." % sys.argv[1])
    while True:
        url = cdb.dequeue()
        u = urlparse.urlparse(url)
        robot.set_url('http://'+u[1]+"/robots.txt")
-       if not robot.can_fetch('PyCrawler', url):
-           printer.p("Url disallowed by robots.txt: %s " % url, printer.other)
+       if not robot.can_fetch('PyCrawler', url.encode('ascii', 'replace')):
+           logger.warning("Url disallowed by robots.txt: %s " % url)
            continue
        if not url.startswith('http'):
-           printer.p("Unfollowable link found at %s " % url, printer.other)
+           logger.warning("Unfollowable link found at %s " % url)
            continue
 
        if cdb.checkCrawled(url):
            continue
        if url is False:
            break
        status = 0
+       req = urllib2.Request(str(url))
+       req.add_header('User-Agent', 'PyCrawler 0.2.0')
        request = None
+
        try:
-           request = urllib2.urlopen(str(url))
+           request = urllib2.urlopen(req)
        except urllib2.URLError, e:
-           printer.p(e.reason, printer.error)
-           printer.p("Exception at url: %s" % url, printer.error)
-
+           logger.error("Exception at url: %s\n%s" % (url, e))
            continue
        except urllib2.HTTPError, e:
            status = e.code
@@ -70,30 +73,23 @@ def crawl():
        processor.setInfo(str(url), status, data)
        add_queue = processor.process()
        l = len(add_queue)
-       if VERBOSE:
-           printer.p("Got %s status from %s" % (status, url), printer.success)
-           printer.p("Found %i links" % l, printer.success)
+       logger.info("Got %s status from %s (Found %i links)" % (status, url, l))
        if l > 0:
            cdb.enqueue(add_queue)
            cdb.addPage(processor.getDataDict())
        processor.reset()
 
-   printer.p("finishing...", printer.other)
+   logger.info("Finishing...")
    cdb.close()
-   printer.p("done! goodbye!", printer.success)
+   logger.info("Done! Goodbye!")
 
 if __name__ == "__main__":
-   if DATABASE_ENGINE == "sqlite" and SQLITE_ROTATE_DATABASE_ON_STARTUP:
-       dbs = glob.glob("*.db*")
-       index = 1;
-       while("%s.db.%s" % (DATABASE_NAME, index) in dbs):
-           index += 1
-       shutil.copy2(dbs[len(dbs)-1], "%s.db.%s" % (DATABASE_NAME, index))
    try:
        crawl()
    except KeyboardInterrupt:
-       printer.p("Stopping", printer.error)
+       logger.error("Stopping (KeyboardInterrupt)")
        sys.exit()
    except Exception, e:
-       printer.p("EXCEPTION: %s " % e, printer.error)
+       logger.error("EXCEPTION: %s " % e)
+       traceback.print_exc()
 
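As a standalone illustration of the new fetch pattern above (a Request object carrying an explicit User-Agent instead of a bare urlopen call), here is a minimal Python 2 sketch; the URL and the printed output are illustrative only:

import urllib2

url = "http://example.com/"                        # illustrative URL
req = urllib2.Request(url)
req.add_header('User-Agent', 'PyCrawler 0.2.0')    # identify the crawler to servers

try:
    response = urllib2.urlopen(req)
    print response.getcode(), "-", len(response.read()), "bytes"
except urllib2.HTTPError, e:                       # HTTPError first: it subclasses URLError
    print "HTTP error:", e.code
except urllib2.URLError, e:                        # network-level failure (DNS, refused, ...)
    print "URL error:", e.reason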

cPrinter.py

Lines changed: 0 additions & 36 deletions
This file was deleted.

content_processor.py

Lines changed: 18 additions & 5 deletions
@@ -1,8 +1,9 @@
-from ready_queue import ready_queue
-
 from multiprocessing import Pool
+import re, sys, logging
+
+from ready_queue import ready_queue
 
-import re, sys
+logger = logging.getLogger("crawler_logger")
 
 def rankKeywords(text):
     invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]
@@ -84,8 +85,20 @@ def processBody(self):
                    break
                l.append(self.text[i:j])
                i = offset + j+1
-           pool = Pool(processes=(len(l)))
-           self.keyword_dicts = pool.map(rankKeywords, l)
+           logger.debug("processing with %i threads" % len(l))
+           try:
+               if len(l) == 0:
+                   return []
+               pool = Pool(processes=(len(l)))
+               self.keyword_dicts = pool.map(rankKeywords, l)
+           except KeyboardInterrupt:
+               pool.terminate()
+               pool.join()
+               sys.exit()
+           else:
+               pool.close()
+               pool.join()
+           logger.debug("processed, returned %i dicts" % len(self.keyword_dicts))
        else:
            self.keyword_dicts.append(rankKeywords(self.text))
        return queue
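The hunk above guards the worker pool against an empty chunk list and tears it down cleanly on Ctrl-C. A self-contained sketch of that pattern, with a toy word-counting function standing in for rankKeywords (all names below are illustrative):

from multiprocessing import Pool
import sys

def count_words(chunk):              # toy stand-in for rankKeywords
    return len(chunk.split())

def process_chunks(chunks):
    if len(chunks) == 0:             # Pool(processes=0) would raise, so bail out early
        return []
    pool = Pool(processes=len(chunks))
    try:
        results = pool.map(count_words, chunks)
    except KeyboardInterrupt:        # Ctrl-C: kill the workers instead of hanging
        pool.terminate()
        pool.join()
        sys.exit()
    else:                            # normal path: let the workers drain and exit
        pool.close()
        pool.join()
    return results

if __name__ == "__main__":
    print process_chunks(["one two", "three four five"])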

query.py

Lines changed: 4 additions & 1 deletion
@@ -100,7 +100,10 @@ def addPage(self, data):
        if not self.connected:
            return False
        # Add the page to the crawl table
-       result = self.connection.execute(self.crawl_table.insert().values(address=unicode(data['address']),http_status=data['status'],title=unicode(data['title']),size=data['size']))
+       try:
+           result = self.connection.execute(self.crawl_table.insert().values(address=unicode(data['address']),http_status=data['status'],title=unicode(data['title']),size=data['size']))
+       except UnicodeDecodeError:
+           return False
        if not result:
            return False
        # generate list of argument dictionaries for the insert many statement
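For context on why the new guard helps: in Python 2, calling unicode() on a byte string containing non-ASCII bytes raises UnicodeDecodeError, which is presumably what crashed addPage on some scraped titles before this change. A tiny illustration (the sample bytes and variable names are illustrative):

title = "Caf\xc3\xa9"               # UTF-8 bytes, as a scraped title might arrive
try:
    value = unicode(title)          # implicit ASCII decode fails on \xc3\xa9
except UnicodeDecodeError:
    value = None                    # the commit instead returns False and skips the page
print repr(value)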

settings.py

Lines changed: 44 additions & 4 deletions
@@ -1,4 +1,4 @@
-# settings.py
+import logging
 
 DATABASE_ENGINE = "sqlite" # sqlite or mysql
 DATABASE_NAME = "PyCrawler" # Database name
@@ -7,8 +7,48 @@
 DATABASE_USER = "" # Not used with sqlite
 DATABASE_PASS = "" # Not used with sqlite
 
-SQLITE_ROTATE_DATABASE_ON_STARTUP = True # Rotate the database to a new one on startup
+DEBUG = True # Whether or not to show DEBUG level messages
+USE_COLORS = True # Whether or not colors should be used when outputting text
 
-VERBOSE = True
+LOGGING = { # dictConfig for output stream and file logging
+    'version': 1,
+    'disable_existing_loggers': False,
 
-USE_COLORS = True # Whether or not colors should be used when printing text
+    'formatters': {
+        'console': {
+            'format': '[%(asctime)s] %(levelname)s::%(module)s - %(message)s',
+        },
+        'file': {
+            'format': '[%(asctime)s] %(levelname)s::(P:%(process)d T:%(thread)d)::%(module)s - %(message)s',
+        },
+    },
+
+    'handlers': {
+        'console': {
+            'class': 'ColorStreamHandler.ColorStreamHandler',
+            'formatter':'console',
+            'level': 'DEBUG',
+            'use_colors': USE_COLORS,
+        },
+        'file': {
+            'class': 'logging.handlers.TimedRotatingFileHandler',
+            'formatter':'file',
+            'level': 'INFO',
+            'when': 'midnight',
+            'filename': 'pycrawler.log',
+            'interval': 1,
+            'backupCount': 0,
+            'encoding': None,
+            'delay': False,
+            'utc': False,
+        },
+    },
+
+    'loggers': {
+        'crawler_logger': {
+            'handlers': ['console', 'file'],
+            'level': 'DEBUG' if DEBUG else 'INFO',
+            'propagate': True,
+        },
+    }
+}
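For context, a minimal sketch of how this LOGGING dict gets consumed, mirroring the wiring added to PyCrawler.py above. The demo messages are illustrative, and ColorStreamHandler.py must be importable so dictConfig can resolve the 'console' handler class (extra keys such as use_colors are passed to its constructor):

import logging, logging.config
from settings import LOGGING

logging.config.dictConfig(LOGGING)               # builds both handlers from the dict
logger = logging.getLogger("crawler_logger")     # the logger configured above

logger.debug("console only: the file handler is filtered to INFO and above")
logger.info("written to the console and appended to pycrawler.log")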

0 commit comments