Skip to content

Commit 4425b4c

Browse files
committed
Created Printer class for color printing
1 parent 6fdbc75 commit 4425b4c

File tree

5 files changed

+54
-39
lines changed

5 files changed

+54
-39
lines changed

PyCrawler.py

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from query import CrawlerDb
22
from content_processor import ContentProcessor
3-
from settings import VERBOSE
3+
from settings import VERBOSE, COLOR_ERROR, COLOR_SUCCESS
44
import sys, urlparse, urllib2
5+
import cPrinter
56

67
# ===== Init stuff =====
78

@@ -12,50 +13,44 @@
1213
# content processor init
1314
processor = ContentProcessor(None, None, None)
1415

16+
# get cprinter
17+
printer = cPrinter.Printer(COLOR_SUCCESS, COLOR_ERROR)
18+
1519
if len(sys.argv) < 2:
16-
print "Error: No start url was passed"
20+
printer.p("Error: No start url was passed", printer.error)
1721
sys.exit()
1822

1923
l = sys.argv[1:]
2024

2125
cdb.enqueue(l)
2226

2327
def crawl():
24-
print "starting..."
28+
printer.p("starting...", printer.success)
2529
queue_empty = False
2630
while True:
2731
url = cdb.dequeue()
28-
print url
2932
if cdb.checkCrawled(url):
3033
continue
3134
if url is False:
3235
queue_empty = True
33-
34-
# Get HTTPConnection
35-
#connection = httplib.HTTPConnection(parsed_url.netloc)
36-
# Make the request
37-
#connection.request("GET", parsed_url.path)
38-
# Get response
39-
#response = connection.getresponse()
40-
#data = response.read()
4136
status = 0
4237
request = None
4338
try:
4439
request = urllib2.urlopen(str(url))
4540
except urllib2.URLError, e:
46-
print e.reason
41+
printer.p(e.reason, printer.error)
4742
except urllib2.HTTPError, e:
4843
status = e.code
4944
if status == 0:
5045
status = 200
5146
data = request.read()
5247

53-
if VERBOSE:
54-
print "Got %s status from %s" % (status, url)
5548
processor.setInfo(str(url), status, data)
5649
add_queue = processor.process()
5750
l = len(add_queue)
58-
print "Found %i links" % l
51+
if VERBOSE:
52+
printer.p("Got %s status from %s" % (status, url), printer.success)
53+
printer.p("Found %i links" % l, printer.success)
5954
if l > 0:
6055
if queue_empty == True:
6156
queue_empty = False
@@ -65,9 +60,9 @@ def crawl():
6560
if queue_empty:
6661
break
6762

68-
print "finishing..."
63+
printer.p("finishing...", printer.success)
6964
cdb.close()
70-
print "done! goodbye!"
65+
printer.p("done! goodbye!", printer.success)
7166

7267
if __name__ == "__main__":
7368
crawl()

cPrinter.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import curses
2+
3+
class Printer():
    """Print status messages to the terminal in a success or error colour.

    The constructor queries terminfo for the "set foreground colour"
    (``setaf``) and "reset attributes" (``sgr0``) capabilities, redefines
    colour slots 0 and 1 with the supplied RGB triples, and stores the
    escape sequences that select those slots. ``p()`` then wraps a message
    in the matching sequence and writes it with ``print``.
    """

    def __init__(self, COLOR_SUCCESS, COLOR_ERROR):
        """Set up the terminal colours.

        COLOR_SUCCESS -- [r, g, b] sequence for success messages, each
                         component in the range 0-255.
        COLOR_ERROR   -- [r, g, b] sequence for error messages, each
                         component in the range 0-255.

        Raises curses.error if the terminal cannot be initialised or does
        not allow its colours to be redefined.
        """
        # Define our types; callers pass one of these as the second
        # argument of p().
        self.success = 0
        self.error = 1

        # Initialize environment
        curses.setupterm()

        # Get the foreground color attribute for this environment
        self.fcap = curses.tigetstr('setaf')

        # Get the normal attribute
        self.COLOR_NORMAL = curses.tigetstr('sgr0')

        # Initialize custom colors to the first two slots.
        # NOTE(review): slot 0 is conventionally the terminal's black;
        # redefining it may affect other output -- confirm this is intended.
        curses.initscr()
        try:
            curses.start_color()
            curses.init_color(0,
                              self._scale(COLOR_SUCCESS[0]),
                              self._scale(COLOR_SUCCESS[1]),
                              self._scale(COLOR_SUCCESS[2]))
            curses.init_color(1,
                              self._scale(COLOR_ERROR[0]),
                              self._scale(COLOR_ERROR[1]),
                              self._scale(COLOR_ERROR[2]))
        finally:
            # Always leave curses mode, even when colour setup fails, so
            # the terminal is not left in an unusable state.
            curses.endwin()

        # Get + Save the color sequences
        self.COLOR_SUCCESS = curses.tparm(self.fcap, 0)
        self.COLOR_ERROR = curses.tparm(self.fcap, 1)

    @staticmethod
    def _scale(component):
        """Convert a 0-255 colour component to the 0-1000 range.

        curses.init_color() takes intensities between 0 and 1000; the
        result is clamped so out-of-range input cannot make it raise.
        """
        return max(0, min(1000, int(component) * 1000 // 255))

    def p(self, text, type):
        """Print text in the colour for the given message type.

        text -- the message to print.
        type -- self.success or self.error; any other value prints
                nothing.
        """
        if type == self.success:
            color = self.COLOR_SUCCESS
        elif type == self.error:
            # Errors must use the error colour, not the success colour.
            color = self.COLOR_ERROR
        else:
            return
        # A single parenthesised argument keeps this line valid as both a
        # Python 2 print statement and a Python 3 print() call.
        print("%s%s%s" % (color, text, self.COLOR_NORMAL))

content_processor.py

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,9 @@ def rankKeywords(text):
1212
if t in invalid_keywords:
1313
continue
1414
if not ranks.has_key(t):
15-
print "adding %s" % t
1615
ranks[t] = 1
1716
else:
1817
ranks[t] += 1
19-
print "setting %s to %i" % (t, ranks[t])
2018
return ranks
2119

2220
def stripPunctuation(text):
@@ -67,38 +65,28 @@ def combineKeywordLists(self):
6765
for k,v in l.items():
6866
if self.keywords.has_key(k):
6967
self.keywords[k] += v
70-
print "setting %s to %i" %(k,self.keywords[k])
7168
else:
7269
self.keywords[k] = v
73-
print "setting %s to %i" %(k,v)
7470

7571
# returns links to queue
7672
def processBody(self):
7773
queue = ready_queue(self.url, self.body)
78-
print "found %i links to queue" % len(queue)
7974
self.text = stripPunctuation(self.remove_html_tags(self.body))
8075
if len(self.text) > 5000:
8176
offset = 0
8277
i = 0
8378
l = []
84-
print "splitting text"
8579
while True:
8680
j = self.findnth(self.text[i:],' ',500)
8781
offset += j
88-
print "SPLIT: 500th space at %i" % j
8982
if j == -1:
90-
print "appending from %i on" % i
91-
l.append(self.text[i:])
9283
break
93-
print "appending from %i to %i" % (i,j)
9484
l.append(self.text[i:j])
9585
i = offset + j+1
96-
print "processing with %i threads" % len(l)
9786
pool = Pool(processes=(len(l)))
9887
self.keyword_dicts = pool.map(rankKeywords, l)
99-
print "processed, returned %i dicts" % len(self.keyword_dicts)
10088
else:
101-
self.keyword_dicts.append(self.rankKeywords(self.text))
89+
self.keyword_dicts.append(rankKeywords(self.text))
10290
return queue
10391

10492
def processHead(self):
@@ -117,18 +105,11 @@ def findnth(self, haystack, needle, n):
117105
# returns the queue from processBody
118106
def process(self):
119107
text_lower = self.text.lower()
120-
print "Finding title"
121108
self.title = self.text[text_lower.find('<title')+6:text_lower.find('</title>')]
122-
print "Found title: %s" % self.title
123-
print "Finding head"
124109
self.head = self.text[text_lower.find('<head')+5:text_lower.find('</head>')]
125-
print "Found head of length %i" % len(self.head)
126110
self.processHead()
127-
print "Finding body"
128111
self.body = self.text[text_lower.find('<body'):text_lower.find('</body>')]
129-
print "Found body of length %i" % len(self.body)
130112
queue = self.processBody()
131-
print "combining keyword lists"
132113
self.combineKeywordLists()
133114
return queue
134115

query.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@ def dequeue(self):
7777
if not delres:
7878
return False
7979
# Return the row
80-
print result
8180
return result[0][1]
8281
return False
8382

settings.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,9 @@
88
DATABASE_PASS = "" # Not used with sqlite
99

1010
VERBOSE = True
11+
12+
# These values are for the text output colors.
13+
# Each list is [red, green, blue], with every component in the range 0-255.
14+
15+
COLOR_SUCCESS = [0, 255, 0] # Success Color (Green)
16+
COLOR_ERROR = [255, 0, 0] # Error Color (Red)

0 commit comments

Comments
 (0)