Commit f24055b

Added more comments. Added an optional fourth argument, 'verbose', to specify whether the urls should be printed as they are crawled.
1 parent 848ce4a commit f24055b

File tree

1 file changed: +33 −5 lines


PyCrawler.py

Lines changed: 33 additions & 5 deletions
@@ -25,15 +25,25 @@
 dbname = sys.argv[1]
 starturl = sys.argv[2]
 crawldepth = int(sys.argv[3])
-
+if len(sys.argv) == 5:
+    if (sys.argv[4].upper() == "TRUE"):
+        verbose = True
+    else:
+        verbose = False
+else:
+    verbose = False
 # urlparse the start url
 surlparsed = urlparse.urlparse(starturl)
 
 # Connect to the db and create the tables if they don't already exist
 connection = sqlite.connect(dbname)
 cursor = connection.cursor()
+# crawl_index: holds all the information of the urls that have been crawled
 cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (crawlid INTEGER, parentid INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256) )')
+# queue: holds the urls that are waiting to be crawled
 cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256))')
+# status: Contains a record of when crawling was started and stopped.
+# Mostly in place for a future application to watch the crawl interactively.
 cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TEXT )')
 connection.commit()

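With the new optional fourth argument, a verbose run would be started with something like "python PyCrawler.py crawl.db http://example.com 3 true"; the database filename, start url, and depth there are placeholder values, and any fourth argument whose upper-cased form is not "TRUE" leaves verbose off. Once a crawl has finished, the crawl_index table created above can be read back with a few lines of standalone Python. This is only an illustrative sketch using the stdlib sqlite3 module and a placeholder database name, not code from the commit:

    import sqlite3

    connection = sqlite3.connect("crawl.db")   # placeholder database name
    cursor = connection.cursor()
    # Dump every crawled page with its id, parent id, url and title
    cursor.execute("SELECT crawlid, parentid, url, title FROM crawl_index")
    for crawlid, parentid, url, title in cursor.fetchall():
        print crawlid, parentid, url, title
    connection.close()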
@@ -51,7 +61,13 @@
 # insert starting url into queue
 
 class threader ( threading.Thread ):
-    # Main run method to run
+    """
+    run()
+    Args:
+        none
+    the run() method contains the main loop of the program. Each iteration takes the url
+    at the top of the queue and starts the crawl of it.
+    """
     def run(self):
         while 1:
             try:
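The docstring added above describes run() as the program's main loop; threader itself is a plain threading.Thread subclass, so a worker would be started the usual way. How many threads PyCrawler actually launches is not visible in this diff, so the lines below are an assumption rather than the commit's own start-up code:

    # Hypothetical start-up: one worker thread draining the queue
    t = threader()
    t.start()
    t.join()   # wait until run() returns, i.e. the queue is exhausted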
@@ -61,7 +77,8 @@ def run(self):
                 # Remove the item from the queue
                 cursor.execute("DELETE FROM queue WHERE id = (?)", (crawling[0], ))
                 connection.commit()
-                print crawling
+                if verbose:
+                    print crawling
             except KeyError:
                 raise StopIteration
             except:
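This hunk shows only the DELETE half of the queue handling; the SELECT that fills the crawling tuple lies outside the diff. A rough sketch of the pop pattern implied by the queue schema and the DELETE above might look like this (the SELECT statement and its ordering are assumptions):

    # Assumed queue pop: fetch the oldest queued row, then delete it by id
    cursor.execute("SELECT id, parent, depth, url FROM queue ORDER BY id LIMIT 1")
    crawling = cursor.fetchone()
    if crawling is None:
        raise KeyError("queue is empty")   # stand-in for the KeyError handled above
    cursor.execute("DELETE FROM queue WHERE id = (?)", (crawling[0], ))
    connection.commit()
    if verbose:
        print crawling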
@@ -75,7 +92,14 @@ def run(self):
             # Crawl the link
             self.crawl(crawling)
 
-
+    """
+    crawl()
+    Args:
+        crawling: this should be a url
+
+    crawl() opens the page at the "crawling" url, parses it and puts it into the database.
+    It looks for the page title, keywords, and links.
+    """
     def crawl(self, crawling):
         # crawler id
         cid = crawling[0]
@@ -94,9 +118,11 @@ def crawl(self, crawling):
             # If the crawled array is too big, delete it and start over
             del crawled[:]
         try:
-            # Load the link
+            # Create a Request object
             request = urllib2.Request(curl)
+            # Add user-agent header to the request
             request.add_header("User-Agent", "PyCrawler")
+            # Build the url opener, open the link and read it into response
             opener = urllib2.build_opener()
             response = opener.open(request).read()

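The new comments spell out the three urllib2 steps: build the Request, attach a User-Agent header, then open it through an opener. The same pattern works as a standalone Python 2 snippet; the url below is a placeholder, and the explicit URLError handling is added for illustration rather than mirroring the crawler's bare except:

    import urllib2

    curl = "http://example.com"                      # placeholder url
    request = urllib2.Request(curl)
    request.add_header("User-Agent", "PyCrawler")    # identify the crawler to servers
    opener = urllib2.build_opener()
    try:
        response = opener.open(request).read()
        print len(response), "bytes fetched from", curl
    except urllib2.URLError, e:
        print "could not fetch", curl, "-", e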
@@ -129,6 +155,8 @@ def crawl(self, crawling):
             connection.commit()
         except:
             pass
+
+
     def queue_links(self, url, links, cid, curdepth):
         if curdepth < crawldepth:
             # Read the links and insert them into the queue
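The body of queue_links() beyond this depth check is not part of the diff, but given the queue schema created earlier (id, parent, depth, url), the insert it performs presumably resembles the sketch below; the exact column handling is an assumption:

    # Hypothetical loop inside queue_links(): each discovered link becomes
    # a new queue row, one level deeper than the page it was found on
    for link in links:
        cursor.execute("INSERT INTO queue (parent, depth, url) VALUES ((?), (?), (?))",
                       (cid, curdepth + 1, link))
    connection.commit()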

0 commit comments