dbname = sys.argv[1]
starturl = sys.argv[2]
crawldepth = int(sys.argv[3])
-
+if len(sys.argv) == 5:
+    if sys.argv[4].upper() == "TRUE":
+        verbose = True
+    else:
+        verbose = False
+else:
+    verbose = False
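For context, here is a hypothetical invocation and an equivalent, more compact way to derive the verbose flag; the script name, database name, url, and depth values are assumptions, not taken from the commit.

# Hypothetical invocation:
#   python PyCrawler.py crawl.db http://example.com 3 TRUE
import sys

verbose = len(sys.argv) == 5 and sys.argv[4].upper() == "TRUE"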
# urlparse the start url
surlparsed = urlparse.urlparse(starturl)

# Connect to the db and create the tables if they don't already exist
connection = sqlite.connect(dbname)
cursor = connection.cursor()
+# crawl_index: holds all the information of the urls that have been crawled
cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (crawlid INTEGER, parentid INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256) )')
+# queue: urls waiting to be crawled, with their parent id and depth
cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256))')
+# status: contains a record of when crawling was started and stopped.
+# Mostly in place for a future application to watch the crawl interactively.
cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TEXT )')
connection.commit()

# insert starting url into queue
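The statement that actually seeds the queue is not visible in this excerpt. Under the queue schema created above, a minimal sketch of what it could look like; the column values here are assumptions, not taken from the commit.

# Hypothetical seed row: no parent (0), depth 0, and the start url.
cursor.execute("INSERT INTO queue (parent, depth, url) VALUES (?, ?, ?)",
               (0, 0, starturl))
connection.commit()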

class threader(threading.Thread):
-    # Main run method to run
+    """
+    run()
+    Args:
+        none
+    The run() method contains the main loop of the program. Each iteration takes the url
+    at the top of the queue and starts the crawl of it.
+    """
    def run(self):
        while 1:
            try:
@@ -61,7 +77,8 @@ def run(self):
                # Remove the item from the queue
                cursor.execute("DELETE FROM queue WHERE id = (?)", (crawling[0], ))
                connection.commit()
-                print crawling
+                if verbose:
+                    print crawling
            except KeyError:
                raise StopIteration
            except:
@@ -75,7 +92,14 @@ def run(self):
            # Crawl the link
            self.crawl(crawling)

-
+    """
+    crawl()
+    Args:
+        crawling: the queue record (id, parent, depth, url) of the page to crawl
+
+    crawl() opens the page at the "crawling" url, parses it and puts it into the database.
+    It looks for the page title, keywords, and links.
+    """
    def crawl(self, crawling):
        # crawler id
        cid = crawling[0]
@@ -94,9 +118,11 @@ def crawl(self, crawling):
            # If the crawled array is too big, delete it and start over
            del crawled[:]
        try:
-            # Load the link
+            # Create a Request object
            request = urllib2.Request(curl)
+            # Add user-agent header to the request
            request.add_header("User-Agent", "PyCrawler")
+            # Build the url opener, open the link and read it into response
            opener = urllib2.build_opener()
            response = opener.open(request).read()
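As a standalone illustration of the Request/opener pattern the new comments describe, here is a minimal sketch with a timeout and basic error handling; the timeout value and the error handling are assumptions, not part of this commit (the timeout argument requires Python 2.6+).

import urllib2

def fetch(url):
    # Build a request that identifies the crawler, mirroring the pattern above.
    request = urllib2.Request(url)
    request.add_header("User-Agent", "PyCrawler")
    opener = urllib2.build_opener()
    try:
        # 10-second timeout is an assumption.
        return opener.open(request, timeout=10).read()
    except (urllib2.HTTPError, urllib2.URLError):
        return None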
@@ -129,6 +155,8 @@ def crawl(self, crawling):
            connection.commit()
        except:
            pass
+
+
    def queue_links(self, url, links, cid, curdepth):
        if curdepth < crawldepth:
            # Read the links and insert them into the queue
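The body of queue_links is largely outside this hunk. Under the queue schema above, enqueueing a discovered link could look roughly like the sketch below; the exact loop and any filtering are assumptions, not taken from the commit.

# Hypothetical: record the current page as parent and increase the depth by one.
for link in links:
    cursor.execute("INSERT INTO queue (parent, depth, url) VALUES (?, ?, ?)",
                   (cid, curdepth + 1, link))
connection.commit()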