import urlparse
import threading
import sqlite3 as sqlite
+ import robotparser
# Try to import psyco for JIT compilation
try:
    import psyco
@@ -16,6 +17,7 @@
1) database file name
2) start url
3) crawl depth
+ 4) verbose (optional)
Start out by checking to see if the args are there and
set them to their variables
"""
@@ -26,7 +28,7 @@
    starturl = sys.argv[2]
    crawldepth = int(sys.argv[3])
    if len(sys.argv) == 5:
-         if (sys.argv[4].uppercase == "TRUE"):
+         if (sys.argv[4].upper() == "TRUE"):
            verbose = True
        else:
            verbose = False
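
For reference, the four arguments listed in the docstring map onto sys.argv as shown above. A hypothetical invocation (the script and database file names are placeholders, not taken from this commit) might look like:

    python PyCrawler.py crawler.db http://example.com 3 true

The optional fourth argument is now compared case-insensitively, since the commit replaces the nonexistent string attribute .uppercase with the .upper() method.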
@@ -61,6 +63,10 @@
# insert starting url into queue

class threader(threading.Thread):
+
+     # Parser for robots.txt that helps determine if we are allowed to fetch a url
+     rp = robotparser.RobotFileParser()
+
    """
    run()
    Args:
@@ -78,7 +84,7 @@ def run(self):
                cursor.execute("DELETE FROM queue WHERE id = (?)", (crawling[0], ))
                connection.commit()
                if verbose:
-                     print crawling
+                     print crawling[3]
            except KeyError:
                raise StopIteration
            except:
@@ -111,6 +117,20 @@ def crawl(self, crawling):
        curl = crawling[3]
        # Split the link into its sections
        url = urlparse.urlparse(curl)
+
+         try:
+             # Have our robot parser grab the robots.txt file and read it
+             self.rp.set_url('http://' + url[1] + '/robots.txt')
+             self.rp.read()
+
+             # If we're not allowed to open a url, return the function to skip it
+             if not self.rp.can_fetch('PyCrawler', curl):
+                 if verbose:
+                     print curl + " not allowed by robots.txt"
+                 return
+         except:
+             pass
+
        try:
            # Add the link to the already crawled list
            crawled.append(curl)
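
As an aside, the robotparser usage added in this hunk can be exercised on its own; the example below is a minimal sketch with placeholder URLs, and only the 'PyCrawler' user-agent string comes from the commit:

    import robotparser

    rp = robotparser.RobotFileParser()
    rp.set_url('http://example.com/robots.txt')
    rp.read()

    # can_fetch() is False when robots.txt disallows this user agent for the url
    if not rp.can_fetch('PyCrawler', 'http://example.com/some/page'):
        print "not allowed by robots.txt"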
@@ -122,15 +142,13 @@ def crawl(self, crawling):
            request = urllib2.Request(curl)
            # Add user-agent header to the request
            request.add_header("User-Agent", "PyCrawler")
-             # Build the url opener, open the link and read it into response
+             # Build the url opener, open the link and read it into msg
            opener = urllib2.build_opener()
-             response = opener.open(request).read()
+             msg = opener.open(request).read()

        except:
            # If it doesn't load, skip this url
            return
-         # Read response
-         msg = response.read()

        # Find what's between the title tags
        startPos = msg.find('<title>')
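
The fetch in this hunk can be reproduced in isolation; in the sketch below the URL is a placeholder, and everything past the find('<title>') call is an assumption about the surrounding code rather than part of the hunk:

    import urllib2

    request = urllib2.Request('http://example.com/')
    request.add_header("User-Agent", "PyCrawler")
    opener = urllib2.build_opener()
    msg = opener.open(request).read()  # page body as a string

    startPos = msg.find('<title>')
    if startPos != -1:
        print msg[startPos + 7:msg.find('</title>', startPos)]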
@@ -161,10 +179,14 @@ def queue_links(self, url, links, cid, curdepth):
        if curdepth < crawldepth:
            # Read the links and insert them into the queue
            for link in links:
+                 cursor.execute("SELECT url FROM queue WHERE url=?", [link])
+                 for row in cursor:
+                     if row[0].decode('utf-8') == url:
+                         continue
                if link.startswith('/'):
                    link = 'http://' + url[1] + link
                elif link.startswith('#'):
-                     link = 'http://' + url[1] + url[2] + link
+                     continue
                elif not link.startswith('http'):
                    link = 'http://' + url[1] + '/' + link
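
A toy sketch of the link-normalization branches above; the normalize() helper and the sample values are illustrative only and do not appear in the commit:

    def normalize(link, host):
        if link.startswith('/'):
            return 'http://' + host + link        # root-relative path
        elif link.startswith('#'):
            return None                           # same-page anchor: now skipped
        elif not link.startswith('http'):
            return 'http://' + host + '/' + link  # relative path
        return link                               # already absolute

    print normalize('/about.html', 'example.com')    # http://example.com/about.html
    print normalize('#top', 'example.com')           # None
    print normalize('docs/faq.html', 'example.com')  # http://example.com/docs/faq.html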