
Commit 3017da5

Added robots.txt compliance and some small fixes.
1 parent f24055b commit 3017da5

1 file changed (+29, -7)


PyCrawler.py

Lines changed: 29 additions & 7 deletions
@@ -4,6 +4,7 @@
 import urlparse
 import threading
 import sqlite3 as sqlite
+import robotparser
 # Try to import psyco for JIT compilation
 try:
     import psyco
@@ -16,6 +17,7 @@
 1) database file name
 2) start url
 3) crawl depth
+4) verbose (optional)
 Start out by checking to see if the args are there and
 set them to their variables
 """
@@ -26,7 +28,7 @@
 starturl = sys.argv[2]
 crawldepth = int(sys.argv[3])
 if len(sys.argv) == 5:
-    if (sys.argv[4].uppercase == "TRUE"):
+    if (sys.argv[4].upper() == "TRUE"):
         verbose = True
     else:
         verbose = False
@@ -61,6 +63,10 @@
 # insert starting url into queue
 
 class threader ( threading.Thread ):
+
+    # Parser for robots.txt that helps determine if we are allowed to fetch a url
+    rp = robotparser.RobotFileParser()
+
     """
     run()
     Args:
@@ -78,7 +84,7 @@ def run(self):
             cursor.execute("DELETE FROM queue WHERE id = (?)", (crawling[0], ))
             connection.commit()
             if verbose:
-                print crawling
+                print crawling[3]
         except KeyError:
             raise StopIteration
         except:
@@ -111,6 +117,20 @@ def crawl(self, crawling):
         curl = crawling[3]
         # Split the link into its sections
         url = urlparse.urlparse(curl)
+
+        try:
+            # Have our robot parser grab the robots.txt file and read it
+            self.rp.set_url('http://' + url[1] + '/robots.txt')
+            self.rp.read()
+
+            # If we're not allowed to open a url, return the function to skip it
+            if not self.rp.can_fetch('PyCrawler', curl):
+                if verbose:
+                    print curl + " not allowed by robots.txt"
+                return
+        except:
+            pass
+
         try:
             # Add the link to the already crawled list
             crawled.append(curl)
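
The hunk above is the core of the new robots.txt compliance: before fetching, the thread's shared RobotFileParser is pointed at /robots.txt on the link's host, and the URL is skipped when can_fetch denies it for the PyCrawler user agent. A minimal standalone sketch of the same pattern with the Python 2 stdlib robotparser module (the example.com URL is a placeholder, not something from the commit):

import robotparser
import urlparse

def allowed(useragent, curl):
    # Point the parser at robots.txt on curl's host and download it
    rp = robotparser.RobotFileParser()
    host = urlparse.urlparse(curl)[1]
    rp.set_url('http://' + host + '/robots.txt')
    rp.read()
    # True when robots.txt permits this user agent to fetch curl
    return rp.can_fetch(useragent, curl)

if not allowed('PyCrawler', 'http://example.com/some/page.html'):
    print "skipping: disallowed by robots.txt"

Note that the commit wraps the whole check in a bare try/except, so a missing or unreadable robots.txt never blocks the crawl.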
@@ -122,15 +142,13 @@ def crawl(self, crawling):
             request = urllib2.Request(curl)
             # Add user-agent header to the request
             request.add_header("User-Agent", "PyCrawler")
-            # Build the url opener, open the link and read it into response
+            # Build the url opener, open the link and read it into msg
             opener = urllib2.build_opener()
-            response = opener.open(request).read()
+            msg = opener.open(request).read()
 
         except:
             # If it doesn't load, skip this url
             return
-        # Read response
-        msg = response.read()
 
         # Find what's between the title tags
         startPos = msg.find('<title>')
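
The hunk above also removes a latent bug: opener.open(request).read() already returns the page body as a string, so the deleted follow-up msg = response.read() was calling read() on a string. A sketch of the corrected fetch path with the same PyCrawler User-Agent header (the fetch_page wrapper name is illustrative, not part of the commit):

import urllib2

def fetch_page(curl):
    # Build the request and identify the crawler with a User-Agent header
    request = urllib2.Request(curl)
    request.add_header("User-Agent", "PyCrawler")
    opener = urllib2.build_opener()
    try:
        # open() returns a file-like response; read() consumes the body once
        return opener.open(request).read()
    except:
        # If it doesn't load, let the caller skip this url
        return None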
@@ -161,10 +179,14 @@ def queue_links(self, url, links, cid, curdepth):
         if curdepth < crawldepth:
             # Read the links and inser them into the queue
             for link in links:
+                cursor.execute("SELECT url FROM queue WHERE url=?", [link])
+                for row in cursor:
+                    if row[0].decode('utf-8') == url:
+                        continue
                 if link.startswith('/'):
                     link = 'http://' + url[1] + link
                 elif link.startswith('#'):
-                    link = 'http://' + url[1] + url[2] + link
+                    continue
                 elif not link.startswith('http'):
                     link = 'http://' + url[1] + '/' + link
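
The hunk above makes queue_links run a parameterized SELECT against the queue table before handling each link, and drops in-page anchors ('#' links) instead of re-queueing them. A sketch of the check-before-insert idea with sqlite3, assuming a simplified queue table that only stores an id and a url (the real queue table has more columns than shown here):

import sqlite3 as sqlite

connection = sqlite.connect(':memory:')
cursor = connection.cursor()
cursor.execute("CREATE TABLE queue (id INTEGER PRIMARY KEY, url TEXT)")

def enqueue(link):
    # Parameterized lookup; insert only when the link is not already queued
    cursor.execute("SELECT url FROM queue WHERE url=?", [link])
    if cursor.fetchone() is None:
        cursor.execute("INSERT INTO queue (url) VALUES (?)", [link])
        connection.commit()

enqueue('http://example.com/')
enqueue('http://example.com/')  # second call finds the existing row and does nothing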
