Commit 9a25e59

logic error debugging

1 parent: 2b3f0df

File tree: 1 file changed (+8, -21 lines)

PyCrawler.py

Lines changed: 8 additions & 21 deletions
@@ -12,20 +12,22 @@
     print "Continuing without psyco JIT compilation!"

 """
-The program should take 3 arguments
+The program should take arguments
 1) database file name
 2) start url
 3) crawl depth
 Start out by checking to see if the args are there and
 set them to their variables
 """
-if len(sys.argv) < 4:
+if len(sys.argv) < 5:
     sys.exit("Not enough arguments!")
 else:
     dbname = sys.argv[1]
     starturl = sys.argv[2]
     crawldepth = int(sys.argv[3])

+# urlparse the start url
+surlparsed = urlparse.urlparse(starturl)

 # Connect to the db and create the tables if they don't already exist
 connection = sqlite.connect(dbname)
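For reference, the two added lines lean on Python 2's urlparse module; a minimal sketch of what urlparse.urlparse returns (the URL below is only an illustration, not part of the commit):

    # Python 2 sketch: how urlparse.urlparse splits a start url (illustrative URL).
    import urlparse

    surlparsed = urlparse.urlparse('http://example.com/docs/index.html')
    print surlparsed[0]   # 'http'              (scheme)
    print surlparsed[1]   # 'example.com'       (netloc, used as url[1] in queue_links)
    print surlparsed[2]   # '/docs/index.html'  (path,   used as url[2] in queue_links)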
@@ -35,23 +37,6 @@
 cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TEXT )')
 connection.commit()

-"""
-# Check for a start point
-if len(sys.argv) < 2:
-    print "No starting point! Checking existing queue"
-    cursor.execute("SELECT * FROM queue LIMIT 1")
-    c = cursor.fetchone()
-    if c == None:
-        sys.exit("ERROR: No start point! Exiting")
-else:
-    try:
-        if sys.argv[1]:
-            cursor.execute("INSERT INTO queue VALUES ( (?) )", (sys.argv[1], ))
-            connection.commit()
-    except:
-        pass
-"""
-
 # Compile keyword and link regex expressions
 keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
 linkregex = re.compile('<a\s*href=[\'|"](.*?)[\'"].*?>')
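A quick sketch of what those two compiled patterns pull out of a page; the sample HTML below is invented for illustration:

    # Illustrative use of the keyword and link regexes compiled above (Python 2).
    import re

    keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
    linkregex = re.compile('<a\s*href=[\'|"](.*?)[\'"].*?>')

    sample = '<meta name="keywords" content="spiders, crawling" />' \
             '<a href="/about.html">About</a>'
    print keywordregex.findall(sample)  # ['spiders, crawling']
    print linkregex.findall(sample)     # ['/about.html']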
@@ -112,11 +97,12 @@ def crawl(self, crawling):
             # Load the link
             response = urllib2.urlopen(curl)
         except:
-            # If it doesn't load, kill the function
+            # If it doesn't load, skip this url
             return
         # Read response
         msg = response.read()

+        # Find what's between the title tags
         startPos = msg.find('<title>')
         if startPos != -1:
             endPos = msg.find('</title>', startPos+7)
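The new comment documents the find-based title extraction below it; here is a standalone sketch of the same idea with made-up HTML (the final slice is an assumption about how the offsets are used further down in crawl):

    # Sketch of extracting the text between <title> tags with str.find (Python 2).
    msg = '<html><head><title>Example Page</title></head><body></body></html>'

    startPos = msg.find('<title>')
    if startPos != -1:
        endPos = msg.find('</title>', startPos + 7)   # 7 == len('<title>')
        title = msg[startPos + 7:endPos]              # assumed slice: 'Example Page'
        print title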
@@ -142,13 +128,14 @@ def crawl(self, crawling):
     def queue_links(self, url, links, cid, curdepth):
         if curdepth < crawldepth:
             # Read the links and inser them into the queue
-            for link in (links.pop(0) for _ in xrange(len(links))):
+            for link in links:
                 if link.startswith('/'):
                     link = 'http://' + url[1] + link
                 elif link.startswith('#'):
                     link = 'http://' + url[1] + url[2] + link
                 elif not link.startswith('http'):
                     link = 'http://' + url[1] + '/' + link
+
                 if link.decode('utf-8') not in crawled:
                     try:
                         cursor.execute("INSERT INTO queue VALUES ( (?), (?), (?), (?) )", (None, cid, curdepth+1, link))
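The rewritten loop simply iterates over links instead of popping them one by one; a small sketch of the relative-link handling it performs, where url stands in for an urlparse result and the sample links are invented:

    # Illustrative run of the relative-link handling in queue_links (Python 2).
    import urlparse

    url = urlparse.urlparse('http://example.com/blog/post.html')   # url[1]=netloc, url[2]=path
    links = ['/contact.html', '#comments', 'next.html', 'http://other.org/']

    for link in links:
        if link.startswith('/'):
            link = 'http://' + url[1] + link
        elif link.startswith('#'):
            link = 'http://' + url[1] + url[2] + link
        elif not link.startswith('http'):
            link = 'http://' + url[1] + '/' + link
        print link
    # http://example.com/contact.html
    # http://example.com/blog/post.html#comments
    # http://example.com/next.html
    # http://other.org/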
