    print "Continuing without psyco JIT compilation!"

"""
- The program should take 3 arguments
+ The program should take the following arguments:
1) database file name
2) start url
3) crawl depth
Start out by checking to see if the args are there and
set them to their variables
"""
- if len(sys.argv) < 4:
+ if len(sys.argv) < 5:
    sys.exit("Not enough arguments!")
else:
    dbname = sys.argv[1]
    starturl = sys.argv[2]
    crawldepth = int(sys.argv[3])

+ # urlparse the start url
+ surlparsed = urlparse.urlparse(starturl)

# Connect to the db and create the tables if they don't already exist
connection = sqlite.connect(dbname)
cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TEXT )')
connection.commit()

- """
- # Check for a start point
- if len(sys.argv) < 2:
-     print "No starting point! Checking existing queue"
-     cursor.execute("SELECT * FROM queue LIMIT 1")
-     c = cursor.fetchone()
-     if c == None:
-         sys.exit("ERROR: No start point! Exiting")
- else:
-     try:
-         if sys.argv[1]:
-             cursor.execute("INSERT INTO queue VALUES ( (?) )", (sys.argv[1], ))
-             connection.commit()
-     except:
-         pass
- """
-
# Compile keyword and link regex expressions
keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
linkregex = re.compile('<a\s*href=[\'|"](.*?)[\'"].*?>')
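A minimal sketch (not part of the patched file) of what the compiled linkregex above captures, using a made-up HTML snippet:

import re

linkregex = re.compile('<a\s*href=[\'|"](.*?)[\'"].*?>')
sample = "<p><a href='/about'>About</a> <a href=\"docs/intro.html\">Docs</a></p>"
print linkregex.findall(sample)   # ['/about', 'docs/intro.html']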
@@ -112,11 +97,12 @@ def crawl(self, crawling):
            # Load the link
            response = urllib2.urlopen(curl)
        except:
-           # If it doesn't load, kill the function
+           # If it doesn't load, skip this url
            return
        # Read response
        msg = response.read()

+       # Find what's between the title tags
        startPos = msg.find('<title>')
        if startPos != -1:
            endPos = msg.find('</title>', startPos + 7)
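A small illustration of the find()-based title lookup above; the final slice is an assumption, since the hunk ends before the title text is actually cut out:

msg = '<html><head><title>Example Page</title></head></html>'
startPos = msg.find('<title>')                      # index of '<title>', or -1 if absent
if startPos != -1:
    endPos = msg.find('</title>', startPos + 7)     # startPos + 7 skips past the '<title>' tag itself
    print msg[startPos + 7:endPos]                  # prints: Example Page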
@@ -142,13 +128,14 @@ def crawl(self, crawling):
    def queue_links(self, url, links, cid, curdepth):
        if curdepth < crawldepth:
            # Read the links and insert them into the queue
-           for link in (links.pop(0) for _ in xrange(len(links))):
+           for link in links:
                if link.startswith('/'):
                    link = 'http://' + url[1] + link
                elif link.startswith('#'):
                    link = 'http://' + url[1] + url[2] + link
                elif not link.startswith('http'):
                    link = 'http://' + url[1] + '/' + link
+
                if link.decode('utf-8') not in crawled:
                    try:
                        cursor.execute("INSERT INTO queue VALUES ( (?), (?), (?), (?) )", (None, cid, curdepth + 1, link))
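To illustrate how the three branches above normalise relative links (the url tuple and link values are made up), url is a urlparse result, so url[1] is the netloc and url[2] the path:

url = ('http', 'example.com', '/docs/index.html', '', '', '')
for link in ['/about', '#section2', 'contact.html']:
    if link.startswith('/'):
        link = 'http://' + url[1] + link              # http://example.com/about
    elif link.startswith('#'):
        link = 'http://' + url[1] + url[2] + link     # http://example.com/docs/index.html#section2
    elif not link.startswith('http'):
        link = 'http://' + url[1] + '/' + link        # http://example.com/contact.html
    print link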