 import urlparse
 import threading
 import sqlite3 as sqlite
-from BeautifulSoup import BeautifulSoup
 # Try to import psyco for JIT compilation
 try:
     import psyco
@@ -25,14 +24,14 @@
 else:
     dbname = sys.argv[1]
     starturl = sys.argv[2]
-    crawldepth = sys.argv[3]
+    crawldepth = int(sys.argv[3])


 # Connect to the db and create the tables if they don't already exist
-connection = sqlite.connect(db)
+connection = sqlite.connect(dbname)
 cursor = connection.cursor()
 cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (id INTEGER, parent INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256) )')
-cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256) PRIMARY KEY )')
+cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256))')
 cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TEXT )')
 connection.commit()

@@ -60,7 +59,7 @@

 # set crawling status and stick starting url into the queue
 cursor.execute("INSERT INTO status VALUES ((?), (?))", (1, "datetime('now')"))
-cursor.execute("INSERT INTO queue VALUES ((?), (?), (?)) ", (None, 0, 0, staturl))
+cursor.execute("INSERT INTO queue VALUES ((?), (?), (?), (?)) ", (None, 0, 0, starturl))
 connection.commit()

@@ -80,6 +79,8 @@ def run(self):
             print crawling
         except KeyError:
             raise StopIteration
+        except:
+            pass

         # if theres nothing in the que, then set the status to done and exit
         if crawling == None:
@@ -116,10 +117,11 @@ def crawl(self, crawling):
         # Read response
         msg = response.read()

-        # Create the BS object for parsing the doc
-        soup = BeautifulSoup(msg)
-        # find the title
-        title = soup.find('title', limit=1)
+        startPos = msg.find('<title>')
+        if startPos != -1:
+            endPos = msg.find('</title>', startPos + 7)
+            if endPos != -1:
+                title = msg[startPos + 7:endPos]

         keywordlist = keywordregex.findall(msg)
         if len(keywordlist) > 0:
@@ -128,19 +130,16 @@ def crawl(self, crawling):
             keywordlist = ""
         # Get the links
         links = linkregex.findall(msg)
-        title.replace("'", "\'")
-        keywordlist.replace("'", "\'")
-
         # queue up the links
-        queue_links(links, cid, curdepth)
+        self.queue_links(url, links, cid, curdepth)

         try:
             # Put now crawled link into the db
             cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?), (?), (?) )", (cid, pid, curl, title, keywordlist))
             connection.commit()
         except:
             pass
-    def queue_links(self, links, cid, curdepth):
+    def queue_links(self, url, links, cid, curdepth):
         if curdepth < crawldepth:
             # Read the links and inser them into the queue
             for link in (links.pop(0) for _ in xrange(len(links))):