import urlparse
import threading
import sqlite3 as sqlite
+ # Try to import psyco for JIT compilation
try:
    import psyco
    psyco.full()
except ImportError:
    print "Continuing without psyco JIT compilation!"

+ # Connect to the db and create the tables if they don't already exist
connection = sqlite.connect('crawl.db')
cursor = connection.cursor()
cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index ( url VARCHAR(256) PRIMARY KEY, title VARCHAR(256), keywords VARCHAR(256) )')
cursor.execute('CREATE TABLE IF NOT EXISTS queue ( url VARCHAR(256) PRIMARY KEY )')
connection.commit()

+ # Check for a start point
if len(sys.argv) < 2:
    print "No starting point! Checking existing queue"
    cursor.execute("SELECT * FROM queue LIMIT 1")
    c = cursor.fetchone()
    if c is None:
        sys.exit("ERROR: No start point! Exiting")
- try:
-     if sys.argv[1]:
-         cursor.execute("INSERT INTO queue VALUES ( (?) )", (sys.argv[1],))
-         connection.commit()
- except:
-     pass
+ else:
+     try:
+         if sys.argv[1]:
+             cursor.execute("INSERT INTO queue VALUES ( (?) )", (sys.argv[1],))
+             connection.commit()
+     except:
+         pass
+
+ # Compile keyword and link regex expressions
keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
linkregex = re.compile('<a\s*href=[\'|"](.*?)[\'"].*?>')
crawled = []
+
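As a quick, standalone illustration of what these two patterns are meant to pull out of a page (the HTML snippet, the Demo title and example.com are invented for the demo, not taken from the crawler):

import re

keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
linkregex = re.compile('<a\s*href=[\'|"](.*?)[\'"].*?>')

# Toy HTML, made up purely for this sketch
sample = ('<head><title>Demo</title>'
          '<meta name="keywords" content="python, crawler" />'
          '</head>'
          '<a href="/about">About</a>'
          '<a href="http://example.com/">Home</a>')

print keywordregex.findall(sample)   # ['python, crawler']
print linkregex.findall(sample)      # ['/about', 'http://example.com/']

Note that the keyword pattern only matches self-closing meta tags written exactly in this name=... content=... /> order, so pages that format the tag differently will yield no keywords.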
class threader(threading.Thread):
+     # Main run method: pull the next URL off the queue and crawl it
    def run(self):
        while 1:
            try:
+                 # Get the first item from the queue
                cursor.execute("SELECT * FROM queue LIMIT 1")
                crawling = cursor.fetchone()
                crawling = crawling[0]
+                 # Remove the item from the queue
                cursor.execute("DELETE FROM queue WHERE url = (?)", (crawling,))
                connection.commit()
                print crawling
            except TypeError:
                raise StopIteration
+             # Crawl the link
            self.crawl(crawling)

    def crawl(self, crawling):
+         # Split the link into its sections
        url = urlparse.urlparse(crawling)
        try:
+             # Add the link to the already crawled list
            crawled.append(crawling)
        except MemoryError:
+             # If the crawled list gets too big, delete it and start over
            del crawled[:]
        try:
+             # Load the link
            response = urllib2.urlopen(crawling)
        except:
+             # If it doesn't load, kill the function
            return
+         # Read the response
        msg = response.read()
+         # Find the title of the page
        startPos = msg.find('<title>')
        if startPos != -1:
            endPos = msg.find('</title>', startPos + 7)
            if endPos != -1:
                title = msg[startPos + 7:endPos]
+         # Get the keywords
        keywordlist = keywordregex.findall(msg)
        if len(keywordlist) > 0:
            keywordlist = keywordlist[0]
        else:
            keywordlist = ""
+         # Get the links
        links = linkregex.findall(msg)
        title = title.replace("'", "\\'")
        keywordlist = keywordlist.replace("'", "\\'")

+         # Queue up the links
+         self.queue_links(links, url)
+
+         try:
+             # Put the now-crawled link into the db
+             cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?) )", (crawling, title, keywordlist))
+             connection.commit()
+         except:
+             pass
+     def queue_links(self, links, url):
+         # Read the links and insert them into the queue
        for link in (links.pop(0) for _ in xrange(len(links))):
            if link.startswith('/'):
                link = 'http://' + url[1] + link
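The absolutization above relies on urlparse.urlparse() returning a 6-tuple whose second field (index 1) is the network location, i.e. the host. A minimal sketch of that behaviour; the example.com URLs are invented for illustration:

import urlparse

url = urlparse.urlparse('http://example.com/some/page.html')
print url[1]   # 'example.com' -- the netloc field

# Rebuilding a root-relative link the same way the loop above does
link = '/about'
if link.startswith('/'):
    link = 'http://' + url[1] + link
print link     # 'http://example.com/about'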
@@ -83,10 +114,6 @@ def crawl(self, crawling):
                    connection.commit()
                except:
                    continue
-         try:
-             cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?) )", (crawling, title, keywordlist))
-             connection.commit()
-         except:
-             pass
if __name__ == '__main__':
+     # Run main loop
    threader().run()
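The queue table doubles as the crawler's shared work list: each worker grabs one row with SELECT ... LIMIT 1 and then deletes it, as in run() above. A minimal, self-contained sketch of that pop pattern against an in-memory database (same single-column schema; the seed URLs are made up for the demo):

import sqlite3 as sqlite

# In-memory stand-in for crawl.db
connection = sqlite.connect(':memory:')
cursor = connection.cursor()
cursor.execute('CREATE TABLE IF NOT EXISTS queue ( url VARCHAR(256) PRIMARY KEY )')
for u in ('http://example.com/', 'http://example.com/about'):
    cursor.execute("INSERT INTO queue VALUES ( (?) )", (u,))
connection.commit()

# Same pop pattern as threader.run(): grab one row, then delete it
cursor.execute("SELECT * FROM queue LIMIT 1")
row = cursor.fetchone()
if row is not None:
    crawling = row[0]
    cursor.execute("DELETE FROM queue WHERE url = (?)", (crawling,))
    connection.commit()
    print crawling   # one of the seeded URLs; SELECT without ORDER BY gives no guaranteed order

To seed the real crawl.db, the script is started with a URL in sys.argv[1], e.g. python crawler.py http://example.com/ (the file name crawler.py and the URL here are assumptions for the example).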