- #!/usr/bin/python
- import sys
- import re
- import urllib2
- import urlparse
- import threading
- import sqlite3 as sqlite
- import robotparser
- # Try to import psyco for JIT compilation
+ from query import CrawlerDb
+ from content_processor import ContentProcessor
+ from settings import VERBOSE
+ import sys, urlparse, urllib2

+ # ===== Init stuff =====

- """
- The program should take arguments
- 1) database file name
- 2) start url
- 3) crawl depth
- 4) domains to limit to, regex (optional)
- 5) verbose (optional)
- Start out by checking to see if the args are there and
- set them to their variables
- """
- if len(sys.argv) < 4:
-     sys.exit("Not enough arguments!")
- else:
-     dbname = sys.argv[1]
-     starturl = sys.argv[2]
-     crawldepth = int(sys.argv[3])
-     if len(sys.argv) >= 5:
-         domains = sys.argv[4]
-         if len(sys.argv) == 6:
-             if (sys.argv[5].upper() == "TRUE"):
-                 verbose = True
-             else:
-                 verbose = False
-     else:
-         domains = False
-         verbose = False
- # urlparse the start url
- surlparsed = urlparse.urlparse(starturl)
+ # db init
+ cdb = CrawlerDb()
+ cdb.connect()

- # Connect to the db and create the tables if they don't already exist
- connection = sqlite.connect(dbname)
- cursor = connection.cursor()
- # crawl_index: holds all the information of the urls that have been crawled
- cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (crawlid INTEGER, parentid INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256), status INTEGER )')
- # queue: this should be obvious
- cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256))')
- # status: Contains a record of when crawling was started and stopped.
- # Mostly in place for a future application to watch the crawl interactively.
- cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TEXT )')
- connection.commit()
+ # content processor init
+ processor = ContentProcessor(None, None, None)

- # Compile keyword and link regex expressions
- keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
- linkregex = re.compile('<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>')
- if domains:
-     domainregex = re.compile(domains)
- else:
-     domainregex = False
- crawled = []
+ if len(sys.argv) < 2:
+     print "Error: No start url was passed"
+     sys.exit()

- # set crawling status and stick starting url into the queue
- cursor.execute("INSERT INTO status VALUES ((?), (?))", (1, "datetime('now')"))
- cursor.execute("INSERT INTO queue VALUES ((?), (?), (?), (?))", (None, 0, 0, starturl))
- connection.commit()
+ l = sys.argv[1:]

+ cdb.enqueue(l)

- # insert starting url into queue
+ def crawl():
+     print "starting..."
+     queue_empty = False
+     while True:
+         url = cdb.dequeue()
+         print url
+         if cdb.checkCrawled(url):
+             continue
+         if url is False:
+             queue_empty = True

- class threader(threading.Thread):
- 
-     # Parser for robots.txt that helps determine if we are allowed to fetch a url
-     rp = robotparser.RobotFileParser()
- 
-     """
-     run()
-     Args:
-         none
-     the run() method contains the main loop of the program. Each iteration takes the url
-     at the top of the queue and starts the crawl of it.
-     """
-     def run(self):
-         while 1:
-             try:
-                 # Get the first item from the queue
-                 cursor.execute("SELECT * FROM queue LIMIT 1")
-                 crawling = cursor.fetchone()
-                 # Remove the item from the queue
-                 cursor.execute("DELETE FROM queue WHERE id = (?)", (crawling[0], ))
-                 connection.commit()
-                 if verbose:
-                     print crawling[3]
-             except KeyError:
-                 raise StopIteration
-             except:
-                 pass
- 
-             # if theres nothing in the que, then set the status to done and exit
-             if crawling == None:
-                 cursor.execute("INSERT INTO status VALUES ((?), datetime('now'))", (0,))
-                 connection.commit()
-                 sys.exit("Done!")
-             # Crawl the link
-             self.crawl(crawling)
- 
-     """
-     crawl()
-     Args:
-         crawling: this should be a url
- 
-     crawl() opens the page at the "crawling" url, parses it and puts it into the database.
-     It looks for the page title, keywords, and links.
-     """
-     def crawl(self, crawling):
-         # crawler id
-         cid = crawling[0]
-         # parent id. 0 if start url
-         pid = crawling[1]
-         # current depth
-         curdepth = crawling[2]
-         # crawling url
-         curl = crawling[3]
-         if domainregex and not domainregex.search(curl):
-             return
-         # Split the link into its sections
-         url = urlparse.urlparse(curl)
- 
+         # Get HTTPConnection
+         #connection = httplib.HTTPConnection(parsed_url.netloc)
+         # Make the request
+         #connection.request("GET", parsed_url.path)
+         # Get response
+         #response = connection.getresponse()
+         #data = response.read()
+         status = 0
+         request = None
        try:
-             # Have our robot parser grab the robots.txt file and read it
-             self.rp.set_url('http://' + url[1] + '/robots.txt')
-             self.rp.read()
- 
-             # If we're not allowed to open a url, return the function to skip it
-             if not self.rp.can_fetch('PyCrawler', curl):
-                 if verbose:
-                     print curl + " not allowed by robots.txt"
-                 return
-         except:
-             pass
- 
-         try:
-             # Add the link to the already crawled list
-             crawled.append(curl)
-         except MemoryError:
-             # If the crawled array is too big, deleted it and start over
-             del crawled[:]
-         try:
-             # Create a Request object
-             request = urllib2.Request(curl)
-             # Add user-agent header to the request
-             request.add_header("User-Agent", "PyCrawler")
-             # Build the url opener, open the link and read it into msg
-             opener = urllib2.build_opener()
-             f = opener.open(request)
-             msg = f.read()
-             # put meta data in info
-             info = f.info()
- 
- 
+             request = urllib2.urlopen(str(url))
        except urllib2.URLError, e:
-             # If it doesn't load, skip this url
-             #print e.code
-             try:
-                 cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?), (?), (?), (?) )", (cid, pid, curl, '', '', e.code))
-                 connection.commit
-             except:
-                 pass
+             print e.reason
+         except urllib2.HTTPError, e:
+             status = e.code
+         if status == 0:
+             status = 200
+         data = request.read()

-             return
- 
-         # Find what's between the title tags
-         startPos = msg.find('<title>')
-         if startPos != -1:
-             endPos = msg.find('</title>', startPos+7)
-             if endPos != -1:
-                 title = msg[startPos+7:endPos]
- 
-         # Start keywords list with whats in the keywords meta tag if there is one
-         keywordlist = keywordregex.findall(msg)
-         if len(keywordlist) > 0:
-             keywordlist = keywordlist[0]
-         else:
-             keywordlist = ""
- 
- 
- 
-         # Get the links
-         links = linkregex.findall(msg)
-         # queue up the links
-         self.queue_links(url, links, cid, curdepth)
+         if VERBOSE:
+             print "Got %s status from %s" % (status, url)
+         processor.setInfo(str(url), status, data)
+         add_queue = processor.process()
+         l = len(add_queue)
+         print "Found %i links" % l
+         if l > 0:
+             if queue_empty == True:
+                 queue_empty = False
+             cdb.enqueue(add_queue)
+         cdb.addPage(processor.getDataDict())
+         processor.reset()
+         if queue_empty:
+             break

-         try:
-             # Put now crawled link into the db
-             cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?), (?), (?), (?) )", (cid, pid, curl, title, keywordlist, 200))
-             connection.commit()
-         except:
-             pass
- 
- 
-     def queue_links(self, url, links, cid, curdepth):
-         if curdepth < crawldepth:
-             # Read the links and inser them into the queue
-             for link in links:
-                 cursor.execute("SELECT url FROM queue WHERE url=?", [link])
-                 for row in cursor:
-                     if row[0].decode('utf-8') == url:
-                         continue
-                 if link.startswith('/'):
-                     link = 'http://' + url[1] + link
-                 elif link.startswith('#'):
-                     continue
-                 elif not link.startswith('http'):
-                     link = urlparse.urljoin(url.geturl(), link)
- 
-                 if link.decode('utf-8') not in crawled:
-                     try:
-                         cursor.execute("INSERT INTO queue VALUES ( (?), (?), (?), (?) )", (None, cid, curdepth+1, link))
-                         connection.commit()
-                     except:
-                         continue
-                 else:
-                     pass
- if __name__ == '__main__':
-     try:
-         import psyco
-         psyco.full()
-     except ImportError:
-         print "Continuing without psyco JIT compilation!"
-     # Run main loop
-     threader().run()
+     print "finishing..."
+     cdb.close()
+     print "done! goodbye!"
+ 
+ if __name__ == "__main__":
+     crawl()
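
The new crawl loop leans on two helper modules, query.py (CrawlerDb) and content_processor.py (ContentProcessor), plus a VERBOSE flag from settings.py; none of those files appear in this diff. As a rough guide only, here is a minimal sketch of the interfaces the loop assumes, with method names taken from the calls above and placeholder bodies that are not the project's actual implementation:

# Interface sketch only -- query.py and content_processor.py are not part of
# this diff, so the bodies below are placeholder assumptions, not the real code.

class CrawlerDb(object):
    def connect(self):
        # open the underlying database connection
        pass

    def enqueue(self, urls):
        # add a list of URLs to the crawl queue
        pass

    def dequeue(self):
        # return the next URL to crawl, or False when the queue is empty
        return False

    def checkCrawled(self, url):
        # True if this URL has already been crawled
        return False

    def addPage(self, data):
        # persist one processed page (dict from ContentProcessor.getDataDict())
        pass

    def close(self):
        # flush and close the database connection
        pass

class ContentProcessor(object):
    def __init__(self, url, status, data):
        self.setInfo(url, status, data)

    def setInfo(self, url, status, data):
        # store the fetched page before processing
        self.url, self.status, self.data = url, status, data

    def process(self):
        # parse self.data and return the list of links to enqueue
        return []

    def getDataDict(self):
        # return the processed page as a dict for CrawlerDb.addPage()
        return {}

    def reset(self):
        # clear state so the processor can be reused for the next URL
        self.url = self.status = self.data = None

In particular, crawl() relies on dequeue() returning False once the queue is empty and on process() returning the list of discovered links that get passed straight back to cdb.enqueue().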