
Commit ad8c39f

Complete redo
1 parent c462210 commit ad8c39f

8 files changed (+345 -231 lines)

.gitignore (+2 -1)

@@ -1,2 +1,3 @@
-*.pyc
+*.pyc
 .DS_Store
+*.db

PyCrawler.py (+61 -218)

@@ -1,230 +1,73 @@
-#!/usr/bin/python
-import sys
-import re
-import urllib2
-import urlparse
-import threading
-import sqlite3 as sqlite
-import robotparser
-# Try to import psyco for JIT compilation
+from query import CrawlerDb
+from content_processor import ContentProcessor
+from settings import VERBOSE
+import sys, urlparse, urllib2
 
+# ===== Init stuff =====
 
-"""
-The program should take arguments
-1) database file name
-2) start url
-3) crawl depth
-4) domains to limit to, regex (optional)
-5) verbose (optional)
-Start out by checking to see if the args are there and
-set them to their variables
-"""
-if len(sys.argv) < 4:
-    sys.exit("Not enough arguments!")
-else:
-    dbname = sys.argv[1]
-    starturl = sys.argv[2]
-    crawldepth = int(sys.argv[3])
-    if len(sys.argv) >= 5:
-        domains = sys.argv[4]
-        if len(sys.argv) == 6:
-            if (sys.argv[5].upper() == "TRUE"):
-                verbose = True
-            else:
-                verbose = False
-    else:
-        domains = False
-        verbose = False
-# urlparse the start url
-surlparsed = urlparse.urlparse(starturl)
+# db init
+cdb = CrawlerDb()
+cdb.connect()
 
-# Connect to the db and create the tables if they don't already exist
-connection = sqlite.connect(dbname)
-cursor = connection.cursor()
-# crawl_index: holds all the information of the urls that have been crawled
-cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (crawlid INTEGER, parentid INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256), status INTEGER )')
-# queue: this should be obvious
-cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256))')
-# status: Contains a record of when crawling was started and stopped.
-# Mostly in place for a future application to watch the crawl interactively.
-cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TEXT )')
-connection.commit()
+# content processor init
+processor = ContentProcessor(None, None, None)
 
-# Compile keyword and link regex expressions
-keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
-linkregex = re.compile('<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>')
-if domains:
-    domainregex = re.compile(domains)
-else:
-    domainregex = False
-crawled = []
+if len(sys.argv) < 2:
+    print "Error: No start url was passed"
+    sys.exit()
 
-# set crawling status and stick starting url into the queue
-cursor.execute("INSERT INTO status VALUES ((?), (?))", (1, "datetime('now')"))
-cursor.execute("INSERT INTO queue VALUES ((?), (?), (?), (?))", (None, 0, 0, starturl))
-connection.commit()
+l = sys.argv[1:]
 
+cdb.enqueue(l)
 
-# insert starting url into queue
+def crawl():
+    print "starting..."
+    queue_empty = False
+    while True:
+        url = cdb.dequeue()
+        print url
+        if cdb.checkCrawled(url):
+            continue
+        if url is False:
+            queue_empty = True
 
-class threader ( threading.Thread ):
-
-    # Parser for robots.txt that helps determine if we are allowed to fetch a url
-    rp = robotparser.RobotFileParser()
-
-    """
-    run()
-    Args:
-        none
-    the run() method contains the main loop of the program. Each iteration takes the url
-    at the top of the queue and starts the crawl of it.
-    """
-    def run(self):
-        while 1:
-            try:
-                # Get the first item from the queue
-                cursor.execute("SELECT * FROM queue LIMIT 1")
-                crawling = cursor.fetchone()
-                # Remove the item from the queue
-                cursor.execute("DELETE FROM queue WHERE id = (?)", (crawling[0], ))
-                connection.commit()
-                if verbose:
-                    print crawling[3]
-            except KeyError:
-                raise StopIteration
-            except:
-                pass
-
-            # if theres nothing in the que, then set the status to done and exit
-            if crawling == None:
-                cursor.execute("INSERT INTO status VALUES ((?), datetime('now'))", (0,))
-                connection.commit()
-                sys.exit("Done!")
-            # Crawl the link
-            self.crawl(crawling)
-
-    """
-    crawl()
-    Args:
-        crawling: this should be a url
-
-    crawl() opens the page at the "crawling" url, parses it and puts it into the database.
-    It looks for the page title, keywords, and links.
-    """
-    def crawl(self, crawling):
-        # crawler id
-        cid = crawling[0]
-        # parent id. 0 if start url
-        pid = crawling[1]
-        # current depth
-        curdepth = crawling[2]
-        # crawling urL
-        curl = crawling[3]
-        if domainregex and not domainregex.search(curl):
-            return
-        # Split the link into its sections
-        url = urlparse.urlparse(curl)
-
+        # Get HTTPConnection
+        #connection = httplib.HTTPConnection(parsed_url.netloc)
+        # Make the request
+        #connection.request("GET", parsed_url.path)
+        # Get response
+        #response = connection.getresponse()
+        #data = response.read()
+        status = 0
+        request = None
         try:
-            # Have our robot parser grab the robots.txt file and read it
-            self.rp.set_url('http://' + url[1] + '/robots.txt')
-            self.rp.read()
-
-            # If we're not allowed to open a url, return the function to skip it
-            if not self.rp.can_fetch('PyCrawler', curl):
-                if verbose:
-                    print curl + " not allowed by robots.txt"
-                return
-        except:
-            pass
-
-        try:
-            # Add the link to the already crawled list
-            crawled.append(curl)
-        except MemoryError:
-            # If the crawled array is too big, deleted it and start over
-            del crawled[:]
-        try:
-            # Create a Request object
-            request = urllib2.Request(curl)
-            # Add user-agent header to the request
-            request.add_header("User-Agent", "PyCrawler")
-            # Build the url opener, open the link and read it into msg
-            opener = urllib2.build_opener()
-            f = opener.open(request)
-            msg = f.read()
-            # put meta data in info
-            info = f.info()
-
-
+            request = urllib2.urlopen(str(url))
         except urllib2.URLError, e:
-            # If it doesn't load, skip this url
-            #print e.code
-            try:
-                cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?), (?), (?), (?) )", (cid, pid, curl, '', '', e.code))
-                connection.commit
-            except:
-                pass
+            print e.reason
+        except urllib2.HTTPError, e:
+            status = e.code
+        if status == 0:
+            status = 200
+        data = request.read()
 
-            return
-
-        # Find what's between the title tags
-        startPos = msg.find('<title>')
-        if startPos != -1:
-            endPos = msg.find('</title>', startPos+7)
-            if endPos != -1:
-                title = msg[startPos+7:endPos]
-
-        # Start keywords list with whats in the keywords meta tag if there is one
-        keywordlist = keywordregex.findall(msg)
-        if len(keywordlist) > 0:
-            keywordlist = keywordlist[0]
-        else:
-            keywordlist = ""
-
-
-
-        # Get the links
-        links = linkregex.findall(msg)
-        # queue up the links
-        self.queue_links(url, links, cid, curdepth)
+        if VERBOSE:
+            print "Got %s status from %s" % (status, url)
+        processor.setInfo(str(url), status, data)
+        add_queue = processor.process()
+        l = len(add_queue)
+        print "Found %i links" % l
+        if l > 0:
+            if queue_empty == True:
+                queue_empty = False
+            cdb.enqueue(add_queue)
+        cdb.addPage(processor.getDataDict())
+        processor.reset()
+        if queue_empty:
+            break
 
-        try:
-            # Put now crawled link into the db
-            cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?), (?), (?), (?) )", (cid, pid, curl, title, keywordlist, 200))
-            connection.commit()
-        except:
-            pass
-
-
-    def queue_links(self, url, links, cid, curdepth):
-        if curdepth < crawldepth:
-            # Read the links and inser them into the queue
-            for link in links:
-                cursor.execute("SELECT url FROM queue WHERE url=?", [link])
-                for row in cursor:
-                    if row[0].decode('utf-8') == url:
-                        continue
-                if link.startswith('/'):
-                    link = 'http://' + url[1] + link
-                elif link.startswith('#'):
-                    continue
-                elif not link.startswith('http'):
-                    link = urlparse.urljoin(url.geturl(),link)
-
-                if link.decode('utf-8') not in crawled:
-                    try:
-                        cursor.execute("INSERT INTO queue VALUES ( (?), (?), (?), (?) )", (None, cid, curdepth+1, link))
-                        connection.commit()
-                    except:
-                        continue
-        else:
-            pass
-if __name__ == '__main__':
-    try:
-        import psyco
-        psyco.full()
-    except ImportError:
-        print "Continuing without psyco JIT compilation!"
-    # Run main loop
-    threader().run()
+    print "finishing..."
+    cdb.close()
+    print "done! goodbye!"
+
+if __name__ == "__main__":
+    crawl()
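The rewritten PyCrawler.py delegates queueing and storage to query.CrawlerDb and page parsing to content_processor.ContentProcessor, neither of which is shown in this diff. As a rough guide to the interface the new crawl() loop expects (connect, enqueue, dequeue, checkCrawled, addPage, close), here is a minimal sqlite3-backed sketch; it is an illustrative stand-in, not the query.py added in this commit, and the table layout is assumed.

# Hypothetical stand-in for query.CrawlerDb; the real query.py in this
# commit is not shown here and may differ in schema and behaviour.
import sqlite3

class CrawlerDb(object):
    def connect(self, dbname="crawler.db"):
        # One on-disk database holding both the URL queue and the crawled pages
        self.conn = sqlite3.connect(dbname)
        self.conn.execute("CREATE TABLE IF NOT EXISTS queue (url TEXT UNIQUE)")
        self.conn.execute("CREATE TABLE IF NOT EXISTS crawled "
                          "(url TEXT UNIQUE, status INTEGER, title TEXT)")
        self.conn.commit()

    def enqueue(self, urls):
        # Accepts a list of urls, silently skipping ones already queued
        self.conn.executemany("INSERT OR IGNORE INTO queue (url) VALUES (?)",
                              [(u,) for u in urls])
        self.conn.commit()

    def dequeue(self):
        # Returns the next queued url, or False when the queue is empty
        row = self.conn.execute("SELECT url FROM queue LIMIT 1").fetchone()
        if row is None:
            return False
        self.conn.execute("DELETE FROM queue WHERE url = ?", (row[0],))
        self.conn.commit()
        return row[0]

    def checkCrawled(self, url):
        # True if the url has already been stored as a crawled page
        row = self.conn.execute("SELECT 1 FROM crawled WHERE url = ?",
                                (url,)).fetchone()
        return row is not None

    def addPage(self, data):
        # data is assumed to be a dict like the one getDataDict() returns
        self.conn.execute("INSERT OR IGNORE INTO crawled (url, status, title) "
                          "VALUES (?, ?, ?)",
                          (data.get("url"), data.get("status"), data.get("title")))
        self.conn.commit()

    def close(self):
        self.conn.close()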

README (-12)

This file was deleted.

__init__.py

Whitespace-only changes.

0 commit comments
