
Commit 3ed76c2

Added simple comments
1 parent f930b67 commit 3ed76c2

File tree

1 file changed

+38 -11 lines changed

PyCrawler.py

+38 -11
@@ -4,72 +4,103 @@
 import urlparse
 import threading
 import sqlite3 as sqlite
+# Try to import psyco for JIT compilation
 try:
     import psyco
     psyco.full()
 except ImportError:
     print "Continuing without psyco JIT compilation!"
 
+# Connect to the db and create the tables if they don't already exist
 connection = sqlite.connect('crawl.db')
 cursor = connection.cursor()
 cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index ( url VARCHAR(256) PRIMARY KEY, title VARCHAR(256), keywords VARCHAR(256) )')
 cursor.execute('CREATE TABLE IF NOT EXISTS queue ( url VARCHAR(256) PRIMARY KEY )')
 connection.commit()
 
+# Check for a start point
 if len(argv) < 2:
     print "No starting point! Checking existing queue"
     cursor.execute("SELECT * FROM queue LIMIT 1")
     c = cursor.fetchone()
     if c == None:
         sys.exit("ERROR: No start point! Exiting")
-try:
-    if sys.argv[1]:
-        cursor.execute("INSERT INTO queue VALUES ( (?) )", (sys.argv[1], ))
-        connection.commit()
-except:
-    pass
+else:
+    try:
+        if sys.argv[1]:
+            cursor.execute("INSERT INTO queue VALUES ( (?) )", (sys.argv[1], ))
+            connection.commit()
+    except:
+        pass
+
+# Compile keyword and link regex expressions
 keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
 linkregex = re.compile('<a\s*href=[\'|"](.*?)[\'"].*?>')
 crawled = []
+
 class threader ( threading.Thread ):
+    # Main run method to run
    def run(self):
        while 1:
            try:
+               # Get the first item from the queue
                cursor.execute("SELECT * FROM queue LIMIT 1")
                crawling = cursor.fetchone()
                crawling = crawling[0]
+               # Remove the item from the queue
                cursor.execute("DELETE FROM queue WHERE url = (?)", (crawling, ))
                connection.commit()
                print crawling
            except KeyError:
                raise StopIteration
+           # Crawl the link
            self.crawl(crawling)
 
    def crawl(self, crawling):
+       # Split the link into its sections
        url = urlparse.urlparse(crawling)
        try:
+           # Add the link to the already crawled list
            crawled.append(crawling)
        except MemoryError:
+           # If the crawled array is too big, deleted it and start over
            del crawled[:]
        try:
+           # Load the link
            response = urllib2.urlopen(crawling)
        except:
+           # If it doesn't load, kill the function
            return
+       # Read response
        msg = response.read()
+       # Find the title of the page
        startPos = msg.find('<title>')
        if startPos != -1:
            endPos = msg.find('</title>', startPos+7)
            if endPos != -1:
                title = msg[startPos+7:endPos]
+       # Get the keywords
        keywordlist = keywordregex.findall(msg)
        if len(keywordlist) > 0:
            keywordlist = keywordlist[0]
        else:
            keywordlist = ""
+       # Get the links
        links = linkregex.findall(msg)
        title.replace("'", "\'")
        keywordlist.replace("'", "\'")
 
+       # queue up the links
+       queue_links(links)
+
+       try:
+           # Put now crawled link into the db
+           cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?) )", (crawling, title, keywordlist))
+           connection.commit()
+       except:
+           pass
+   def queue_links(links):
+       # Read the links and inser them into the queue
        for link in (links.pop(0) for _ in xrange(len(links))):
            if link.startswith('/'):
                link = 'http://' + url[1] + link
@@ -83,10 +114,6 @@ def crawl(self, crawling):
                connection.commit()
            except:
                continue
-       try:
-           cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?) )", (crawling, title, keywordlist))
-           connection.commit()
-       except:
-           pass
 if __name__ == '__main__':
+   # Run main loop
    threader().run()
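For reference, a minimal sketch of what the keyword and link patterns commented in this commit actually match. The two regexes are copied verbatim from the diff; the sample HTML string and the expected results in the comments are assumptions added only for illustration, not part of the commit.

import re

# Patterns copied from PyCrawler.py as of this commit
keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
linkregex = re.compile('<a\s*href=[\'|"](.*?)[\'"].*?>')

# Hypothetical page used only to illustrate the matches
sample = ('<html><head><title>Example</title>'
          '<meta name="keywords" content="crawler, python" />'
          '</head><body>'
          '<a href="/about">About</a>'
          '<a href="http://example.com/">Home</a>'
          '</body></html>')

print(keywordregex.findall(sample))  # ['crawler, python']
print(linkregex.findall(sample))     # ['/about', 'http://example.com/']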

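Similarly, a small self-contained sketch of the queue lifecycle that the newly commented run() method relies on: fetch one row, delete it, then crawl it. The SQL statements mirror the diff; the in-memory database and the seed URL are assumptions for illustration.

import sqlite3 as sqlite

# An in-memory database stands in for crawl.db here
connection = sqlite.connect(':memory:')
cursor = connection.cursor()
cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index ( url VARCHAR(256) PRIMARY KEY, title VARCHAR(256), keywords VARCHAR(256) )')
cursor.execute('CREATE TABLE IF NOT EXISTS queue ( url VARCHAR(256) PRIMARY KEY )')
connection.commit()

# Seed the queue the way the script does with sys.argv[1]
cursor.execute("INSERT INTO queue VALUES ( (?) )", ('http://example.com/',))
connection.commit()

# Pop the first queued URL as run() does: fetch one row, then delete it
cursor.execute("SELECT * FROM queue LIMIT 1")
crawling = cursor.fetchone()[0]
cursor.execute("DELETE FROM queue WHERE url = (?)", (crawling,))
connection.commit()
print(crawling)  # http://example.com/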