
Commit 2b3f0df

Got rid of BeautifulSoup as it caused problems with any page that has malformed HTML. Fixed a few syntax errors.
1 parent c07b5dd commit 2b3f0df

File tree

1 file changed (+13, -14 lines)


PyCrawler.py

Lines changed: 13 additions & 14 deletions
@@ -4,7 +4,6 @@
 import urlparse
 import threading
 import sqlite3 as sqlite
-from BeautifulSoup import BeautifulSoup
 # Try to import psyco for JIT compilation
 try:
     import psyco
@@ -25,14 +24,14 @@
 else:
     dbname = sys.argv[1]
     starturl = sys.argv[2]
-    crawldepth = sys.argv[3]
+    crawldepth = int(sys.argv[3])
 
 
 # Connect to the db and create the tables if they don't already exist
-connection = sqlite.connect(db)
+connection = sqlite.connect(dbname)
 cursor = connection.cursor()
 cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (id INTEGER, parent INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256) )')
-cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256) PRIMARY KEY )')
+cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256))')
 cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TEXT )')
 connection.commit()
 
@@ -60,7 +59,7 @@
 
 # set crawling status and stick starting url into the queue
 cursor.execute("INSERT INTO status VALUES ((?), (?))", (1, "datetime('now')"))
-cursor.execute("INSERT INTO queue VALUES ((?), (?), (?))", (None, 0, 0, staturl))
+cursor.execute("INSERT INTO queue VALUES ((?), (?), (?), (?))", (None, 0, 0, starturl))
 connection.commit()
 
 
@@ -80,6 +79,8 @@ def run(self):
             print crawling
         except KeyError:
             raise StopIteration
+        except:
+            pass
 
         # if theres nothing in the que, then set the status to done and exit
         if crawling == None:
@@ -116,10 +117,11 @@ def crawl(self, crawling):
         # Read response
         msg = response.read()
 
-        # Create the BS object for parsing the doc
-        soup = BeautifulSoup(msg)
-        # find the title
-        title = soup.find('title' limit=1)
+        startPos = msg.find('<title>')
+        if startPos != -1:
+            endPos = msg.find('</title>', startPos+7)
+            if endPos != -1:
+                title = msg[startPos+7:endPos]
 
         keywordlist = keywordregex.findall(msg)
         if len(keywordlist) > 0:
@@ -128,19 +130,16 @@ def crawl(self, crawling):
             keywordlist = ""
         # Get the links
         links = linkregex.findall(msg)
-        title.replace("'", "\'")
-        keywordlist.replace("'", "\'")
-
         # queue up the links
-        queue_links(links, cid, curdepth)
+        self.queue_links(url, links, cid, curdepth)
 
         try:
             # Put now crawled link into the db
             cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?), (?), (?) )", (cid, pid, curl, title, keywordlist))
             connection.commit()
         except:
             pass
-    def queue_links(self, links, cid, curdepth):
+    def queue_links(self, url, links, cid, curdepth):
        if curdepth < crawldepth:
            # Read the links and inser them into the queue
            for link in (links.pop(0) for _ in xrange(len(links))):
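
The new title handling is just a substring scan between <title> and </title>, so broken markup can no longer crash a parser; at worst no title is found. A minimal standalone sketch of that logic (the extract_title helper and the sample strings are illustrative, not part of the commit):

# Minimal sketch of the string-based title extraction introduced here.
# extract_title and the sample documents are illustrative, not from the repo.
def extract_title(msg):
    startPos = msg.find('<title>')
    if startPos != -1:
        endPos = msg.find('</title>', startPos + 7)  # 7 == len('<title>')
        if endPos != -1:
            return msg[startPos + 7:endPos]
    return None  # no complete <title>...</title> pair found

# Malformed markup no longer raises; a missing tag simply yields nothing.
print(extract_title('<html><title>PyCrawler</title><body><p>unclosed'))  # PyCrawler
print(extract_title('<html><title>never closed'))                        # None

As in the diff, nothing is assigned when either tag is missing, which is presumably one reason the later INSERT into crawl_index stays wrapped in a bare try/except.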

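The queue schema fix is also worth calling out: the old CREATE TABLE declared PRIMARY KEY on both id and url, and SQLite allows only one primary key per table, so the statement fails outright. A quick sketch against a throwaway in-memory database (the ':memory:' connection is for illustration only; the crawler uses the database named on the command line):

# Old vs. corrected queue schema, demonstrated on an in-memory database.
import sqlite3 as sqlite

connection = sqlite.connect(':memory:')
cursor = connection.cursor()

try:
    # Old statement: PRIMARY KEY is declared twice, which SQLite rejects.
    cursor.execute('CREATE TABLE queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256) PRIMARY KEY)')
except sqlite.OperationalError as e:
    print(e)

# Corrected statement from the commit: a single primary key on id.
cursor.execute('CREATE TABLE queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256))')
connection.commit()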