@@ -12,16 +12,17 @@ def rankKeywords(text):
         if t in invalid_keywords:
             continue
         if not ranks.has_key(t):
-            print "adding %s" % t
+            # print "adding %s" % t
             ranks[t] = 1
         else:
             ranks[t] += 1
-            print "setting %s to %i" % (t, ranks[t])
+            # print "setting %s to %i" % (t, ranks[t])
     return ranks
 
 def stripPunctuation(text):
     pattern = re.compile(r'[^\w\s]')
     return pattern.sub('', text)
+
 class ContentProcessor:
 
     def __init__(self, url, status, text):
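The loop in this first hunk is a plain frequency tally: every term that is not in invalid_keywords bumps its count in the ranks dict (has_key marks this as Python 2 code). For comparison only, the same tally can be written with collections.Counter; the names below are illustrative, not taken from this repository, and assume the input is already a list of words:

    from collections import Counter

    def rank_keywords(words, invalid_keywords):
        # count every term that is not in the stop list
        return Counter(w for w in words if w not in invalid_keywords)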
@@ -67,38 +68,38 @@ def combineKeywordLists(self):
             for k, v in l.items():
                 if self.keywords.has_key(k):
                     self.keywords[k] += v
-                    print "setting %s to %i" % (k, self.keywords[k])
+                    # print "setting %s to %i" % (k, self.keywords[k])
                 else:
                     self.keywords[k] = v
-                    print "setting %s to %i" % (k, v)
+                    # print "setting %s to %i" % (k, v)
 
     # returns links to queue
     def processBody(self):
         queue = ready_queue(self.url, self.body)
-        print "found %i links to queue" % len(queue)
+        # print "found %i links to queue" % len(queue)
         self.text = stripPunctuation(self.remove_html_tags(self.body))
         if len(self.text) > 5000:
             offset = 0
             i = 0
             l = []
-            print "splitting text"
+            # print "splitting text"
             while True:
                 j = self.findnth(self.text[i:], ' ', 500)
                 offset += j
-                print "SPLIT: 500th space at %i" % j
+                # print "SPLIT: 500th space at %i" % j
                 if j == -1:
-                    print "appending from %i on" % i
+                    # print "appending from %i on" % i
                     l.append(self.text[i:])
                     break
-                print "appending from %i to %i" % (i, j)
+                # print "appending from %i to %i" % (i, j)
                 l.append(self.text[i:j])
                 i = offset + j + 1
-            print "processing with %i threads" % len(l)
+            # print "processing with %i threads" % len(l)
             pool = Pool(processes=(len(l)))
             self.keyword_dicts = pool.map(rankKeywords, l)
-            print "processed, returned %i dicts" % len(self.keyword_dicts)
+            # print "processed, returned %i dicts" % len(self.keyword_dicts)
         else:
-            self.keyword_dicts.append(self.rankKeywords(self.text))
+            self.keyword_dicts.append(rankKeywords(self.text))
         return queue
 
     def processHead(self):
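The one-line fix at the bottom of this hunk, replacing self.rankKeywords(self.text) with rankKeywords(self.text), corrects an AttributeError: rankKeywords is a module-level function (see the first hunk), not a method of ContentProcessor. Keeping it at module level also matters for the long-text branch above, since pool.map can only dispatch a callable that multiprocessing can pickle, and bound methods are not picklable in Python 2. A minimal standalone sketch of the same chunk-and-map pattern (the chunk size and helper names are illustrative, not taken from this repository):

    from multiprocessing import Pool

    def count_words(chunk):
        # module-level so multiprocessing can pickle it for pool.map
        counts = {}
        for word in chunk.split():
            counts[word] = counts.get(word, 0) + 1
        return counts

    def ranked_chunks(text, words_per_chunk=500):
        # regroup the text into fixed-size word chunks, then fan out
        words = text.split()
        chunks = [' '.join(words[i:i + words_per_chunk])
                  for i in range(0, len(words), words_per_chunk)]
        pool = Pool(processes=len(chunks) or 1)
        try:
            return pool.map(count_words, chunks)
        finally:
            pool.close()
            pool.join()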
@@ -117,20 +118,23 @@ def findnth(self, haystack, needle, n):
     # returns the queue from processBody
     def process(self):
         text_lower = self.text.lower()
-        print "Finding title"
+        # print "Finding title"
         self.title = self.text[text_lower.find('<title')+6:text_lower.find('</title>')]
-        print "Found title: %s" % self.title
-        print "Finding head"
+        # print "Found title: %s" % self.title
+        # print "Finding head"
         self.head = self.text[text_lower.find('<head')+5:text_lower.find('</head>')]
-        print "Found head of length %i" % len(self.head)
+        # print "Found head of length %i" % len(self.head)
         self.processHead()
-        print "Finding body"
+        # print "Finding body"
         self.body = self.text[text_lower.find('<body'):text_lower.find('</body>')]
-        print "Found body of length %i" % len(self.body)
+        # print "Found body of length %i" % len(self.body)
         queue = self.processBody()
-        print "combining keyword lists"
+        # print "combining keyword lists"
         self.combineKeywordLists()
         return queue
 
     def getDataDict(self):
+        for k, v in self.keywords.items():
+            if v < 3:
+                del self.keywords[k]
         return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}