Skip to content

Commit d4fe619

Browse files
authored
Merge pull request #193 from cdhigh/master
Better phrase detection for CJK languages. Thanks @cdhigh!
2 parents b220919 + 16ce81d commit d4fe619

File tree

2 files changed

+19
-10
lines changed

2 files changed

+19
-10
lines changed

readability/cleaners.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
22
import re
3-
from lxml.html.clean import Cleaner
3+
try:
4+
from lxml.html.clean import Cleaner
5+
except ImportError:
6+
from lxml_html_clean import Cleaner
47

58
bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"]
69
single_quoted = "'[^']+'"

readability/htmls.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -110,29 +110,35 @@ def shorten_title(doc):
110110
if e.text_content():
111111
add_match(candidates, e.text_content(), orig)
112112

113+
cjk = re.compile('[\u4e00-\u9fff]+')
114+
113115
if candidates:
114116
title = sorted(candidates, key=len)[-1]
115117
else:
116118
for delimiter in [" | ", " - ", " :: ", " / "]:
117119
if delimiter in title:
118120
parts = orig.split(delimiter)
119-
if len(parts[0].split()) >= 4:
120-
title = parts[0]
121+
p0 = parts[0]
122+
pl = parts[-1]
123+
if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)):
124+
title = p0
121125
break
122-
elif len(parts[-1].split()) >= 4:
123-
title = parts[-1]
126+
elif (len(pl.split()) >= 4) or (len(pl) >= 4 and cjk.search(pl)):
127+
title = pl
124128
break
125129
else:
126130
if ": " in title:
127-
parts = orig.split(": ")
128-
if len(parts[-1].split()) >= 4:
129-
title = parts[-1]
131+
p1 = orig.split(": ")[-1]
132+
if (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)):
133+
title = p1
130134
else:
131135
title = orig.split(": ", 1)[1]
132136

133-
if not 15 < len(title) < 150:
137+
if cjk.search(title) and not (4 <= len(title) < 100):
134138
return orig
135-
139+
elif not 15 < len(title) < 150:
140+
return orig
141+
136142
return title
137143

138144

0 commit comments

Comments
 (0)