Commit 2b28502

Author: jack
Commit message: TakeKeyWordsService
1 parent 04450e8

File tree: 17 files changed, +634 −35 lines changed

build.gradle

Lines changed: 4 additions & 0 deletions
@@ -62,5 +62,9 @@ dependencies {
     // https://mvnrepository.com/artifact/com.alibaba/fastjson
     compile group: 'com.alibaba', name: 'fastjson', version: '1.2.47'

+    // https://mvnrepository.com/artifact/org.htmlparser/htmlparser
+    compile group: 'org.htmlparser', name: 'htmlparser', version: '2.1'
+    // https://mvnrepository.com/artifact/com.hankcs/hanlp
+    compile group: 'com.hankcs', name: 'hanlp', version: 'portable-1.6.4'
 }
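The two new dependencies back the classes added below: htmlparser powers HtmlUtil's tag stripping, and hanlp supplies the segmenter and stop-word dictionary used by TextRankKeyword. A minimal smoke test, not part of the commit (the class name is made up), to confirm both resolve:

import com.hankcs.hanlp.HanLP;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;

public class DependencyCheck {
    public static void main(String[] args) throws Exception {
        // hanlp: segment a short Chinese phrase into terms
        System.out.println(HanLP.segment("商品和服务"));

        // htmlparser: strip tags from a small fragment via StringBean
        StringBean bean = new StringBean();
        Parser parser = Parser.createParser("<p>hello <b>world</b></p>", "utf-8");
        parser.visitAllNodesWith(bean);
        System.out.println(bean.getStrings()); // hello world
    }
}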
BM25.java (new file)

Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@

package com.light.saber.textrank;

import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/**
 * BM25 search-relevance scoring.
 *
 * @author hankcs
 */
public class BM25 {
    /**
     * Tuning parameter k1 (term-frequency saturation)
     */
    final static float k1 = 1.5f;
    /**
     * Tuning parameter b (length normalization)
     */
    final static float b = 0.75f;
    /**
     * Number of sentences in the document
     */
    int D;
    /**
     * Average sentence length of the document
     */
    double avgdl;
    /**
     * The document, split into [sentence[word]] form
     */
    List<List<String>> docs;
    /**
     * Per-sentence term frequencies: word -> count within that sentence
     */
    Map<String, Integer>[] f;
    /**
     * Document frequency: word -> number of sentences containing it
     */
    Map<String, Integer> df;
    /**
     * IDF of every word
     */
    Map<String, Double> idf;

    public BM25(List<List<String>> docs) {
        this.docs = docs;
        D = docs.size();
        for (List<String> sentence : docs) {
            avgdl += sentence.size();
        }
        avgdl /= D;
        f = new Map[D];
        df = new TreeMap<String, Integer>();
        idf = new TreeMap<String, Double>();
        init();
    }

    /**
     * Initializes all parameters at construction time.
     */
    private void init() {
        int index = 0;
        for (List<String> sentence : docs) {
            Map<String, Integer> tf = new TreeMap<String, Integer>();
            for (String word : sentence) {
                Integer freq = tf.get(word);
                freq = (freq == null ? 0 : freq) + 1;
                tf.put(word, freq);
            }
            f[index] = tf;
            for (Map.Entry<String, Integer> entry : tf.entrySet()) {
                String word = entry.getKey();
                Integer freq = df.get(word);
                freq = (freq == null ? 0 : freq) + 1;
                df.put(word, freq);
            }
            ++index;
        }
        for (Map.Entry<String, Integer> entry : df.entrySet()) {
            String word = entry.getKey();
            Integer freq = entry.getValue();
            // IDF(word) = log((D - n + 0.5) / (n + 0.5)), n = sentences containing the word
            idf.put(word, Math.log(D - freq + 0.5) - Math.log(freq + 0.5));
        }
    }

    public double sim(List<String> sentence, int index) {
        double score = 0;
        for (String word : sentence) {
            if (!f[index].containsKey(word)) continue;
            int d = docs.get(index).size();
            Integer wf = f[index].get(word);
            // Classic BM25 term: IDF * tf * (k1 + 1) / (tf + k1 * (1 - b + b * d / avgdl))
            score += (idf.get(word) * wf * (k1 + 1)
                    / (wf + k1 * (1 - b + b * d / avgdl)));
        }

        return score;
    }

    public double[] simAll(List<String> sentence) {
        double[] scores = new double[D];
        for (int i = 0; i < D; ++i) {
            scores[i] = sim(sentence, i);
        }
        return scores;
    }
}
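A minimal usage sketch, not part of the commit (the demo class name and tokens are illustrative): build a BM25 model from a document pre-split into tokenized sentences, then score a query sentence against each of them.

package com.light.saber.textrank; // same package as BM25

import java.util.Arrays;
import java.util.List;

public class BM25Demo {
    public static void main(String[] args) {
        // The "document" in [sentence[word]] form: three tokenized sentences
        List<List<String>> docs = Arrays.asList(
                Arrays.asList("机器", "学习", "算法"),
                Arrays.asList("深度", "学习", "模型"),
                Arrays.asList("自然", "语言", "处理"));
        BM25 bm25 = new BM25(docs);

        // One relevance score per sentence for the query ["学习", "算法"]
        double[] scores = bm25.simAll(Arrays.asList("学习", "算法"));
        System.out.println(Arrays.toString(scores));
    }
}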
HtmlUtil.java (new file)

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@

package com.light.saber.textrank;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.CssSelectorNodeFilter;
import org.htmlparser.util.NodeList;

public class HtmlUtil {

    /**
     * Returns the plain text of the first node matching the given id, or null.
     */
    public static String getText(String html, String id) {
        try {
            Parser parser = new Parser(html);
            NodeFilter filter = new CssSelectorNodeFilter("#" + id);
            NodeList nList = parser.extractAllNodesThatMatch(filter);
            return nList == null || nList.size() == 0 ? null : nList.elementAt(0).toPlainTextString();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Returns the plain text of the first node matching the given CSS class, or null.
     */
    public static String getTextByClass(String html, String css_class) {
        try {
            Parser parser = new Parser(html);
            NodeFilter filter = new CssSelectorNodeFilter("." + css_class);
            NodeList nList = parser.extractAllNodesThatMatch(filter);
            return nList == null || nList.size() == 0 ? null : nList.elementAt(0).toPlainTextString();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Extracts the plain-text content of a whole page. Note that every
     * character outside the CJK range \u4e00-\u9fa5 is replaced with a
     * space, so only Chinese text survives.
     *
     * @param html the page source
     * @return the extracted text
     * @throws Exception on parse failure
     */
    public static String getText(String html) throws Exception {
        StringBean bean = new StringBean();
        bean.setLinks(false);
        bean.setReplaceNonBreakingSpaces(true);
        bean.setCollapse(true);

        // Visit all nodes and collect the page's plain text
        Parser parser = Parser.createParser(html, "utf-8");
        parser.visitAllNodesWith(bean);
        parser.reset();
        String text = bean.getStrings();
        String reg = "[^\u4e00-\u9fa5]";
        text = text.replaceAll(reg, " ");
        return text;
    }
}
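A usage sketch, again not part of the commit (the HTML string and demo class are illustrative): pull the text of a specific node by id, or strip a whole page down to its Chinese text.

package com.light.saber.textrank; // same package as HtmlUtil

public class HtmlUtilDemo {
    public static void main(String[] args) throws Exception {
        String html = "<html><body><div id=\"main\">正文内容 main text</div></body></html>";

        // Text of the node with id="main" only
        System.out.println(HtmlUtil.getText(html, "main")); // 正文内容 main text

        // Whole page; non-Chinese characters become spaces
        System.out.println(HtmlUtil.getText(html));         // 正文内容 (plus spaces)
    }
}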
TextRankKeyword.java (new file)

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@

package com.light.saber.textrank;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
import com.hankcs.hanlp.seg.common.Term;

import java.util.*;
import java.util.stream.Collectors;

/**
 * TextRank keyword extraction.
 *
 * @author hankcs
 */
public class TextRankKeyword {
    public static final int MAX_KEY_WORDS = 7;
    /**
     * Damping factor; the TextRank paper typically uses 0.85, but this
     * implementation uses 0.618.
     */
    static final float d = 0.618f;
    /**
     * Maximum number of iterations
     */
    static final int max_iter = 2000;
    /**
     * Convergence threshold: stop once no score moves by more than this
     */
    static final float min_diff = 0.001f;

    public TextRankKeyword() {
        // JDK bug workaround: avoids "java.lang.IllegalArgumentException:
        // Comparison method violates its general contract!"
        System.setProperty("java.util.Arrays.useLegacyMergeSort", "true");
    }

    public String getKeyword(String title, String content) {
        List<Term> termList = HanLP.segment(title + content);
        List<String> wordList = new ArrayList<String>();
        for (Term t : termList) {
            if (shouldInclude(t)) {
                wordList.add(t.word);
            }
        }
        // Build the co-occurrence graph with a sliding window of 5 words
        Map<String, Set<String>> words = new HashMap<String, Set<String>>();
        Queue<String> que = new LinkedList<String>();
        for (String w : wordList) {
            if (!words.containsKey(w)) {
                words.put(w, new HashSet<String>());
            }
            que.offer(w);
            if (que.size() > 5) {
                que.poll();
            }

            for (String w1 : que) {
                for (String w2 : que) {
                    if (w1.equals(w2)) {
                        continue;
                    }

                    words.get(w1).add(w2);
                    words.get(w2).add(w1);
                }
            }
        }
        // PageRank-style iteration until convergence or max_iter:
        // score(key) = (1 - d) + d * sum(score(neighbor) / degree(neighbor))
        Map<String, Float> score = new HashMap<String, Float>();
        for (int i = 0; i < max_iter; ++i) {
            Map<String, Float> m = new HashMap<String, Float>();
            float max_diff = 0;
            for (Map.Entry<String, Set<String>> entry : words.entrySet()) {
                String key = entry.getKey();
                Set<String> value = entry.getValue();
                m.put(key, 1 - d);
                for (String other : value) {
                    int size = words.get(other).size();
                    if (key.equals(other) || size == 0) continue;
                    m.put(key, m.get(key) + d / size * (score.get(other) == null ? 0 : score.get(other)));
                }
                max_diff = Math.max(max_diff, Math.abs(m.get(key) - (score.get(key) == null ? 0 : score.get(key))));
            }
            score = m;
            if (max_diff <= min_diff) break;
        }
        // Sort by score descending, drop single-character words, keep the top MAX_KEY_WORDS
        List<Map.Entry<String, Float>> entryList = new ArrayList<Map.Entry<String, Float>>(score.entrySet());
        Collections.sort(entryList, (o1, o2) -> (o1.getValue() - o2.getValue() > 0 ? -1 : 1));

        List<Map.Entry<String, Float>> list = entryList.stream().filter(w -> w.getKey().length() > 1).collect(Collectors.toList());
        String result = "";
        int nKeyword = MAX_KEY_WORDS > list.size() ? list.size() : MAX_KEY_WORDS;
        for (int i = 0; i < nKeyword; ++i) {
            result += list.get(i).getKey() + ';';
        }
        return result;
    }

    /**
     * Whether the term should enter the computation: keeps nouns, verbs,
     * adverbs and adjectives (delegates to HanLP's stop-word dictionary).
     *
     * @param term the segmented term
     * @return true if the term should be included
     */
    public boolean shouldInclude(Term term) {
        return CoreStopWordDictionary.shouldInclude(term);
    }
}
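Putting the pieces together, a sketch of how the commit's TakeKeyWordsService presumably wires these classes (the service itself is in one of the files not shown here, so this is an assumption; the demo class and HTML are made up): strip a page to text with HtmlUtil, then extract keywords with TextRankKeyword.

package com.light.saber.textrank;

public class TakeKeyWordsDemo {
    public static void main(String[] args) throws Exception {
        String html = "<html><body><p>TextRank 是一种基于图排序的关键词提取算法,"
                + "思想来源于 PageRank。</p></body></html>";

        String title = "TextRank 关键词提取";
        String content = HtmlUtil.getText(html); // page reduced to Chinese text

        // Up to MAX_KEY_WORDS (7) keywords, ';'-separated, best first
        String keywords = new TextRankKeyword().getKeyword(title, content);
        System.out.println(keywords);
    }
}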
