Skip to content

Commit 140ad72

Browse files
committed
small fix
1 parent a69caea commit 140ad72

File tree

6 files changed

+261
-68
lines changed

6 files changed

+261
-68
lines changed

pom.xml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,19 @@
4646
<artifactId>mahout</artifactId>
4747
<version>0.11.1</version>
4848
</dependency>
49+
<!-- https://mvnrepository.com/artifact/org.apache.mahout/mahout-core -->
50+
<dependency>
51+
<groupId>org.apache.mahout</groupId>
52+
<artifactId>mahout-core</artifactId>
53+
<version>0.9</version>
54+
</dependency>
55+
<!-- https://mvnrepository.com/artifact/org.apache.mahout/mahout-examples -->
4956
<dependency>
5057
<groupId>org.apache.mahout</groupId>
5158
<artifactId>mahout-examples</artifactId>
5259
<version>0.13.0</version>
5360
</dependency>
61+
5462
<dependency>
5563
<groupId>com.opencsv</groupId>
5664
<artifactId>opencsv</artifactId>

src/main/java/DataClean/ArticleClean.java

Lines changed: 168 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,8 @@
55
import com.google.common.base.Preconditions;
66

77
import java.sql.*;
8-
import java.util.ArrayList;
9-
import java.util.HashSet;
10-
import java.util.List;
8+
import java.util.*;
9+
import java.util.regex.Pattern;
1110

1211
import static java.lang.Integer.min;
1312

@@ -90,34 +89,47 @@ public static String getHtmlSplit(String html){
9089
}
9190

9291

93-
public static List<String> getTag(List<Tag> tagList,ArticleBean articleBean)
92+
/**
93+
* 获取博客的标签 ——有的博客本身有标签 但是需要统一成系统有的标签 有的博客没有标签,就需要构造标签
94+
* @param tagList 系统定义好的标签
95+
* @param articleBean 文章的信息
96+
* @return
97+
*/
98+
public static List<String> getTag(List<Tag> tagList,ArticleBean articleBean)
9499
{
95-
String tags = articleBean.getTags();
96-
String clean_content = articleBean.getClean_content();
97-
String title = articleBean.getTitle();
98-
String keywords[] = articleBean.getKeyword().split(",");
99-
//当内容不为空时才能提取tag
100-
List<String> resultTag = new ArrayList<>();
101-
if(!clean_content.equals(""))
100+
String tags = articleBean.getTags();//如果文章本身有标签 存在这里
101+
String clean_content = articleBean.getClean_content();//清洗后的文章内容
102+
String title = articleBean.getTitle();//文章的标题
103+
Map<String,Integer> map = new HashMap<>();
104+
List<String> resultTag = new ArrayList<>();//存储提取标签的结果
105+
106+
if(!clean_content.equals("")) //当内容不为空时才能提取tag
107+
102108
{
103109
//当这篇博客本身就有tag时
104110
if(!tags.equals(""))
105111
{
106112
String temp_tag [] = tags.split(",");
107-
//对比已有标签和 此标签的相似度
113+
//对比文章已有标签和系统定义标签的相似度
114+
/**
115+
* 对比编辑距离 对比前需要将汉字全部转化为拼音 方便比对编辑距离
116+
*
117+
*/
108118
for(int i = 0 ; i < temp_tag.length ; i++)
109119
{
110120
for(Tag tag :tagList)
111121
{
112122

113123
String default_tag = Word2PinYin(tag.getName());
114-
String desc [] = tag.getDescription().split(" ");
124+
String desc [] = tag.getDescription().split("#");
115125
String now_tag = Word2PinYin(temp_tag[i]);
126+
116127
if(StringHasChinese(tag.getName()) || StringHasChinese(temp_tag[i]))
117128
{
118129
if(editDistance(default_tag,now_tag) <3)
119130
{
120-
131+
if(!map.containsKey(tag.getName())) map.put(tag.getName(),0);
132+
map.put(tag.getName(),map.get(tag.getName()) +1);
121133
resultTag.add(tag.getName());
122134
continue;
123135
}
@@ -127,6 +139,8 @@ public static List<String> getTag(List<Tag> tagList,ArticleBean articleBean)
127139
if(editDistance(default_tag,now_tag) ==0)
128140
{
129141

142+
if(!map.containsKey(tag.getName())) map.put(tag.getName(),0);
143+
map.put(tag.getName(),map.get(tag.getName()) +1);
130144
resultTag.add(tag.getName());
131145
continue;
132146
}
@@ -139,6 +153,8 @@ public static List<String> getTag(List<Tag> tagList,ArticleBean articleBean)
139153
{
140154
if(editDistance(Word2PinYin(desc[k]),now_tag) <3)
141155
{
156+
if(!map.containsKey(tag.getName())) map.put(tag.getName(),0);
157+
map.put(tag.getName(),map.get(tag.getName()) +1);
142158
resultTag.add(tag.getName());
143159
continue;
144160
}
@@ -147,59 +163,124 @@ public static List<String> getTag(List<Tag> tagList,ArticleBean articleBean)
147163

148164
if(editDistance(Word2PinYin(desc[k]),now_tag) ==0)
149165
{
166+
if(!map.containsKey(tag.getName())) map.put(tag.getName(),0);
167+
map.put(tag.getName(),map.get(tag.getName()) +1);
150168
resultTag.add(tag.getName());
151169
continue;
152170
}
153171
}
154172

155173
}
156-
157-
158-
159-
160-
161-
}
162-
}
174+
}}
163175
}
176+
/**
177+
* 如果没有自带标签 或者自带标签与系统中的标签无法匹配上时,需要我们根据博客的标题和文字内容进行 字串对比 看能不能匹配到对应的标签
178+
*/
164179

165180
for(Tag tag :tagList)
166181
{
167182

168-
String default_tag = Word2PinYin(tag.getName());
169-
String desc [] = tag.getDescription().split(" ");
170-
if(clean_content.toLowerCase().contains(default_tag.toLowerCase()))
171-
{
172-
resultTag.add(tag.getName());
173-
continue;
174-
}
175-
else if(title.toLowerCase().contains(default_tag.toLowerCase()))
183+
String default_tag = Word2PinYin(tag.getName());
184+
String desc [] = tag.getDescription().split("#");
185+
if(clean_content.toLowerCase().contains(default_tag.toLowerCase()))
186+
{
187+
if(!map.containsKey(tag.getName())) map.put(tag.getName(),0);
188+
map.put(tag.getName(),map.get(tag.getName()) +1);
189+
resultTag.add(tag.getName());
190+
continue;
191+
}
192+
else if(title.toLowerCase().contains(default_tag.toLowerCase()))
193+
{
194+
if(!map.containsKey(tag.getName())) map.put(tag.getName(),0);
195+
map.put(tag.getName(),map.get(tag.getName()) +1);
196+
resultTag.add(tag.getName());
197+
continue;
198+
}else
199+
{
200+
for(int k = 0 ; k<desc.length ; k++)
176201
{
177-
resultTag.add(tag.getName());
178-
continue;
179-
}else
180-
{
181-
for(int k = 0 ; k<desc.length ; k++)
202+
if(clean_content.toLowerCase().contains(desc[k].toLowerCase()) && !desc[k].equals(""))
182203
{
183-
if(clean_content.toLowerCase().contains(desc[k].toLowerCase()) && !desc[k].equals(""))
184-
{
185-
186-
resultTag.add(tag.getName());
187-
continue;
188-
}
204+
if(!map.containsKey(tag.getName())) map.put(tag.getName(),0);
205+
map.put(tag.getName(),map.get(tag.getName()) +1);
206+
resultTag.add(tag.getName());
207+
continue;
189208
}
190209
}
210+
}
191211

192212

193213

194214
}
195-
196215
}
197-
return new ArrayList<String>(new HashSet<String>(resultTag));
216+
List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(map.entrySet());
217+
Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
218+
public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
219+
if(o2.getValue() > o1.getValue())
220+
return 1;
221+
else if(o2.getValue() < o1.getValue())
222+
return -1;
223+
else return 0;
224+
//
225+
}
226+
});
227+
int index = 0;
228+
List<String> result = new ArrayList<>();
229+
for(Map.Entry<String, Integer> t:list){
230+
result.add(t.getKey());
231+
if(++index >2) break;
232+
}
233+
return result;
234+
// return new ArrayList<String>(new HashSet<String>(resultTag));
198235

199236
}
200237

201-
//求编辑距离 利用动态规划
202-
public static int editDistance(String str1, String str2) {
238+
/**
239+
* 有了标签之后 根据博客的标签 匹配对应的博客分类
240+
* @param classify_list
241+
* @param tag_list
242+
* @return
243+
*/
244+
public static List<String> getClassify(List<Classify> classify_list ,List<String> tag_list)
245+
{
246+
List<String> classify_result = new ArrayList<>();
247+
for(String tag :tag_list)
248+
{
249+
String deal_tag = Word2PinYin(filtration(tag));
250+
for(Classify classify :classify_list)
251+
{
252+
String classify_desc [] = classify.getDescription().split("#");
253+
for(int i = 0 ; i< classify_desc.length ; i++)
254+
{
255+
String deal_decs = Word2PinYin(filtration(classify_desc[i]));
256+
if(editDistance(deal_decs,deal_tag) == 0)
257+
{
258+
classify_result.add(classify.getName());
259+
}
260+
}
261+
}
262+
}
263+
return new ArrayList<String>(new HashSet<String>(classify_result));
264+
}
265+
266+
/**
267+
* 正则表达式 去除特殊字符
268+
* @param str
269+
* @return
270+
*/
271+
public static String filtration(String str) {
272+
String regEx = "[`~!@#$%^&*()+=|{}:;\\\\[\\\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?']";
273+
str = Pattern.compile(regEx).matcher(str).replaceAll("").trim();
274+
return str;
275+
}
276+
277+
/**
278+
* 利用动态规划求编辑距离
279+
* @param str1
280+
* @param str2
281+
* @return
282+
*/
283+
public static int editDistance(String str1, String str2) {
203284
Preconditions.checkNotNull(str1);
204285
Preconditions.checkNotNull(str2);
205286

@@ -231,7 +312,12 @@ public static int editDistance(String str1, String str2) {
231312

232313
return dp[len1][len2];
233314
}
234-
//将有中文的字符串转成拼音
315+
316+
/**
317+
* 将有中文的字符串转成拼音
318+
* @param str
319+
* @return
320+
*/
235321
public static String Word2PinYin(String str)
236322
{
237323
StringBuffer sb = new StringBuffer();
@@ -259,6 +345,12 @@ else if(isChinese(temp)) //如果是中文
259345
}
260346
return sb.toString();
261347
}
348+
349+
/**
350+
* 判断一个字符串中是否含有中文
351+
* @param str
352+
* @return
353+
*/
262354
public static boolean StringHasChinese(String str)
263355
{
264356
for(int i = 0 ; i< str.length() ; i++)
@@ -270,7 +362,12 @@ public static boolean StringHasChinese(String str)
270362
}
271363
return false;
272364
}
273-
//判断一个字符是否是中文
365+
366+
/**
367+
* 判断一个字符是否是中文
368+
* @param c
369+
* @return
370+
*/
274371
public static boolean isChinese(char c) {
275372
Character.UnicodeScript sc = Character.UnicodeScript.of(c);
276373
if (sc == Character.UnicodeScript.HAN) {
@@ -283,36 +380,48 @@ public static boolean isChinese(char c) {
283380
public static void main(String [] args)
284381
{
285382
GetMysqlData getMysqlData = new GetMysqlData();
286-
List<ArticleBean> articleBeans =getMysqlData.getArticle();
287-
List<Tag> tagList = getMysqlData.getTag();
383+
List<ArticleBean> articleBeans =getMysqlData.getArticle();//从数据库中取出所有文章
384+
List<Tag> tagList = getMysqlData.getTag();//从数据库取出所有tag信息
385+
List<Classify> classifyList = getMysqlData.getClassify();//从数据库中取出所有分类信息
288386
for(ArticleBean articleBean:articleBeans) {
289387
int id = articleBean.getId();
290388
String title = articleBean.getTitle();
291389
String content = articleBean.getContent();
292390
String tags = articleBean.getTags();
293-
GetCleanedContent GetCleanedContent = new GetCleanedContent(content);
391+
GetCleanedContent GetCleanedContent = new GetCleanedContent(content); //将博客内容 带html标签的内容 使用jsoup提取对应标签中的内容
294392
String clean_content = GetCleanedContent.parse();
295-
String keyword = new TextRankKeyword().getKeyword(title, clean_content);
296-
String summary = TextRankSummary.getTopSentenceList(clean_content, 3);
393+
String keyword = new TextRankKeyword().getKeyword(title, clean_content);//使用textrank算法提取关键词
394+
String summary = TextRankSummary.getTopSentenceList(clean_content, 3);//使用textrank算法提取摘要
297395
articleBean.setKeyword(keyword);
298396
articleBean.setClean_content(clean_content);
299397
articleBean.setSummary(summary);
300-
List<String > result_tag = getTag(tagList,articleBean);
301-
String str = "";
398+
List<String > result_tag = getTag(tagList,articleBean);//提取标签
399+
400+
List<String > result_classify =getClassify(classifyList,result_tag);//提取分类
401+
String str_tag = "";
302402
for(String s :result_tag)
303403
{
304-
str += s+",";
404+
str_tag += s+",";
305405
}
306-
if (str.length() != 0)
406+
String str_classify = "";
407+
for(String s :result_classify)
408+
{
409+
str_classify += s+",";
410+
}
411+
412+
413+
if (str_tag.length() != 0)
307414
{
308415
System.out.println(id+" " +tags);
309-
System.out.println(str.substring(0,str.length()-1));
310-
getMysqlData.UpdateArticle(id,str.substring(0,str.length()-1),clean_content,keyword,summary);
416+
System.out.println(str_tag.substring(0,str_tag.length()-1));
417+
System.out.println(str_classify);
418+
getMysqlData.UpdateArticle(id,str_tag.substring(0,str_tag.length()-1),str_classify,clean_content,keyword,summary);
311419
}else
312420
{
313421
System.out.println(id+" " +tags);
314-
System.out.println(str);
315-
getMysqlData.UpdateArticle(id,str,clean_content,keyword,summary);
422+
System.out.println(str_tag);
423+
System.out.println(str_classify);
424+
getMysqlData.UpdateArticle(id,str_tag,str_classify,clean_content,keyword,summary);
316425
}
317426

318427

0 commit comments

Comments
 (0)