5
5
import com .google .common .base .Preconditions ;
6
6
7
7
import java .sql .*;
8
- import java .util .ArrayList ;
9
- import java .util .HashSet ;
10
- import java .util .List ;
8
+ import java .util .*;
9
+ import java .util .regex .Pattern ;
11
10
12
11
import static java .lang .Integer .min ;
13
12
@@ -90,34 +89,47 @@ public static String getHtmlSplit(String html){
90
89
}
91
90
92
91
93
- public static List <String > getTag (List <Tag > tagList ,ArticleBean articleBean )
92
+ /**
93
+ * 获取博客的标签 ——有的博客本身有标签 但是需要统一成系统有的标签 有的博客没有标签,就需要构造标签
94
+ * @param tagList 系统定义好的标签
95
+ * @param articleBean 文章的信息
96
+ * @return
97
+ */
98
+ public static List <String > getTag (List <Tag > tagList ,ArticleBean articleBean )
94
99
{
95
- String tags = articleBean .getTags ();
96
- String clean_content = articleBean .getClean_content ();
97
- String title = articleBean .getTitle ();
98
- String keywords [] = articleBean .getKeyword ().split ("," );
99
- //当内容不为空时才能提取tag
100
- List <String > resultTag = new ArrayList <>();
101
- if (!clean_content .equals ("" ))
100
+ String tags = articleBean .getTags ();//如果文章本身有标签 存在这里
101
+ String clean_content = articleBean .getClean_content ();//清洗后的文章内容
102
+ String title = articleBean .getTitle ();//文章的标题
103
+ Map <String ,Integer > map = new HashMap <>();
104
+ List <String > resultTag = new ArrayList <>();//存储提取标签的结果
105
+
106
+ if (!clean_content .equals ("" )) //当内容不为空时才能提取tag
107
+
102
108
{
103
109
//当这篇博客本身就有tag时
104
110
if (!tags .equals ("" ))
105
111
{
106
112
String temp_tag [] = tags .split ("," );
107
- //对比已有标签和 此标签的相似度
113
+ //对比已有标签和系统定义标签的相似度标签的相似度
114
+ /**
115
+ * 对比编辑距离 对比前需要将汉字全部转化为拼音 方便比对编辑距离
116
+ *
117
+ */
108
118
for (int i = 0 ; i < temp_tag .length ; i ++)
109
119
{
110
120
for (Tag tag :tagList )
111
121
{
112
122
113
123
String default_tag = Word2PinYin (tag .getName ());
114
- String desc [] = tag .getDescription ().split (" " );
124
+ String desc [] = tag .getDescription ().split ("# " );
115
125
String now_tag = Word2PinYin (temp_tag [i ]);
126
+
116
127
if (StringHasChinese (tag .getName ()) || StringHasChinese (temp_tag [i ]))
117
128
{
118
129
if (editDistance (default_tag ,now_tag ) <3 )
119
130
{
120
-
131
+ if (!map .containsKey (tag .getName ())) map .put (tag .getName (),0 );
132
+ map .put (tag .getName (),map .get (tag .getName ()) +1 );
121
133
resultTag .add (tag .getName ());
122
134
continue ;
123
135
}
@@ -127,6 +139,8 @@ public static List<String> getTag(List<Tag> tagList,ArticleBean articleBean)
127
139
if (editDistance (default_tag ,now_tag ) ==0 )
128
140
{
129
141
142
+ if (!map .containsKey (tag .getName ())) map .put (tag .getName (),0 );
143
+ map .put (tag .getName (),map .get (tag .getName ()) +1 );
130
144
resultTag .add (tag .getName ());
131
145
continue ;
132
146
}
@@ -139,6 +153,8 @@ public static List<String> getTag(List<Tag> tagList,ArticleBean articleBean)
139
153
{
140
154
if (editDistance (Word2PinYin (desc [k ]),now_tag ) <3 )
141
155
{
156
+ if (!map .containsKey (tag .getName ())) map .put (tag .getName (),0 );
157
+ map .put (tag .getName (),map .get (tag .getName ()) +1 );
142
158
resultTag .add (tag .getName ());
143
159
continue ;
144
160
}
@@ -147,59 +163,124 @@ public static List<String> getTag(List<Tag> tagList,ArticleBean articleBean)
147
163
148
164
if (editDistance (Word2PinYin (desc [k ]),now_tag ) ==0 )
149
165
{
166
+ if (!map .containsKey (tag .getName ())) map .put (tag .getName (),0 );
167
+ map .put (tag .getName (),map .get (tag .getName ()) +1 );
150
168
resultTag .add (tag .getName ());
151
169
continue ;
152
170
}
153
171
}
154
172
155
173
}
156
-
157
-
158
-
159
-
160
-
161
- }
162
- }
174
+ }}
163
175
}
176
+ /**
177
+ * 如果没有自带标签 或者自带标签与系统中的标签无法匹配上时,需要我们根据博客的标题和文字内容进行 字串对比 看能不能匹配到对应的标签
178
+ */
164
179
165
180
for (Tag tag :tagList )
166
181
{
167
182
168
- String default_tag = Word2PinYin (tag .getName ());
169
- String desc [] = tag .getDescription ().split (" " );
170
- if (clean_content .toLowerCase ().contains (default_tag .toLowerCase ()))
171
- {
172
- resultTag .add (tag .getName ());
173
- continue ;
174
- }
175
- else if (title .toLowerCase ().contains (default_tag .toLowerCase ()))
183
+ String default_tag = Word2PinYin (tag .getName ());
184
+ String desc [] = tag .getDescription ().split ("#" );
185
+ if (clean_content .toLowerCase ().contains (default_tag .toLowerCase ()))
186
+ {
187
+ if (!map .containsKey (tag .getName ())) map .put (tag .getName (),0 );
188
+ map .put (tag .getName (),map .get (tag .getName ()) +1 );
189
+ resultTag .add (tag .getName ());
190
+ continue ;
191
+ }
192
+ else if (title .toLowerCase ().contains (default_tag .toLowerCase ()))
193
+ {
194
+ if (!map .containsKey (tag .getName ())) map .put (tag .getName (),0 );
195
+ map .put (tag .getName (),map .get (tag .getName ()) +1 );
196
+ resultTag .add (tag .getName ());
197
+ continue ;
198
+ }else
199
+ {
200
+ for (int k = 0 ; k <desc .length ; k ++)
176
201
{
177
- resultTag .add (tag .getName ());
178
- continue ;
179
- }else
180
- {
181
- for (int k = 0 ; k <desc .length ; k ++)
202
+ if (clean_content .toLowerCase ().contains (desc [k ].toLowerCase ()) && !desc [k ].equals ("" ))
182
203
{
183
- if (clean_content .toLowerCase ().contains (desc [k ].toLowerCase ()) && !desc [k ].equals ("" ))
184
- {
185
-
186
- resultTag .add (tag .getName ());
187
- continue ;
188
- }
204
+ if (!map .containsKey (tag .getName ())) map .put (tag .getName (),0 );
205
+ map .put (tag .getName (),map .get (tag .getName ()) +1 );
206
+ resultTag .add (tag .getName ());
207
+ continue ;
189
208
}
190
209
}
210
+ }
191
211
192
212
193
213
194
214
}
195
-
196
215
}
197
- return new ArrayList <String >(new HashSet <String >(resultTag ));
216
+ List <Map .Entry <String , Integer >> list = new ArrayList <Map .Entry <String , Integer >>(map .entrySet ());
217
+ Collections .sort (list , new Comparator <Map .Entry <String , Integer >>() {
218
+ public int compare (Map .Entry <String , Integer > o1 , Map .Entry <String , Integer > o2 ) {
219
+ if (o2 .getValue () > o1 .getValue ())
220
+ return 1 ;
221
+ else if (o2 .getValue () < o1 .getValue ())
222
+ return -1 ;
223
+ else return 0 ;
224
+ //
225
+ }
226
+ });
227
+ int index = 0 ;
228
+ List <String > result = new ArrayList <>();
229
+ for (Map .Entry <String , Integer > t :list ){
230
+ result .add (t .getKey ());
231
+ if (++index >2 ) break ;
232
+ }
233
+ return result ;
234
+ // return new ArrayList<String>(new HashSet<String>(resultTag));
198
235
199
236
}
200
237
201
- //求编辑距离 利用动态规划
202
- public static int editDistance (String str1 , String str2 ) {
238
+ /**
239
+ * 有了标签之后 根据博客的标签 匹配对应的博客分类
240
+ * @param classify_list
241
+ * @param tag_list
242
+ * @return
243
+ */
244
+ public static List <String > getClassify (List <Classify > classify_list ,List <String > tag_list )
245
+ {
246
+ List <String > classify_result = new ArrayList <>();
247
+ for (String tag :tag_list )
248
+ {
249
+ String deal_tag = Word2PinYin (filtration (tag ));
250
+ for (Classify classify :classify_list )
251
+ {
252
+ String classify_desc [] = classify .getDescription ().split ("#" );
253
+ for (int i = 0 ; i < classify_desc .length ; i ++)
254
+ {
255
+ String deal_decs = Word2PinYin (filtration (classify_desc [i ]));
256
+ if (editDistance (deal_decs ,deal_tag ) == 0 )
257
+ {
258
+ classify_result .add (classify .getName ());
259
+ }
260
+ }
261
+ }
262
+ }
263
+ return new ArrayList <String >(new HashSet <String >(classify_result ));
264
+ }
265
+
266
+ /**
267
+ * 正则表达式 去除特殊字符
268
+ * @param str
269
+ * @return
270
+ */
271
+ public static String filtration (String str ) {
272
+ String regEx = "[`~!@#$%^&*()+=|{}:;\\ \\ [\\ \\ ].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?']" ;
273
+ str = Pattern .compile (regEx ).matcher (str ).replaceAll ("" ).trim ();
274
+ return str ;
275
+ }
276
+
277
+ /**
278
+ * 利用动态规划求编辑距离
279
+ * @param str1
280
+ * @param str2
281
+ * @return
282
+ */
283
+ public static int editDistance (String str1 , String str2 ) {
203
284
Preconditions .checkNotNull (str1 );
204
285
Preconditions .checkNotNull (str2 );
205
286
@@ -231,7 +312,12 @@ public static int editDistance(String str1, String str2) {
231
312
232
313
return dp [len1 ][len2 ];
233
314
}
234
- //将有中文的字符串转成拼音
315
+
316
+ /**
317
+ * 将有中文的字符串转成拼音
318
+ * @param str
319
+ * @return
320
+ */
235
321
public static String Word2PinYin (String str )
236
322
{
237
323
StringBuffer sb = new StringBuffer ();
@@ -259,6 +345,12 @@ else if(isChinese(temp)) //如果是中文
259
345
}
260
346
return sb .toString ();
261
347
}
348
+
349
+ /**
350
+ * 判断一个字符串中是否含有中文
351
+ * @param str
352
+ * @return
353
+ */
262
354
public static boolean StringHasChinese (String str )
263
355
{
264
356
for (int i = 0 ; i < str .length () ; i ++)
@@ -270,7 +362,12 @@ public static boolean StringHasChinese(String str)
270
362
}
271
363
return false ;
272
364
}
273
- //判断一个字符是否是中文
365
+
366
+ /**
367
+ * 判断一个字符是否是中文
368
+ * @param c
369
+ * @return
370
+ */
274
371
public static boolean isChinese (char c ) {
275
372
Character .UnicodeScript sc = Character .UnicodeScript .of (c );
276
373
if (sc == Character .UnicodeScript .HAN ) {
@@ -283,36 +380,48 @@ public static boolean isChinese(char c) {
283
380
public static void main (String [] args )
284
381
{
285
382
GetMysqlData getMysqlData = new GetMysqlData ();
286
- List <ArticleBean > articleBeans =getMysqlData .getArticle ();
287
- List <Tag > tagList = getMysqlData .getTag ();
383
+ List <ArticleBean > articleBeans =getMysqlData .getArticle ();//从数据库中取出所有文章
384
+ List <Tag > tagList = getMysqlData .getTag ();//从数据库取出所有tag信息
385
+ List <Classify > classifyList = getMysqlData .getClassify ();//从数据库中取出所有分类信息
288
386
for (ArticleBean articleBean :articleBeans ) {
289
387
int id = articleBean .getId ();
290
388
String title = articleBean .getTitle ();
291
389
String content = articleBean .getContent ();
292
390
String tags = articleBean .getTags ();
293
- GetCleanedContent GetCleanedContent = new GetCleanedContent (content );
391
+ GetCleanedContent GetCleanedContent = new GetCleanedContent (content ); //将博客内容 带html标签的内容 使用jsoup提取对应标签中的内容
294
392
String clean_content = GetCleanedContent .parse ();
295
- String keyword = new TextRankKeyword ().getKeyword (title , clean_content );
296
- String summary = TextRankSummary .getTopSentenceList (clean_content , 3 );
393
+ String keyword = new TextRankKeyword ().getKeyword (title , clean_content );//使用textrank算法提取关键词
394
+ String summary = TextRankSummary .getTopSentenceList (clean_content , 3 );//使用textrank算法提取摘要
297
395
articleBean .setKeyword (keyword );
298
396
articleBean .setClean_content (clean_content );
299
397
articleBean .setSummary (summary );
300
- List <String > result_tag = getTag (tagList ,articleBean );
301
- String str = "" ;
398
+ List <String > result_tag = getTag (tagList ,articleBean );//提取标签
399
+
400
+ List <String > result_classify =getClassify (classifyList ,result_tag );//提取分类
401
+ String str_tag = "" ;
302
402
for (String s :result_tag )
303
403
{
304
- str += s +"," ;
404
+ str_tag += s +"," ;
305
405
}
306
- if (str .length () != 0 )
406
+ String str_classify = "" ;
407
+ for (String s :result_classify )
408
+ {
409
+ str_classify += s +"," ;
410
+ }
411
+
412
+
413
+ if (str_tag .length () != 0 )
307
414
{
308
415
System .out .println (id +" " +tags );
309
- System .out .println (str .substring (0 ,str .length ()-1 ));
310
- getMysqlData .UpdateArticle (id ,str .substring (0 ,str .length ()-1 ),clean_content ,keyword ,summary );
416
+ System .out .println (str_tag .substring (0 ,str_tag .length ()-1 ));
417
+ System .out .println (str_classify );
418
+ getMysqlData .UpdateArticle (id ,str_tag .substring (0 ,str_tag .length ()-1 ),str_classify ,clean_content ,keyword ,summary );
311
419
}else
312
420
{
313
421
System .out .println (id +" " +tags );
314
- System .out .println (str );
315
- getMysqlData .UpdateArticle (id ,str ,clean_content ,keyword ,summary );
422
+ System .out .println (str_tag );
423
+ System .out .println (str_classify );
424
+ getMysqlData .UpdateArticle (id ,str_tag ,str_classify ,clean_content ,keyword ,summary );
316
425
}
317
426
318
427
0 commit comments