@@ -62,6 +62,197 @@ class CrawKnowledgeService {
62
62
}
63
63
}
64
64
65
/**
 * Schedules one crawl coroutine per ImportNew list page (pages 1 through 135)
 * on [CommonPool].
 *
 * This function only schedules the work; it returns without waiting for the
 * crawls to finish. A failing page is reported on stderr and never stops the
 * remaining pages.
 */
fun doCrawImportNewKnowledge() {
    for (page in 1..135) {
        try {
            launch(CommonPool) {
                // The handler has to live inside the coroutine: an exception thrown
                // by the crawl is raised on the pool thread, so a try/catch placed
                // around launch() alone would never see it.
                try {
                    crawImportNew(page)
                } catch (e: Exception) {
                    System.err.println("ImportNew crawl failed for page $page: $e")
                }
            }
        } catch (e: Exception) {
            // Only reached when the coroutine could not be scheduled at all.
            System.err.println("Could not schedule ImportNew crawl for page $page: $e")
        }
    }
}
76
+
77
/**
 * Schedules one crawl coroutine per ITEye list page (pages 1 through 10000)
 * on [CommonPool].
 *
 * This function only schedules the work; it returns without waiting for the
 * crawls to finish. A failing page is reported on stderr and never stops the
 * remaining pages.
 */
fun doCrawITEyeKnowledge() {
    for (page in 1..10000) {
        try {
            launch(CommonPool) {
                // The handler has to live inside the coroutine: an exception thrown
                // by the crawl is raised on the pool thread, so a try/catch placed
                // around launch() alone would never see it.
                try {
                    crawITEye(page)
                } catch (e: Exception) {
                    System.err.println("ITEye crawl failed for page $page: $e")
                }
            }
        } catch (e: Exception) {
            // Only reached when the coroutine could not be scheduled at all.
            System.err.println("Could not schedule ITEye crawl for page $page: $e")
        }
    }
}
88
+
89
/**
 * Schedules one crawl coroutine per CNBlogs list page (pages 1 through 200)
 * on [CommonPool].
 *
 * This function only schedules the work; it returns without waiting for the
 * crawls to finish. A failing page is reported on stderr and never stops the
 * remaining pages.
 */
fun doCrawCNBlogKnowledge() {
    for (page in 1..200) {
        try {
            launch(CommonPool) {
                // The handler has to live inside the coroutine: an exception thrown
                // by the crawl is raised on the pool thread, so a try/catch placed
                // around launch() alone would never see it.
                try {
                    crawCNBlog(page)
                } catch (e: Exception) {
                    System.err.println("CNBlogs crawl failed for page $page: $e")
                }
            }
        } catch (e: Exception) {
            // Only reached when the coroutine could not be scheduled at all.
            System.err.println("Could not schedule CNBlogs crawl for page $page: $e")
        }
    }
}
100
+
101
/**
 * Schedules one crawl coroutine per InfoQ list page (page indexes 0 through 40)
 * on [CommonPool].
 *
 * This function only schedules the work; it returns without waiting for the
 * crawls to finish. A failing page is reported on stderr and never stops the
 * remaining pages.
 */
fun doCrawInfoQKnowledge() {
    for (page in 0..40) {
        try {
            launch(CommonPool) {
                // The handler has to live inside the coroutine: an exception thrown
                // by the crawl is raised on the pool thread, so a try/catch placed
                // around launch() alone would never see it.
                try {
                    crawInfoQ(page)
                } catch (e: Exception) {
                    System.err.println("InfoQ crawl failed for page $page: $e")
                }
            }
        } catch (e: Exception) {
            // Only reached when the coroutine could not be scheduled at all.
            System.err.println("Could not schedule InfoQ crawl for page $page: $e")
        }
    }
}
112
+
113
/**
 * Crawls one InfoQ (cn) Java article list page and saves every listed article
 * whose URL is not stored yet.
 *
 * The list URL ends with `page * 12` (presumably an item offset with twelve
 * articles per list page; confirm against the site).
 *
 * A list entry that cannot be parsed, fetched or saved is reported on stderr
 * and skipped, so one bad entry does not abort the rest of the page.
 *
 * @param page zero-based list page index
 */
private fun crawInfoQ(page: Int) {
    val pageUrl = "http://www.infoq.com/cn/java/articles/${page * 12}"
    val listHtml = CrawlerWebClient.getPageHtmlText(pageUrl)
    // Parse with the list URL as base URI so that absUrl() can resolve relative links.
    val document = Jsoup.parse(listHtml, pageUrl)

    // Expected entry shape: the second child of each "news_type2 full_screen" block
    // wraps the article link, e.g.
    // <a href="/cn/articles/Reactive-Systems-Akka-Actors-DomainDrivenDesign" title="...">...</a>
    for (entry in document.getElementsByClass("news_type2 full_screen")) {
        try {
            // child() throws when the entry has a different structure, which is why
            // the lookup sits inside the try block.
            val link = entry.child(1).child(0)
            // The sample href above is site-relative, so resolve it before fetching.
            val url = link.absUrl("href")
            val title = link.html()
            if (url.isBlank() || KnowledgeDao.countByUrl(url) != 0) {
                continue // nothing to fetch, or already stored
            }

            val articleHtml = CrawlerWebClient.getPageHtmlText(url)
            val articleDocument = Jsoup.parse(articleHtml)
            val content = 获取InfoQ文章内容(articleDocument)
            println(title)
            println(url)
            doSaveKnowledge(
                url = url,
                title = title,
                content = content
            )
        } catch (e: Exception) {
            System.err.println("crawInfoQ: skipped an entry of $pageUrl: $e")
        }
    }
}
141
+
142
/**
 * Extracts the article body HTML from a parsed InfoQ article page.
 *
 * @param infoQ文章Document parsed article page, may be null
 * @return inner HTML of the first element matched by the class lookup
 *         "text_info text_info_article", or null when the document is null
 *         or no element matches
 */
private fun 获取InfoQ文章内容(infoQ文章Document: Document?): String? =
    // firstOrNull() instead of get(0): an empty match must yield null, not an
    // IndexOutOfBoundsException.
    infoQ文章Document
        ?.getElementsByClass("text_info text_info_article")
        ?.firstOrNull()
        ?.html()
145
+
146
/**
 * Crawls one CNBlogs front-page listing and saves every listed post whose URL
 * is not stored yet.
 *
 * A post that cannot be fetched, parsed or saved is reported on stderr and
 * skipped.
 *
 * @param page list page number, appended to the list URL as `#p<page>`
 */
private fun crawCNBlog(page: Int) {
    // NOTE(review): "#p<page>" is a URL fragment, which is not sent to the server in
    // an HTTP request. Unless CrawlerWebClient executes the page's scripts, every
    // page number may return the same list. Confirm.
    val pageUrl = "https://www.cnblogs.com/#p$page"
    val listHtml = CrawlerWebClient.getPageHtmlText(pageUrl)
    val document = Jsoup.parse(listHtml)

    // Each post link looks like:
    // <a class="titlelnk" href="https://www.cnblogs.com/qzrzq1/p/9069509.html" target="_blank">...</a>
    for (link in document.getElementsByClass("titlelnk")) {
        val url = link.attr("href")
        val title = link.html()
        if (url.isBlank() || KnowledgeDao.countByUrl(url) != 0) {
            continue // nothing to fetch, or already stored
        }
        try {
            val articleHtml = CrawlerWebClient.getPageHtmlText(url)
            val articleDocument = Jsoup.parse(articleHtml)
            val content = 获取CNBlog文章内容(articleDocument)
            println(title)
            println(url)
            doSaveKnowledge(
                url = url,
                title = title,
                content = content
            )
        } catch (e: Exception) {
            System.err.println("crawCNBlog: skipped $url: $e")
        }
    }
}
174
+
175
/**
 * Extracts the post body HTML from a parsed CNBlogs post page.
 *
 * @param cnBlog文章Document parsed post page, may be null
 * @return inner HTML of the element with id "cnblogs_post_body", or null when
 *         the document is null or has no such element
 */
private fun 获取CNBlog文章内容(cnBlog文章Document: Document?): String? =
    // The id must be passed without surrounding whitespace, otherwise the lookup
    // can never match.
    cnBlog文章Document?.getElementById("cnblogs_post_body")?.html()
178
+
179
/**
 * Crawls one ITEye "language" category list page and saves every listed blog
 * post whose URL is not stored yet.
 *
 * A list entry that cannot be parsed, fetched or saved is reported on stderr
 * and skipped, so one bad entry does not abort the rest of the page.
 *
 * @param page list page number, sent as the `page` query parameter
 */
private fun crawITEye(page: Int) {
    val pageUrl = "http://www.iteye.com/blogs/category/language?page=$page"
    val listHtml = CrawlerWebClient.getPageHtmlText(pageUrl)
    val document = Jsoup.parse(listHtml)

    // Expected entry shape: the first grandchild of each "content" block is the link, e.g.
    // <a href="http://fhuan123.iteye.com/blog/2423594" title="..." target="_blank">...</a>
    for (entry in document.getElementsByClass("content")) {
        try {
            // child() throws when the entry has a different structure, which is why
            // the lookup sits inside the try block.
            val link = entry.child(0).child(0)
            val url = link.attr("href")
            val title = link.html()
            if (url.isBlank() || KnowledgeDao.countByUrl(url) != 0) {
                continue // nothing to fetch, or already stored
            }

            val articleHtml = CrawlerWebClient.getPageHtmlText(url)
            val articleDocument = Jsoup.parse(articleHtml)
            val content = 获取ITEye文章内容(articleDocument)
            println(title)
            println(url)
            doSaveKnowledge(
                url = url,
                title = title,
                content = content
            )
        } catch (e: Exception) {
            System.err.println("crawITEye: skipped an entry of $pageUrl: $e")
        }
    }
}
208
+
209
/**
 * Extracts the post body HTML from a parsed ITEye blog post page.
 *
 * @param itEye文章Document parsed post page, may be null
 * @return inner HTML of the element with id "blog_content", or null when the
 *         document is null or has no such element
 */
private fun 获取ITEye文章内容(itEye文章Document: Document?): String? =
    // The id must be passed without surrounding whitespace, otherwise the lookup
    // can never match.
    itEye文章Document?.getElementById("blog_content")?.html()
212
+
213
/**
 * Crawls one ImportNew "all posts" list page and saves every listed article
 * whose URL is not stored yet.
 *
 * Unlike the list entry itself, the title is read from the article page.
 * An article that cannot be fetched, parsed or saved is reported on stderr
 * and skipped.
 *
 * @param page list page number, appended to the list URL path
 */
private fun crawImportNew(page: Int) {
    val pageUrl = "http://www.importnew.com/all-posts/page/$page"
    val listHtml = CrawlerWebClient.getPageHtmlText(pageUrl)
    val document = Jsoup.parse(listHtml)

    // Each article link looks like:
    // <a class="meta-title" target="_blank" href="http://www.importnew.com/28577.html" title="...">...</a>
    for (link in document.getElementsByClass("meta-title")) {
        val url = link.attr("href")
        if (url.isBlank() || KnowledgeDao.countByUrl(url) != 0) {
            continue // nothing to fetch, or already stored
        }
        try {
            val articleHtml = CrawlerWebClient.getPageHtmlText(url)
            val articleDocument = Jsoup.parse(articleHtml)
            val title = 获取ImportNew文章标题(articleDocument)
            val content = 获取ImportNew文章内容(articleDocument)
            println(title)
            println(url)
            doSaveKnowledge(
                url = url,
                title = title,
                content = content
            )
        } catch (e: Exception) {
            System.err.println("crawImportNew: skipped $url: $e")
        }
    }
}
241
+
242
/**
 * Extracts the article body HTML from a parsed ImportNew article page.
 *
 * @param importNew文章Document parsed article page, may be null
 * @return inner HTML of the first element with class "entry", or null when the
 *         document is null or no element matches
 */
private fun 获取ImportNew文章内容(importNew文章Document: Document?): String? =
    // firstOrNull() instead of get(0): an empty match must yield null, not an
    // IndexOutOfBoundsException.
    importNew文章Document
        ?.getElementsByClass("entry")
        ?.firstOrNull()
        ?.html()
246
+
247
/**
 * Extracts the article title from a parsed ImportNew article page.
 *
 * Expected markup: `<div class="entry-header"><h1>title</h1></div>`; the title
 * is the inner HTML of the header's first child.
 *
 * @param importNew文章Document parsed article page, may be null
 * @return the title markup, or null when the document is null, there is no
 *         "entry-header" element, or that element has no children
 */
private fun 获取ImportNew文章标题(importNew文章Document: Document?): String? =
    // firstOrNull() on both levels: a missing header or an empty header must yield
    // null rather than the IndexOutOfBoundsException thrown by get(0) / child(0).
    importNew文章Document
        ?.getElementsByClass("entry-header")
        ?.firstOrNull()
        ?.children()
        ?.firstOrNull()
        ?.html()
255
+
65
256
private fun crawOSChina (page : Int ) {
66
257
val pageUrl = " https://www.oschina.net/action/ajax/get_more_recommend_blog?classification=0&p=$page "
67
258
val 文章列表HTML = CrawlerWebClient .getPageHtmlText(pageUrl)
@@ -101,8 +292,6 @@ class CrawKnowledgeService {
101
292
} catch (e: Exception ) {
102
293
103
294
}
104
-
105
-
106
295
}
107
296
}
108
297
}
0 commit comments