Skip to content

Commit b525c63

Browse files
committed
抓取OSChina
抓取ImportNew 抓取CNBlog 抓取InfoQ
1 parent 5296460 commit b525c63

File tree

3 files changed

+221
-2
lines changed

3 files changed

+221
-2
lines changed

src/main/kotlin/com/light/saber/controller/KnowledgeCrawController.kt

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,31 @@ class KnowledgeCrawController {
4040
return "DONE"
4141
}
4242

43+
/**
 * Starts the ImportNew crawl on a separate thread and answers immediately.
 *
 * @return the literal "DONE"; it only signals that the crawl was started, not that it finished
 */
@GetMapping("/knowledge/doCrawImportNewKnowledge")
fun doCrawImportNewKnowledge(): String {
    val crawlWorker = Thread { CrawKnowledgeService.doCrawImportNewKnowledge() }
    crawlWorker.start()
    return "DONE"
}
51+
52+
/**
 * Starts the CNBlog crawl on a separate thread and answers immediately.
 *
 * @return the literal "DONE"; it only signals that the crawl was started, not that it finished
 */
@GetMapping("/knowledge/doCrawCNBlogKnowledge")
fun doCrawCNBlogKnowledge(): String {
    val crawlWorker = Thread { CrawKnowledgeService.doCrawCNBlogKnowledge() }
    crawlWorker.start()
    return "DONE"
}
60+
61+
/**
 * Starts the InfoQ crawl on a separate thread and answers immediately.
 *
 * @return the literal "DONE"; it only signals that the crawl was started, not that it finished
 */
@GetMapping("/knowledge/doCrawInfoQKnowledge")
fun doCrawInfoQKnowledge(): String {
    val crawlWorker = Thread { CrawKnowledgeService.doCrawInfoQKnowledge() }
    crawlWorker.start()
    return "DONE"
}
69+
4370
}

src/main/kotlin/com/light/saber/service/CrawKnowledgeService.kt

Lines changed: 191 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,197 @@ class CrawKnowledgeService {
6262
}
6363
}
6464

65+
/**
 * Submits one crawl task per ImportNew listing page (pages 1 through 135) to [CommonPool].
 *
 * The function returns once all tasks are submitted; it does not wait for them to finish.
 * A failure while crawling a page is reported on stderr and leaves the other pages untouched.
 */
fun doCrawImportNewKnowledge() {
    for (page in 1..135) {
        launch(CommonPool) {
            // The guard must live inside the coroutine: a try/catch wrapped around
            // launch() never observes exceptions thrown by the crawl itself.
            try {
                crawImportNew(page)
            } catch (e: Exception) {
                System.err.println("Crawling ImportNew page $page failed: $e")
            }
        }
    }
}
76+
77+
/**
 * Submits one crawl task per ITEye listing page (pages 1 through 10000) to [CommonPool].
 *
 * The function returns once all tasks are submitted; it does not wait for them to finish.
 * A failure while crawling a page is reported on stderr and leaves the other pages untouched.
 */
fun doCrawITEyeKnowledge() {
    for (page in 1..10000) {
        launch(CommonPool) {
            // The guard must live inside the coroutine: a try/catch wrapped around
            // launch() never observes exceptions thrown by the crawl itself.
            try {
                crawITEye(page)
            } catch (e: Exception) {
                System.err.println("Crawling ITEye page $page failed: $e")
            }
        }
    }
}
88+
89+
/**
 * Submits one crawl task per CNBlog listing page (pages 1 through 200) to [CommonPool].
 *
 * The function returns once all tasks are submitted; it does not wait for them to finish.
 * A failure while crawling a page is reported on stderr and leaves the other pages untouched.
 */
fun doCrawCNBlogKnowledge() {
    for (page in 1..200) {
        launch(CommonPool) {
            // The guard must live inside the coroutine: a try/catch wrapped around
            // launch() never observes exceptions thrown by the crawl itself.
            try {
                crawCNBlog(page)
            } catch (e: Exception) {
                System.err.println("Crawling CNBlog page $page failed: $e")
            }
        }
    }
}
100+
101+
/**
 * Submits one crawl task per InfoQ listing page (page indices 0 through 40) to [CommonPool].
 *
 * The function returns once all tasks are submitted; it does not wait for them to finish.
 * A failure while crawling a page is reported on stderr and leaves the other pages untouched.
 */
fun doCrawInfoQKnowledge() {
    for (page in 0..40) {
        launch(CommonPool) {
            // The guard must live inside the coroutine: a try/catch wrapped around
            // launch() never observes exceptions thrown by the crawl itself.
            try {
                crawInfoQ(page)
            } catch (e: Exception) {
                System.err.println("Crawling InfoQ page $page failed: $e")
            }
        }
    }
}
112+
113+
/**
 * Crawls one InfoQ (cn) Java article listing page and saves every article whose URL
 * is not yet known to [KnowledgeDao].
 *
 * A listing entry that cannot be parsed, fetched or saved is reported on stderr and
 * skipped; the remaining entries of the page are still processed.
 *
 * @param page listing page index; the listing URL ends in `page * 12`
 *             (presumably an article offset with 12 articles per page — verify against the site)
 */
private fun crawInfoQ(page: Int) {
    val pageUrl = "http://www.infoq.com/cn/java/articles/${page * 12}"
    val listingHtml = CrawlerWebClient.getPageHtmlText(pageUrl)
    // Parse with the page URL as base URI so that relative links can be resolved below.
    val listing = Jsoup.parse(listingHtml, pageUrl)
    // Each entry is expected to hold its title link at child(1).child(0), e.g.
    // <a href="/cn/articles/Reactive-Systems-Akka-Actors-DomainDrivenDesign" title="...">...</a>
    for (entry in listing.getElementsByClass("news_type2 full_screen")) {
        try {
            val link = entry.child(1).child(0)
            // The listing uses site-relative hrefs; the article fetch needs the absolute URL.
            val url = link.absUrl("href")
            if (url.isEmpty() || KnowledgeDao.countByUrl(url) != 0) continue
            val title = link.html()
            val articleHtml = CrawlerWebClient.getPageHtmlText(url)
            val article = Jsoup.parse(articleHtml)
            val content = 获取InfoQ文章内容(article)
            println(title)
            println(url)
            doSaveKnowledge(
                url = url,
                title = title,
                content = content
            )
        } catch (e: Exception) {
            // Best effort: one broken entry must not stop the rest of the page.
            System.err.println("Skipping InfoQ entry on $pageUrl: $e")
        }
    }
}
141+
142+
/**
 * Extracts the article body markup from a parsed InfoQ article page.
 *
 * Uses get(0) on the match list, so a page without a matching element raises
 * instead of yielding null.
 *
 * @param infoQ文章Document parsed article page, may be null
 * @return inner HTML of the first element matched by class "text_info text_info_article",
 *         or null when the document is null
 */
private fun 获取InfoQ文章内容(infoQ文章Document: Document?): String? {
    val page = infoQ文章Document ?: return null
    val bodyBlocks = page.getElementsByClass("text_info text_info_article")
    return bodyBlocks?.get(0)?.html()
}
145+
146+
/**
 * Crawls one CNBlog home listing page and saves every post whose URL is not yet
 * known to [KnowledgeDao].
 *
 * A post that cannot be fetched, parsed or saved is reported on stderr and skipped;
 * the remaining posts of the page are still processed.
 *
 * @param page listing page number appended to the listing URL
 */
private fun crawCNBlog(page: Int) {
    // NOTE(review): "#p$page" is a URL fragment, which HTTP clients do not send to the
    // server. Unless CrawlerWebClient executes the page's scripts, every page number may
    // return the same listing — confirm, and consider a path-based pagination URL instead.
    val pageUrl = "https://www.cnblogs.com/#p$page"
    val listingHtml = CrawlerWebClient.getPageHtmlText(pageUrl)
    val listing = Jsoup.parse(listingHtml)
    // Each post title is a link of the form
    // <a class="titlelnk" href="https://www.cnblogs.com/<user>/p/<id>.html" target="_blank">title</a>
    for (link in listing.getElementsByClass("titlelnk")) {
        try {
            val url = link.attr("href")
            // A link without an href cannot be fetched; skip it instead of failing later.
            if (url.isBlank() || KnowledgeDao.countByUrl(url) != 0) continue
            val title = link.html()
            val articleHtml = CrawlerWebClient.getPageHtmlText(url)
            val article = Jsoup.parse(articleHtml)
            val content = 获取CNBlog文章内容(article)
            println(title)
            println(url)
            doSaveKnowledge(
                url = url,
                title = title,
                content = content
            )
        } catch (e: Exception) {
            // Best effort: one broken post must not stop the rest of the page.
            System.err.println("Skipping CNBlog entry on $pageUrl: $e")
        }
    }
}
174+
175+
/**
 * Extracts the post body markup from a parsed CNBlog article page.
 *
 * @param cnBlog文章Document parsed article page, may be null
 * @return inner HTML of the element with id "cnblogs_post_body", or null when the
 *         document is null or the lookup yields no element
 */
private fun 获取CNBlog文章内容(cnBlog文章Document: Document?): String? {
    if (cnBlog文章Document == null) return null
    val postBody = cnBlog文章Document.getElementById("cnblogs_post_body")
    return postBody?.html()
}
178+
179+
/**
 * Crawls one ITEye "language" category listing page and saves every blog post whose
 * URL is not yet known to [KnowledgeDao].
 *
 * An entry that does not have the expected link structure, or that cannot be fetched,
 * parsed or saved, is reported on stderr and skipped; the remaining entries of the
 * page are still processed.
 *
 * @param page listing page number passed as the `page` query parameter
 */
private fun crawITEye(page: Int) {
    val pageUrl = "http://www.iteye.com/blogs/category/language?page=$page"
    val listingHtml = CrawlerWebClient.getPageHtmlText(pageUrl)
    val listing = Jsoup.parse(listingHtml)
    // Each entry is expected to hold its title link at child(0).child(0), e.g.
    // <a href="http://<user>.iteye.com/blog/<id>" title="..." target="_blank">title</a>
    for (entry in listing.getElementsByClass("content")) {
        try {
            // Kept inside the guard: an element with class "content" but without the
            // nested link would otherwise abort the whole page.
            val link = entry.child(0).child(0)
            val url = link.attr("href")
            if (url.isBlank() || KnowledgeDao.countByUrl(url) != 0) continue
            val title = link.html()
            val articleHtml = CrawlerWebClient.getPageHtmlText(url)
            val article = Jsoup.parse(articleHtml)
            val content = 获取ITEye文章内容(article)
            println(title)
            println(url)
            doSaveKnowledge(
                url = url,
                title = title,
                content = content
            )
        } catch (e: Exception) {
            // Best effort: one broken entry must not stop the rest of the page.
            System.err.println("Skipping ITEye entry on $pageUrl: $e")
        }
    }
}
208+
209+
/**
 * Extracts the post body markup from a parsed ITEye blog page.
 *
 * @param itEye文章Document parsed blog page, may be null
 * @return inner HTML of the element with id "blog_content", or null when the
 *         document is null or the lookup yields no element
 */
private fun 获取ITEye文章内容(itEye文章Document: Document?): String? {
    if (itEye文章Document == null) return null
    val postBody = itEye文章Document.getElementById("blog_content")
    return postBody?.html()
}
212+
213+
/**
 * Crawls one ImportNew "all posts" listing page and saves every article whose URL is
 * not yet known to [KnowledgeDao]. Title and content are both read from the article page.
 *
 * An article that cannot be fetched, parsed or saved is reported on stderr and skipped;
 * the remaining articles of the page are still processed.
 *
 * @param page listing page number appended to the listing URL
 */
private fun crawImportNew(page: Int) {
    val pageUrl = "http://www.importnew.com/all-posts/page/$page"
    val listingHtml = CrawlerWebClient.getPageHtmlText(pageUrl)
    val listing = Jsoup.parse(listingHtml)
    // Each article is linked as
    // <a class="meta-title" target="_blank" href="http://www.importnew.com/<id>.html" title="...">title</a>
    for (link in listing.getElementsByClass("meta-title")) {
        try {
            val url = link.attr("href")
            // A link without an href cannot be fetched; skip it instead of failing later.
            if (url.isBlank() || KnowledgeDao.countByUrl(url) != 0) continue
            val articleHtml = CrawlerWebClient.getPageHtmlText(url)
            val article = Jsoup.parse(articleHtml)
            val title = 获取ImportNew文章标题(article)
            val content = 获取ImportNew文章内容(article)
            println(title)
            println(url)
            doSaveKnowledge(
                url = url,
                title = title,
                content = content
            )
        } catch (e: Exception) {
            // Best effort: one broken article must not stop the rest of the page.
            System.err.println("Skipping ImportNew entry on $pageUrl: $e")
        }
    }
}
241+
242+
/**
 * Extracts the article body markup from a parsed ImportNew article page.
 *
 * Uses get(0) on the match list, so a page without an "entry" element raises
 * instead of yielding null.
 *
 * @param importNew文章Document parsed article page, may be null
 * @return inner HTML of the first element with class "entry", or null when the document is null
 */
private fun 获取ImportNew文章内容(importNew文章Document: Document?): String? {
    val page = importNew文章Document ?: return null
    val entryBlocks = page.getElementsByClass("entry")
    return entryBlocks?.get(0)?.html()
}
246+
247+
/**
 * Extracts the article title from a parsed ImportNew article page.
 *
 * The title is read from the first child of the first "entry-header" element; the
 * expected markup is <div class="entry-header"><h1>title</h1></div>.
 * Uses get(0) and child(0), so a page without that structure raises instead of
 * yielding null.
 *
 * @param importNew文章Document parsed article page, may be null
 * @return inner HTML of the header's first child, or null when the document is null
 */
private fun 获取ImportNew文章标题(importNew文章Document: Document?): String? {
    val page = importNew文章Document ?: return null
    val headers = page.getElementsByClass("entry-header")
    val heading = headers?.get(0)?.child(0)
    return heading?.html()
}
255+
65256
private fun crawOSChina(page: Int) {
66257
val pageUrl = "https://www.oschina.net/action/ajax/get_more_recommend_blog?classification=0&p=$page"
67258
val 文章列表HTML = CrawlerWebClient.getPageHtmlText(pageUrl)
@@ -101,8 +292,6 @@ class CrawKnowledgeService {
101292
} catch (e: Exception) {
102293

103294
}
104-
105-
106295
}
107296
}
108297
}

src/main/resources/templates/common/head.ftl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@
4949
<dd><a href="/knowledge/doCrawJianShu" target="_blank">抓取简书</a></dd>
5050
<dd><a href="/knowledge/doCrawSegmentFaultKnowledge" target="_blank">抓取SegmentFault</a></dd>
5151
<dd><a href="/knowledge/doCrawOSChinaKnowledge" target="_blank">抓取OSChina</a></dd>
52+
<dd><a href="/knowledge/doCrawImportNewKnowledge" target="_blank">抓取ImportNew</a></dd>
53+
<dd><a href="/knowledge/doCrawCNBlogKnowledge" target="_blank">抓取CNBlog</a></dd>
54+
<dd><a href="/knowledge/doCrawInfoQKnowledge" target="_blank">抓取InfoQ</a></dd>
5255
<dd><a href="">超链接</a></dd>
5356
</dl>
5457
</li>

0 commit comments

Comments
 (0)