Skip to content

Commit 4929ac4

Browse files
author
jack
committed
crawBlockChain
1 parent 56dd224 commit 4929ac4

File tree

1 file changed

+45
-0
lines changed

1 file changed

+45
-0
lines changed

src/main/kotlin/com/light/saber/service/CrawKnowledgeService.kt

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,17 @@ class CrawKnowledgeService {
1818
@Autowired
1919
lateinit var CrawSourceDao: CrawSourceDao
2020

21+
22+
/**
 * Crawls the block-chain article list pages 0 through 40 (inclusive), one page at a time.
 *
 * The crawl is best-effort: when a page fails, the failure is printed and the loop
 * continues with the next page, so one bad page does not stop the whole run.
 */
fun doCrawBlockChainKnowledge() {
    for (page in 0..40) {
        try {
            crawBlockChain(page)
        } catch (e: Exception) {
            // Keep crawling, but leave a trace instead of silently hiding the failure.
            println("crawBlockChain failed for page $page: $e")
        }
    }
}
31+
2132
fun doCrawJianShuKnowledge() {
2233
val 简书专题URLs = CrawSourceDao.findJianShu()
2334
简书专题URLs.forEach {
@@ -94,6 +105,36 @@ class CrawKnowledgeService {
94105
}
95106
}
96107

108+
/**
 * Fetches one article list page from blockchainbrother.com and saves every listed
 * article whose URL is not stored yet.
 *
 * For each element with class "title", the first child supplies the article URL
 * (its `href` attribute) and the title (its inner HTML). Entries without a child
 * element or without an href are skipped. An article is fetched and saved only when
 * `KnowledgeDao.countByUrl(url)` is 0.
 *
 * A failure while fetching, parsing or saving a single article is printed and does
 * not prevent the remaining articles on the page from being processed. A failure
 * while fetching the list page itself propagates to the caller.
 *
 * @param page page number substituted into the `page` query parameter of the list URL
 */
private fun crawBlockChain(page: Int) {
    val pageUrl = "http://www.blockchainbrother.com/articles?page=${page}"
    val 文章列表HTML = CrawlerWebClient.getPageHtmlText(pageUrl)
    val document = Jsoup.parse(文章列表HTML)

    for (titleElement in document.getElementsByClass("title")) {
        // A "title" element with no child would make child(0) throw and abandon
        // every remaining article on this page, so skip such entries instead.
        val link = titleElement.children().firstOrNull() ?: continue

        // NOTE(review): the href is used exactly as found in the page. If the site emits
        // relative links, the fetch below presumably fails — confirm against the real markup.
        val url = link.attr("href")
        if (url.isBlank()) continue
        val title = link.html()

        if (KnowledgeDao.countByUrl(url) == 0) {
            try {
                val BlockChain文章HTML = CrawlerWebClient.getPageHtmlText(url)
                val BlockChain文章Document = Jsoup.parse(BlockChain文章HTML)
                val content = 获取BlockChain文章内容(BlockChain文章Document)
                println(title)
                println(url)
                doSaveKnowledge(
                    url = url,
                    title = title,
                    content = content
                )
            } catch (e: Exception) {
                // Best-effort per article: report the failure and move on to the next one.
                println("Failed to crawl block-chain article $url: $e")
            }
        }
    }
}
136+
137+
97138
private fun crawInfoQ(page: Int) {
98139
val pageUrl = "http://www.infoq.com/cn/java/articles/${page * 12}"
99140
val 文章列表HTML = CrawlerWebClient.getPageHtmlText(pageUrl)
@@ -364,6 +405,10 @@ class CrawKnowledgeService {
364405

365406
}
366407

408+
/**
 * Extracts the article body from a parsed block-chain article page.
 *
 * @param infoQ文章Document the parsed article page; may be null
 * @return the inner HTML of the first element with class "widget-article",
 *         or null when the document is null
 *
 * NOTE(review): the first match is taken by index, so a page without any
 * "widget-article" element presumably makes this throw rather than return
 * null — confirm that callers rely on catching that.
 */
private fun 获取BlockChain文章内容(infoQ文章Document: Document?): String? {
    val articleDocument = infoQ文章Document ?: return null
    val articleBodies = articleDocument.getElementsByClass("widget-article")
    return articleBodies?.get(0)?.html()
}
411+
367412
private fun 获取简书文章内容(jianShuArticleDocument: Document?): String? {
368413
// document.getElementsByClassName('article')[0].children
369414
// HTMLCollection(3) [h1.title, div.author, div.show-content]

0 commit comments

Comments
 (0)