@@ -18,6 +18,17 @@ class CrawKnowledgeService {
18
18
@Autowired
19
19
lateinit var CrawSourceDao : CrawSourceDao
20
20
21
+
22
/**
 * Crawls the block-chain article list, pages 0 through 40 inclusive.
 *
 * Every page is handed to [crawBlockChain] on its own. A failure on one page is
 * reported and the loop moves on to the next page, so a single bad page does not
 * stop the rest of the crawl.
 */
fun doCrawBlockChainKnowledge() {
    for (page in 0..40) {
        try {
            crawBlockChain(page)
        } catch (e: Exception) {
            // Best-effort crawl: surface the failure instead of hiding it, then keep going.
            println("crawBlockChain failed for page $page: $e")
        }
    }
}
31
+
21
32
fun doCrawJianShuKnowledge () {
22
33
val 简书专题URLs = CrawSourceDao .findJianShu()
23
34
简书专题URLs .forEach {
@@ -94,6 +105,36 @@ class CrawKnowledgeService {
94
105
}
95
106
}
96
107
108
/**
 * Crawls one page of the blockchainbrother.com article list and saves each listed
 * article whose URL is not yet known to [KnowledgeDao].
 *
 * Failures while fetching or saving a single article are reported and do not stop
 * the remaining articles on the page. Failures while fetching or parsing the list
 * page itself propagate to the caller.
 *
 * @param page page number placed in the `page` query parameter of the list URL
 */
private fun crawBlockChain(page: Int) {
    val pageUrl = "http://www.blockchainbrother.com/articles?page=${page}"
    val 文章列表HTML = CrawlerWebClient.getPageHtmlText(pageUrl)
    val document = Jsoup.parse(文章列表HTML)

    // Each element with class "title" is expected to hold the article link as its first child.
    document.getElementsByClass("title").forEach { titleElement ->
        // Skip entries without a child instead of letting child(0) throw and abort the whole page.
        val link = titleElement.children().firstOrNull() ?: return@forEach
        val url = link.attr("href")
        val title = link.html()

        // attr() yields an empty string when the attribute is missing; nothing to fetch then.
        if (url.isBlank()) return@forEach
        // NOTE(review): if the site emits relative hrefs, url must be resolved against the
        // site root before it is fetched or stored — confirm against the live markup.

        if (KnowledgeDao.countByUrl(url) == 0) {
            try {
                val BlockChain文章HTML = CrawlerWebClient.getPageHtmlText(url)
                val BlockChain文章Document = Jsoup.parse(BlockChain文章HTML)
                val content = 获取BlockChain文章内容(BlockChain文章Document)
                if (content == null) {
                    // Do not store an article record that has no body.
                    println("No article content found, skipping: $url")
                    return@forEach
                }
                println(title)
                println(url)
                doSaveKnowledge(
                    url = url,
                    title = title,
                    content = content
                )
            } catch (e: Exception) {
                // Best-effort per article: report and continue with the next entry.
                println("Failed to crawl article $url: $e")
            }
        }
    }
}
136
+
137
+
97
138
private fun crawInfoQ (page : Int ) {
98
139
val pageUrl = " http://www.infoq.com/cn/java/articles/${page * 12 } "
99
140
val 文章列表HTML = CrawlerWebClient .getPageHtmlText(pageUrl)
@@ -364,6 +405,10 @@ class CrawKnowledgeService {
364
405
365
406
}
366
407
408
/**
 * Extracts the article body HTML from a parsed article page.
 *
 * @param infoQ文章Document parsed article page; may be null
 * @return inner HTML of the first element with class `widget-article`, or null when
 *         the document is null or contains no such element
 */
private fun 获取BlockChain文章内容(infoQ文章Document: Document?): String? {
    // firstOrNull() instead of get(0): get(0) throws IndexOutOfBoundsException when
    // nothing matches, which the safe-call chain does not protect against.
    return infoQ文章Document?.getElementsByClass("widget-article")?.firstOrNull()?.html()
}
411
+
367
412
private fun 获取简书文章内容(jianShuArticleDocument : Document ? ): String? {
368
413
// document.getElementsByClassName('article')[0].children
369
414
// HTMLCollection(3) [h1.title, div.author, div.show-content]
0 commit comments