Skip to content

Commit fdf0615

Browse files
author
jack
committed
KnowledgeCrawlerOfSpring4All
1 parent 9a29483 commit fdf0615

File tree

10 files changed

+139
-16
lines changed

10 files changed

+139
-16
lines changed

build.gradle

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ dependencies {
5959

6060
compile group: 'org.jetbrains.kotlinx', name: 'kotlinx-coroutines-core', version: '0.19.2'
6161

62+
// https://mvnrepository.com/artifact/com.alibaba/fastjson
63+
compile group: 'com.alibaba', name: 'fastjson', version: '1.2.47'
6264

6365

6466
}

src/main/kotlin/com/light/saber/controller/KnowledgeCrawController.kt

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package com.light.saber.controller
22

3+
import com.light.saber.crawler.KnowledgeCrawlerOfSpring4All
34
import com.light.saber.service.CrawKnowledgeService
45
import org.springframework.beans.factory.annotation.Autowired
56
import org.springframework.web.bind.annotation.GetMapping
@@ -12,6 +13,10 @@ class KnowledgeCrawController {
1213
lateinit var CrawKnowledgeService: CrawKnowledgeService
1314

1415

16+
@Autowired
17+
lateinit var KnowledgeCrawlerOfSpring4All: KnowledgeCrawlerOfSpring4All
18+
19+
1520
@GetMapping("/knowledge/doCrawJianShu")
1621
fun doCrawJianShu(): String {
1722
Thread {
@@ -74,5 +79,25 @@ class KnowledgeCrawController {
7479

7580
return "DONE"
7681
}
82+
83+
84+
/**
 * Kicks off the block-chain knowledge crawl on a freshly started thread.
 *
 * @return the literal `"DONE"`, returned as soon as the thread has been started
 *         (the crawl itself is not awaited).
 */
@GetMapping("/knowledge/doCrawBlockChainKnowledge")
fun doCrawBlockChainKnowledge(): String {
    // Run the crawl off the request thread so the HTTP call returns immediately.
    val crawlJob = Runnable { CrawKnowledgeService.doCrawBlockChainKnowledge() }
    Thread(crawlJob).start()
    return "DONE"
}
92+
93+
94+
/**
 * Kicks off the Spring4All crawl on a freshly started thread.
 *
 * @return the literal `"DONE"`, returned as soon as the thread has been started
 *         (the crawl itself is not awaited).
 */
@GetMapping("/knowledge/KnowledgeCrawlerOfSpring4All")
fun KnowledgeCrawlerOfSpring4All(): String {
    // Run the crawl off the request thread so the HTTP call returns immediately.
    val crawlJob = Runnable { KnowledgeCrawlerOfSpring4All.doCraw() }
    Thread(crawlJob).start()
    return "DONE"
}
77102

78103
}
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
package com.light.saber.crawler
2+
3+
import com.alibaba.fastjson.JSON
4+
import com.alibaba.fastjson.JSONArray
5+
import com.light.saber.service.KnowledgeService
6+
import com.light.saber.webclient.CrawlerWebClient
7+
import org.jsoup.Jsoup
8+
import org.jsoup.nodes.Element
9+
import org.springframework.beans.factory.annotation.Autowired
10+
import org.springframework.beans.factory.annotation.Qualifier
11+
import org.springframework.scheduling.annotation.Scheduled
12+
import org.springframework.stereotype.Service
13+
import java.net.URL
14+
15+
/**
 * Crawler for spring4all.com.
 *
 * Reads the site's paged JSON article listing, fetches every listed article page,
 * extracts the article body and hands it to [KnowledgeService] for storage.
 */
@Service
class KnowledgeCrawlerOfSpring4All : KnowledgeJsonCrawler {
    @Autowired
    @Qualifier("knowledgeService")
    lateinit var KnowledgeService: KnowledgeService

    @Autowired
    lateinit var CrawlerWebClient: CrawlerWebClient

    /** Returns the combined inner HTML of all elements under [e] that carry the CSS class `fmt`. */
    override fun getArticleBody(e: Element): String = e.getElementsByClass("fmt").html()

    /** Builds the URL of the JSON listing for the given [page] number. */
    override fun pageUrls(page: Int): String = "http://www.spring4all.com/common/articles/$page"

    /** Downloads [url] and returns the response body decoded as UTF-8. */
    override fun getPageJson(url: String): String = URL(url).readText(Charsets.UTF_8)

    /**
     * Crawls listing pages [FIRST_PAGE]..[LAST_PAGE]; also scheduled to run at 00:30:00 every day.
     *
     * A failure on one listing page (network error, malformed JSON, ...) is reported on
     * stderr and the crawl continues with the next page instead of aborting the whole run.
     */
    @Scheduled(cron = "0 30 0 1/1 * ?")
    override fun doCraw() {
        for (page in FIRST_PAGE..LAST_PAGE) {
            val pageUrl = pageUrls(page)
            try {
                crawlListingPage(pageUrl)
            } catch (e: Exception) {
                // Best effort: one broken page must not stop the remaining pages.
                System.err.println("Failed to crawl listing page $pageUrl")
                e.printStackTrace()
            }
        }
    }

    /** Parses the listing JSON at [pageUrl] and saves every article it references. */
    private fun crawlListingPage(pageUrl: String) {
        // Expected shape: { "data": { "list": [ { "id": ..., "title": ... }, ... ] } }.
        // Anything else is treated as "no articles on this page" rather than a cast failure.
        val root = JSON.parse(getPageJson(pageUrl)) as? Map<*, *> ?: return
        val data = root["data"] as? Map<*, *> ?: return
        val articles = data["list"] as? JSONArray ?: return

        for (entry in articles) {
            val article = entry as? Map<*, *> ?: continue
            val articleId = article["id"] ?: continue
            val articleTitle = article["title"] as? String ?: continue
            val articleUrl = "$ARTICLE_URL_PREFIX$articleId"

            // getPageHtmlText returns null when the page could not be loaded;
            // Jsoup.parse must not be called with null, so skip such articles.
            val articleHtml = CrawlerWebClient.getPageHtmlText(articleUrl)
            if (articleHtml == null) {
                System.err.println("Skipped $articleUrl: page could not be fetched")
                continue
            }

            val articleBody = getArticleBody(Jsoup.parse(articleHtml))
            println(articleTitle)
            println(articleUrl)
            KnowledgeService.doSaveKnowledge(articleUrl, articleTitle, articleBody)
        }
    }

    private companion object {
        /** First listing page requested by [doCraw]. */
        const val FIRST_PAGE = 1

        /** Last listing page requested by [doCraw]. */
        const val LAST_PAGE = 62

        /** Prefix that, followed by an article id, forms the article page URL. */
        const val ARTICLE_URL_PREFIX = "http://www.spring4all.com/article/"
    }
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
package com.light.saber.crawler
2+
3+
import org.jsoup.nodes.Element
4+
5+
/**
 * Contract for crawlers whose article listing is served as paged JSON.
 */
interface KnowledgeJsonCrawler {
    /** Returns the URL of the listing for the given [page] number. */
    fun pageUrls(page: Int): String

    /** Returns the JSON text obtained from [url]. */
    fun getPageJson(url: String): String

    /** Extracts the article body from [e] and returns it as a string. */
    fun getArticleBody(e: Element): String

    /** Runs the crawl. */
    fun doCraw()
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package com.light.saber.crawler
2+
3+
import org.jsoup.nodes.Element
4+
import org.jsoup.select.Elements
5+
import org.springframework.beans.factory.annotation.Autowired
6+
import org.springframework.util.StringUtils
7+
import java.util.*
8+
9+
/**
 * Contract for crawlers that work on HTML pages.
 *
 * NOTE(review): no implementation is visible here; the `className` parameters are
 * presumably CSS class names used to locate elements - confirm against implementers.
 */
interface KnowledgeWebCrawler {
    /** Returns the URL for the given [page] number. */
    fun pageUrls(page: Int): String

    /** Returns the elements obtained from [url] for [className]. */
    fun getArticleListDocument(url: String, className: String): Elements

    /** Returns the article URL derived from [e] and [className]. */
    fun getArticleUrl(e: Element, className: String): String

    /** Returns the article title derived from [e] and [className]. */
    fun getArticleTitle(e: Element, className: String): String

    /** Extracts the article body from [e] and returns it as a string. */
    fun getArticleBody(e: Element): String

    /** Runs the crawl. */
    fun doCraw()
}

src/main/kotlin/com/light/saber/service/CrawKnowledgeService.kt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ class CrawKnowledgeService {
1717
lateinit var KnowledgeDao: KnowledgeDao
1818
@Autowired
1919
lateinit var CrawSourceDao: CrawSourceDao
20-
20+
@Autowired
21+
lateinit var CrawlerWebClient: CrawlerWebClient
2122

2223
fun doCrawBlockChainKnowledge() {
2324
for (page in 0..40) {

src/main/kotlin/com/light/saber/service/KnowledgeService.kt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,6 @@ import org.springframework.data.domain.Pageable
66

77
/**
 * Service-level operations on stored knowledge articles.
 */
interface KnowledgeService {
    /**
     * Returns one page of [Knowledge] entries.
     *
     * @param title optional title value; NOTE(review): presumably a search filter - confirm in the implementation
     * @param page paging request
     */
    fun page(title: String?, page: Pageable): Page<Knowledge>

    /**
     * Saves a crawled article.
     *
     * @param articleUrl address the article was fetched from
     * @param articleTitle title of the article
     * @param articleBody body of the article; may be null
     */
    fun doSaveKnowledge(articleUrl: String, articleTitle: String, articleBody: String?)
}

src/main/kotlin/com/light/saber/service/KnowledgeServiceImpl.kt

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,27 @@ import org.springframework.data.domain.Page
77
import org.springframework.data.domain.Pageable
88
import org.springframework.stereotype.Service
99
import org.springframework.util.StringUtils
10+
import java.util.*
1011

11-
@Service
12+
@Service("knowledgeService")
1213
class KnowledgeServiceImpl : KnowledgeService {
14+
/**
 * Persists a crawled article as a new [Knowledge] record.
 *
 * Nothing is saved when [url] or [title] is empty, or when [content] is null or empty.
 * A failure while saving is printed to stderr and not rethrown.
 *
 * @param url address the article was fetched from; only validated here
 * @param title article title
 * @param content article body, may be null
 */
override fun doSaveKnowledge(url: String, title: String, content: String?) {
    if (url.isEmpty() || title.isEmpty() || content == null || content.isEmpty()) {
        return
    }

    // One timestamp for both fields so a new record has gmtCreate == gmtModified.
    val now = Date()
    val knowledge = Knowledge()
    // NOTE(review): `url` is validated above but never written to the entity -
    // confirm whether Knowledge has a URL field that should be populated here.
    knowledge.title = title
    knowledge.content = content
    knowledge.gmtCreate = now
    knowledge.gmtModified = now
    try {
        KnowledgeDao.save(knowledge)
    } catch (e: Exception) {
        // Best effort: a failed save must not interrupt the caller's crawl loop.
        e.printStackTrace()
    }
}
30+
1331
@Autowired lateinit var KnowledgeDao: KnowledgeDao
1432
override fun page(title: String?, page: Pageable): Page<Knowledge> {
1533
if (StringUtils.isEmpty(title)) {

src/main/kotlin/com/light/saber/webclient/CrawlerWebClient.kt

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,30 +3,27 @@ package com.light.saber.webclient
33

44
import com.gargoylesoftware.htmlunit.WebClient
55
import com.gargoylesoftware.htmlunit.html.HtmlPage
6+
import org.springframework.stereotype.Service
67

7-
8-
object CrawlerWebClient {
9-
private var webClient: WebClient? = null
8+
@Service
9+
class CrawlerWebClient {
1010

1111
private fun instanceWebClient(javaScriptTimeout: Long): WebClient {
12-
if (webClient == null) {
13-
webClient = WebClient()
14-
}
12+
val webClient = WebClient()
1513
if (javaScriptTimeout > 0) {
16-
webClient?.javaScriptTimeout = javaScriptTimeout
14+
webClient.javaScriptTimeout = javaScriptTimeout
1715
}
18-
webClient?.options?.isJavaScriptEnabled = true //启用JS解释器,默认为true
19-
webClient?.options?.isCssEnabled = false
20-
webClient?.options?.isThrowExceptionOnScriptError = false //js运行错误时,是否抛出异常
21-
webClient?.options?.isUseInsecureSSL = true
16+
webClient.options.isJavaScriptEnabled = true //启用JS解释器,默认为true
17+
webClient.options.isCssEnabled = false
18+
webClient.options.isThrowExceptionOnScriptError = false //js运行错误时,是否抛出异常
19+
webClient.options.isUseInsecureSSL = true
2220
return webClient as WebClient
2321
}
2422

25-
@Synchronized
2623
fun getPageHtmlText(url: String): String? {
27-
webClient = instanceWebClient(3000)
24+
val webClient = instanceWebClient(3000)
2825
return try {
29-
webClient?.getPage<HtmlPage>(url)?.asXml()
26+
webClient.getPage<HtmlPage>(url).asXml()
3027
} catch (e: Exception) {
3128
null
3229
}

src/main/resources/templates/common/head.ftl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@
5858
<dd><a href="/knowledge/doCrawImportNewKnowledge" target="_blank">抓取ImportNew</a></dd>
5959
<dd><a href="/knowledge/doCrawCNBlogKnowledge" target="_blank">抓取CNBlog</a></dd>
6060
<dd><a href="/knowledge/doCrawInfoQKnowledge" target="_blank">抓取InfoQ</a></dd>
61+
<dd><a href="/knowledge/doCrawBlockChainKnowledge" target="_blank">BlockChain</a></dd>
62+
<dd><a href="/knowledge/KnowledgeCrawlerOfSpring4All" target="_blank">Spring4All</a></dd>
6163
<dd><a href="">超链接</a></dd>
6264
</dl>
6365
</li>

0 commit comments

Comments
 (0)