Skip to content

Commit 1a40ec1

Browse files
committed
获取简书文章/获取SegmentFault文章
1 parent 8e21f73 commit 1a40ec1

File tree

15 files changed

+409
-160
lines changed

15 files changed

+409
-160
lines changed

app.sql

Lines changed: 28 additions & 35 deletions
Large diffs are not rendered by default.

build.gradle

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,4 +52,13 @@ dependencies {
5252
runtime('org.springframework.boot:spring-boot-devtools')
5353
runtime('mysql:mysql-connector-java')
5454
testCompile('org.springframework.boot:spring-boot-starter-test')
55+
56+
compile 'com.squareup.okhttp3:okhttp:3.8.1'
57+
compile group: 'net.sourceforge.htmlunit', name: 'htmlunit', version: '2.27'
58+
compile group: 'org.jsoup', name: 'jsoup', version: '1.10.3'
59+
60+
compile group: 'org.jetbrains.kotlinx', name: 'kotlinx-coroutines-core', version: '0.19.2'
61+
62+
63+
5564
}

src/main/kotlin/com/light/saber/controller/KnowledgeController.kt

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@ import java.util.*
1515

1616
@Controller
1717
class KnowledgeController {
18-
@Autowired lateinit var KnowledgeDao: KnowledgeDao
19-
@Autowired lateinit var KnowledgeService: KnowledgeService
18+
@Autowired
19+
lateinit var KnowledgeDao: KnowledgeDao
20+
@Autowired
21+
lateinit var KnowledgeService: KnowledgeService
2022

2123
@GetMapping("/")
2224
fun root(@RequestParam(value = "title", required = false) title: String?,
@@ -34,16 +36,19 @@ class KnowledgeController {
3436
return "index"
3537
}
3638

39+
@GetMapping("toAddKnowledgePage")
40+
fun toAddKnowledgePage() = "add"
41+
3742
@PostMapping("/addKnowledge")
3843
@ResponseBody
3944
fun addKnowledge(knowledge: Knowledge): Result<String> {
4045
val title = knowledge.title
41-
val answer = knowledge.answer
46+
val content = knowledge.content
4247
if (StringUtils.isEmpty(title)) {
4348
return Result(title, "问题不能为空", false)
4449
} else if (title.length > 100) {
4550
return Result(title, "问题长度不能超过100", false)
46-
} else if (StringUtils.isEmpty(answer)) {
51+
} else if (StringUtils.isEmpty(content)) {
4752
return Result(title, "答案不能为空", false)
4853
} else if (isTitleExist(title)) {
4954
return Result(title, "问题已经存在,请换一个问题", false)
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package com.light.saber.controller
2+
3+
import com.light.saber.service.CrawKnowledgeService
4+
import org.springframework.beans.factory.annotation.Autowired
5+
import org.springframework.web.bind.annotation.GetMapping
6+
import org.springframework.web.bind.annotation.RestController
7+
8+
9+
/**
 * REST endpoints that trigger the article crawlers of [CrawKnowledgeService].
 *
 * Each endpoint calls the matching service method and then answers with the
 * literal text `DONE`.
 */
@RestController
class KnowledgeCrawController {
    @Autowired
    lateinit var CrawKnowledgeService: CrawKnowledgeService

    /** Triggers the JianShu crawl. */
    @GetMapping("/knowledge/doCrawJianShu")
    fun doCrawJianShu(): String =
            acknowledge { CrawKnowledgeService.doCrawJianShuKnowledge() }

    /** Triggers the SegmentFault crawl. */
    @GetMapping("/knowledge/doCrawSegmentFaultKnowledge")
    fun doCrawSegmentFaultKnowledge(): String =
            acknowledge { CrawKnowledgeService.doCrawSegmentFaultKnowledge() }

    /** Runs [job] and returns the fixed response body shared by both endpoints. */
    private inline fun acknowledge(job: () -> Unit): String {
        job()
        return "DONE"
    }
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package com.light.saber.dao
2+
3+
import com.light.saber.model.CrawSource
4+
import org.springframework.data.jpa.repository.JpaRepository
5+
import org.springframework.data.jpa.repository.Query
6+
7+
/**
 * Spring Data JPA repository for [CrawSource] entities.
 */
interface CrawSourceDao : JpaRepository<CrawSource, Long> {

    /**
     * Returns all sources whose `type` equals `JIAN_SHU`, ordered by id descending.
     */
    @Query(value = "select a from #{#entityName} a where a.type = 'JIAN_SHU' order by a.id desc")
    fun findJianShu(): List<CrawSource>
}

src/main/kotlin/com/light/saber/dao/KnowledgeDao.kt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import org.springframework.data.jpa.repository.Query
88

99
interface KnowledgeDao : JpaRepository<Knowledge, Long> {
1010

11-
// JPQL里面没有limit
11+
// JPQL里面没有limit
1212
@Query(value = "SELECT * FROM knowledge WHERE title = :title limit 1", nativeQuery = true)
1313
fun selectByTitle(title: String): Knowledge?
1414

@@ -18,4 +18,7 @@ interface KnowledgeDao : JpaRepository<Knowledge, Long> {
1818
@Query("SELECT a FROM #{#entityName} a")
1919
fun page(page: Pageable): Page<Knowledge>
2020

21+
@Query("select count(*) from #{#entityName} a where a.url = :url")
22+
abstract fun countByUrl(url: String): Int
23+
2124
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package com.light.saber.model
2+
3+
import javax.persistence.*
4+
5+
/**
 * JPA entity describing one crawl entry point: a source type tag plus the URL to start from.
 */
@Entity
class CrawSource {
    /** Database-generated primary key. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    var id: Long = 0L

    /** Source type tag, at most 10 characters (for example `JIAN_SHU`). */
    @Column(length = 10)
    var type: String = ""

    /** Start URL of the source; unique, at most 200 characters. */
    @Column(length = 200, unique = true)
    var url: String = ""
}

src/main/kotlin/com/light/saber/model/Knowledge.kt

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,15 @@ class Knowledge {
1010
@GeneratedValue(strategy = GenerationType.IDENTITY)
1111
var id: Long = 0
1212

13-
@Column(length = 100, unique = true)
13+
@Column(length = 200, unique = true, nullable = false)
1414
var title = ""
15+
16+
@Column(length = 200, unique = true, nullable = false)
17+
var url = ""
18+
1519
@Lob
16-
var answer = ""
20+
@Column(nullable = false)
21+
var content = ""
1722

1823
@JsonFormat(pattern = "yyyy-MM-dd HH:mm:ss", locale = "GMT+8")
1924
var gmtCreate = Date()
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
package com.light.saber.service
2+
3+
import com.light.saber.dao.CrawSourceDao
4+
import com.light.saber.dao.KnowledgeDao
5+
import com.light.saber.model.Knowledge
6+
import com.light.saber.webclient.CrawlerWebClient
7+
import kotlinx.coroutines.experimental.CommonPool
8+
import kotlinx.coroutines.experimental.launch
9+
import org.jsoup.Jsoup
10+
import org.jsoup.nodes.Document
11+
import org.jsoup.nodes.Element
12+
import org.springframework.beans.factory.annotation.Autowired
13+
import org.springframework.stereotype.Service
14+
15+
/**
 * Crawls article listings on JianShu and SegmentFault and stores every article
 * whose URL is not yet known as a [Knowledge] row.
 *
 * Both public entry points only start coroutines on [CommonPool] and return
 * without waiting for them. Failures are handled per page and per article:
 * a page that cannot be fetched or a list item with unexpected markup is
 * reported on stderr and skipped, so one bad item no longer aborts the rest of
 * the crawl.
 */
@Service
class CrawKnowledgeService {
    @Autowired
    lateinit var KnowledgeDao: KnowledgeDao
    @Autowired
    lateinit var CrawSourceDao: CrawSourceDao

    /**
     * Starts one coroutine per JianShu topic returned by [CrawSourceDao.findJianShu];
     * each coroutine walks listing pages 1..[JIAN_SHU_LAST_PAGE] of its topic in order.
     */
    fun doCrawJianShuKnowledge() {
        CrawSourceDao.findJianShu().forEach { source ->
            launch(CommonPool) {
                for (page in 1..JIAN_SHU_LAST_PAGE) {
                    crawJianShuArticles(page, source.url)
                }
            }
        }
    }

    /**
     * Starts one coroutine per SegmentFault blog listing page 1..[SEGMENT_FAULT_LAST_PAGE].
     */
    fun doCrawSegmentFaultKnowledge() {
        for (page in 1..SEGMENT_FAULT_LAST_PAGE) {
            launch(CommonPool) {
                crawSegmentFault(page)
            }
        }
    }

    /** Crawls one SegmentFault blog listing page and saves the articles that are new. */
    private fun crawSegmentFault(page: Int) {
        val listing = fetchDocument("https://segmentfault.com/blogs?page=$page") ?: return
        // Browser equivalent: document.getElementsByClassName('blog-stream')[0].children
        val stream = listing.getElementsByClass("blog-stream").firstOrNull() ?: return
        stream.children().forEach { item ->
            saveArticleIfNew(
                    item = item,
                    // Browser equivalent: item.children[1].children[0].children[0]
                    // e.g. <a href="/a/1190000000270453">...</a>
                    urlOf = { "https://segmentfault.com" + it.child(1).child(0).child(0).attr("href") },
                    titleOf = { getSegmentFaultTitle(it) },
                    contentOf = { getSegmentFaultContent(it) }
            )
        }
    }

    /** Inner HTML of the elements with class `article__content`. */
    private fun getSegmentFaultContent(article: Document): String? {
        return article.getElementsByClass("article__content").html()
    }

    /** Inner HTML of the first child of `#articleTitle`, or null when that element is absent. */
    private fun getSegmentFaultTitle(article: Document): String? {
        // Browser equivalent: document.getElementById('articleTitle').children[0].innerHTML
        return article.getElementById("articleTitle")?.child(0)?.html()
    }

    /** Crawls one listing page of a JianShu topic and saves the articles that are new. */
    private fun crawJianShuArticles(page: Int, topicUrl: String) {
        val listing = fetchDocument("${topicUrl}?order_by=added_at&page=${page}") ?: return
        listing.getElementsByClass("content").forEach { item ->
            saveArticleIfNew(
                    item = item,
                    urlOf = { getKnowledgeUrl(it) },
                    titleOf = { getJianShuTitle(it) },
                    contentOf = { getJianShuContent(it) }
            )
        }
    }

    /** Inner HTML of the third child of the first `.article` element (h1.title, div.author, div.show-content). */
    private fun getJianShuContent(article: Document): String? {
        return article.getElementsByClass("article").firstOrNull()?.child(2)?.html()
    }

    /** Inner HTML of the first child of the first `.article` element. */
    private fun getJianShuTitle(article: Document): String? {
        return article.getElementsByClass("article").firstOrNull()?.child(0)?.html()
    }

    /** Absolute article URL built from the `href` of the list item's first child. */
    private fun getKnowledgeUrl(it: Element): String {
        return "http://www.jianshu.com" + it.child(0).attr("href")
    }

    /**
     * Fetches [url] and parses it with Jsoup.
     *
     * @return the parsed document, or null when the page could not be fetched
     *         or the client returned no HTML text
     */
    private fun fetchDocument(url: String): Document? {
        val html: String? = try {
            CrawlerWebClient.getPageHtmlText(url)
        } catch (e: Exception) {
            System.err.println("Failed to fetch $url: $e")
            null
        }
        // Jsoup.parse rejects null input, so only parse when there is text.
        return html?.let { Jsoup.parse(it) }
    }

    /**
     * Handles a single list item: resolves its article URL and, when no
     * [Knowledge] with that URL exists yet, downloads the article and stores it.
     *
     * Any exception (for example an index error caused by markup that does not
     * match the expected structure) is reported and confined to this item.
     */
    private fun saveArticleIfNew(
            item: Element,
            urlOf: (Element) -> String,
            titleOf: (Document) -> String?,
            contentOf: (Document) -> String?
    ) {
        try {
            val url = urlOf(item)
            if (KnowledgeDao.countByUrl(url) != 0) {
                return
            }
            val article = fetchDocument(url) ?: return
            val title = titleOf(article)
            val content = contentOf(article)
            println(title)
            println(url)
            doSaveKnowledge(
                    url = url,
                    title = title,
                    content = content
            )
        } catch (e: Exception) {
            System.err.println("Skipping list item that could not be processed: $e")
        }
    }

    /**
     * Persists one article. Articles without a title or content are not stored:
     * saving them with empty strings would collide on the unique title column
     * after the first such row.
     */
    private fun doSaveKnowledge(url: String, title: String?, content: String?) {
        val articleTitle = title ?: ""
        val articleContent = content ?: ""
        if (articleTitle.isBlank() || articleContent.isBlank()) {
            System.err.println("Not saving $url: title or content could not be extracted")
            return
        }
        val knowledge = Knowledge()
        knowledge.url = url
        knowledge.title = articleTitle
        knowledge.content = articleContent
        try {
            KnowledgeDao.save(knowledge)
        } catch (e: Exception) {
            // Saving stays best-effort (e.g. a concurrent coroutine stored the same
            // title first), but the failure is reported instead of vanishing.
            System.err.println("Could not save $url: $e")
        }
    }

    private companion object {
        /** Number of listing pages walked per JianShu topic. */
        const val JIAN_SHU_LAST_PAGE = 100

        /** Number of SegmentFault blog listing pages walked. */
        const val SEGMENT_FAULT_LAST_PAGE = 803
    }
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
package com.light.saber.webclient
2+
3+
4+
import com.gargoylesoftware.htmlunit.WebClient
5+
import com.gargoylesoftware.htmlunit.html.HtmlPage
6+
7+
8+
/**
 * Fetches pages through HtmlUnit and returns them as XML text.
 *
 * A new [WebClient] is created for every request and closed afterwards. The
 * previous implementation kept one lazily created client in a shared mutable
 * field; callers invoke [getPageHtmlText] from several coroutines at once, and
 * a single HtmlUnit client must not be used by multiple threads. Closing the
 * client also releases the windows and JavaScript jobs it would otherwise
 * accumulate. Cookies and cache are consequently not shared between requests.
 */
object CrawlerWebClient {
    /** JavaScript timeout, in milliseconds, applied to each client. */
    private const val JAVA_SCRIPT_TIMEOUT_MS = 3000L

    /**
     * Builds a configured client.
     *
     * @param javaScriptTimeout timeout in milliseconds; values <= 0 leave the HtmlUnit default untouched
     */
    private fun instanceWebClient(javaScriptTimeout: Long): WebClient {
        val client = WebClient()
        if (javaScriptTimeout > 0) {
            client.javaScriptTimeout = javaScriptTimeout
        }
        client.options.isJavaScriptEnabled = true // enable the JS interpreter (already the default)
        client.options.isCssEnabled = false
        client.options.isThrowExceptionOnScriptError = false // do not fail the fetch on page script errors
        client.options.isUseInsecureSSL = true
        return client
    }

    /**
     * Loads [url] and returns the page serialised as XML.
     *
     * @return the page text, or null when the response is not an HTML page
     *         (previously this case raised a ClassCastException)
     */
    fun getPageHtmlText(url: String): String? {
        val client = instanceWebClient(JAVA_SCRIPT_TIMEOUT_MS)
        try {
            val page = client.getPage<com.gargoylesoftware.htmlunit.Page>(url)
            return (page as? HtmlPage)?.asXml()
        } finally {
            client.close()
        }
    }
}

src/main/resources/application.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ spring.application.name=saber
22
info.app.name=saber
33
info.app.version=v1.0.0
44
info.app.description=saber
5-
server.port=8008
5+
server.port=7000
66
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/saber?zeroDateTimeBehavior=convertToNull&characterEncoding=utf8&characterSetResults=utf8&useSSL=false
77
spring.datasource.username=root
88
spring.datasource.password=root

0 commit comments

Comments
 (0)