Skip to content

Commit a91de54

Browse files
committed
爬虫异常try catch, ignore
1 parent 0ba1d2f commit a91de54

File tree

2 files changed

+79
-46
lines changed

2 files changed

+79
-46
lines changed

src/main/kotlin/com/light/saber/service/CrawKnowledgeService.kt

Lines changed: 73 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -22,21 +22,35 @@ class CrawKnowledgeService {
2222
val 简书专题URLs = CrawSourceDao.findJianShu()
2323
简书专题URLs.forEach {
2424
for (page in 1..100) {
25-
crawJianShuArticles(page, it.url)
25+
try {
26+
crawJianShuArticles(page, it.url)
27+
} catch (e: Exception) {
28+
29+
}
30+
2631
}
2732
}
2833
}
2934

3035

3136
fun doCrawSegmentFaultKnowledge() {
3237
for (page in 1..803) {
33-
crawSegmentFault(page)
38+
try {
39+
crawSegmentFault(page)
40+
} catch (e: Exception) {
41+
42+
}
43+
3444
}
3545
}
3646

3747
fun doCrawOSChinaKnowledge() {
3848
for (page in 1..560) {
39-
crawOSChina(page)
49+
try {
50+
crawOSChina(page)
51+
} catch (e: Exception) {
52+
53+
}
4054
}
4155
}
4256

@@ -65,17 +79,21 @@ class CrawKnowledgeService {
6579
links.forEachIndexed { index, it ->
6680
val url = it.attr("href")
6781
if (KnowledgeDao.countByUrl(url) == 0) {
68-
val OSChina文章HTML = CrawlerWebClient.getPageHtmlText(url)
69-
val OSChina文章Document = Jsoup.parse(OSChina文章HTML)
70-
val content = 获取OSChina文章内容(OSChina文章Document)
71-
println(url)
72-
println(content)
73-
74-
doSaveKnowledge(
75-
url = url,
76-
title = titles[index],
77-
content = content
78-
)
82+
try {
83+
val OSChina文章HTML = CrawlerWebClient.getPageHtmlText(url)
84+
val OSChina文章Document = Jsoup.parse(OSChina文章HTML)
85+
val content = 获取OSChina文章内容(OSChina文章Document)
86+
println(url)
87+
// println(content)
88+
doSaveKnowledge(
89+
url = url,
90+
title = titles[index],
91+
content = content
92+
)
93+
} catch (e: Exception) {
94+
95+
}
96+
7997

8098
}
8199
}
@@ -94,22 +112,27 @@ class CrawKnowledgeService {
94112
document.getElementsByClass("blog-stream")[0].children().forEach {
95113
// document.getElementsByClassName('blog-stream')[0].children[0].children[1].children[0].children[0]
96114
// <a href=​"/a/​1190000000270453">​开启 NFS 文件系统提升 Vagrant 共享目录的性能​</a>​
97-
val url = "https://segmentfault.com" + it.child(1).child(0).child(0).attr("href")
98-
if (KnowledgeDao.countByUrl(url) == 0) {
99-
val SegmentFault文章HTML = CrawlerWebClient.getPageHtmlText(url)
100-
val SegmentFault文章Document = Jsoup.parse(SegmentFault文章HTML)
101-
val title = 获取SegmentFault文章标题(SegmentFault文章Document)
102-
val content = 获取SegmentFault文章内容(SegmentFault文章Document)
103-
println(title)
104-
println(url)
105-
println(content)
106-
107-
doSaveKnowledge(
108-
url = url,
109-
title = title,
110-
content = content
111-
)
115+
try {
116+
val url = "https://segmentfault.com" + it.child(1).child(0).child(0).attr("href")
117+
if (KnowledgeDao.countByUrl(url) == 0) {
118+
val SegmentFault文章HTML = CrawlerWebClient.getPageHtmlText(url)
119+
val SegmentFault文章Document = Jsoup.parse(SegmentFault文章HTML)
120+
val title = 获取SegmentFault文章标题(SegmentFault文章Document)
121+
val content = 获取SegmentFault文章内容(SegmentFault文章Document)
122+
println(title)
123+
println(url)
124+
println(content)
125+
126+
doSaveKnowledge(
127+
url = url,
128+
title = title,
129+
content = content
130+
)
131+
}
132+
} catch (e: Exception) {
133+
112134
}
135+
113136
}
114137

115138
}
@@ -133,22 +156,28 @@ class CrawKnowledgeService {
133156
val 简书专题HTML = CrawlerWebClient.getPageHtmlText(简书专题分页URL)
134157
val document = Jsoup.parse(简书专题HTML)
135158
document.getElementsByClass("content").forEach {
136-
val url = getKnowledgeUrl(it)
137-
if (KnowledgeDao.countByUrl(url) == 0) {
138-
val 简书文章HTML = CrawlerWebClient.getPageHtmlText(url)
139-
val 简书文章Document = Jsoup.parse(简书文章HTML)
140-
val title = 获取简书文章标题(简书文章Document)
141-
val content = 获取简书文章内容(简书文章Document)
142-
println(title)
143-
println(url)
144-
println(content)
145-
146-
doSaveKnowledge(
147-
url = url,
148-
title = title,
149-
content = content
150-
)
159+
try {
160+
val url = getKnowledgeUrl(it)
161+
if (KnowledgeDao.countByUrl(url) == 0) {
162+
val 简书文章HTML = CrawlerWebClient.getPageHtmlText(url)
163+
val 简书文章Document = Jsoup.parse(简书文章HTML)
164+
val title = 获取简书文章标题(简书文章Document)
165+
val content = 获取简书文章内容(简书文章Document)
166+
println(title)
167+
println(url)
168+
println(content)
169+
170+
doSaveKnowledge(
171+
url = url,
172+
title = title,
173+
content = content
174+
)
175+
}
176+
} catch (e: Exception) {
177+
151178
}
179+
180+
152181
}
153182

154183

src/main/kotlin/com/light/saber/webclient/CrawlerWebClient.kt

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,12 @@ object CrawlerWebClient {
2323
}
2424

2525
fun getPageHtmlText(url: String): String? {
26-
webClient = instanceWebClient(3000)
27-
return webClient?.getPage<HtmlPage>(url)?.asXml()
26+
webClient = instanceWebClient(7000)
27+
try {
28+
return webClient?.getPage<HtmlPage>(url)?.asXml()
29+
} catch (e: Exception) {
30+
return null
31+
}
2832
}
2933

3034
}

0 commit comments

Comments (0)