@@ -22,21 +22,35 @@ class CrawKnowledgeService {
22
22
val 简书专题URLs = CrawSourceDao .findJianShu()
23
23
简书专题URLs .forEach {
24
24
for (page in 1 .. 100 ) {
25
- crawJianShuArticles(page, it.url)
25
+ try {
26
+ crawJianShuArticles(page, it.url)
27
+ } catch (e: Exception ) {
28
+
29
+ }
30
+
26
31
}
27
32
}
28
33
}
29
34
30
35
31
36
/**
 * Calls [crawSegmentFault] once for every page number from 1 to 803.
 *
 * The crawl is best-effort: an exception thrown while processing one page is
 * reported and the loop continues with the next page, so a single failing
 * page does not abort the whole run.
 */
fun doCrawSegmentFaultKnowledge() {
    for (page in 1..803) {
        try {
            crawSegmentFault(page)
        } catch (e: Exception) {
            // Keep going, but leave a trace of which page failed and why
            // instead of swallowing the exception silently.
            println("crawSegmentFault failed for page $page: $e")
        }
    }
}
36
46
37
47
/**
 * Calls [crawOSChina] once for every page number from 1 to 560.
 *
 * The crawl is best-effort: an exception thrown while processing one page is
 * reported and the loop continues with the next page, so a single failing
 * page does not abort the whole run.
 */
fun doCrawOSChinaKnowledge() {
    for (page in 1..560) {
        try {
            crawOSChina(page)
        } catch (e: Exception) {
            // Keep going, but leave a trace of which page failed and why
            // instead of swallowing the exception silently.
            println("crawOSChina failed for page $page: $e")
        }
    }
}
42
56
@@ -65,17 +79,21 @@ class CrawKnowledgeService {
65
79
links.forEachIndexed { index, it ->
66
80
val url = it.attr(" href" )
67
81
if (KnowledgeDao .countByUrl(url) == 0 ) {
68
- val OSChina 文章HTML = CrawlerWebClient .getPageHtmlText(url)
69
- val OSChina 文章Document = Jsoup .parse(OSChina 文章HTML )
70
- val content = 获取OSChina 文章内容(OSChina 文章Document )
71
- println (url)
72
- println (content)
73
-
74
- doSaveKnowledge(
75
- url = url,
76
- title = titles[index],
77
- content = content
78
- )
82
+ try {
83
+ val OSChina 文章HTML = CrawlerWebClient .getPageHtmlText(url)
84
+ val OSChina 文章Document = Jsoup .parse(OSChina 文章HTML )
85
+ val content = 获取OSChina 文章内容(OSChina 文章Document )
86
+ println (url)
87
+ // println(content)
88
+ doSaveKnowledge(
89
+ url = url,
90
+ title = titles[index],
91
+ content = content
92
+ )
93
+ } catch (e: Exception ) {
94
+
95
+ }
96
+
79
97
80
98
}
81
99
}
@@ -94,22 +112,27 @@ class CrawKnowledgeService {
94
112
document.getElementsByClass(" blog-stream" )[0 ].children().forEach {
95
113
// document.getElementsByClassName('blog-stream')[0].children[0].children[1].children[0].children[0]
96
114
// <a href="/a/1190000000270453">开启 NFS 文件系统提升 Vagrant 共享目录的性能</a>
97
- val url = " https://segmentfault.com" + it.child(1 ).child(0 ).child(0 ).attr(" href" )
98
- if (KnowledgeDao .countByUrl(url) == 0 ) {
99
- val SegmentFault 文章HTML = CrawlerWebClient .getPageHtmlText(url)
100
- val SegmentFault 文章Document = Jsoup .parse(SegmentFault 文章HTML )
101
- val title = 获取SegmentFault 文章标题(SegmentFault 文章Document )
102
- val content = 获取SegmentFault 文章内容(SegmentFault 文章Document )
103
- println (title)
104
- println (url)
105
- println (content)
106
-
107
- doSaveKnowledge(
108
- url = url,
109
- title = title,
110
- content = content
111
- )
115
+ try {
116
+ val url = " https://segmentfault.com" + it.child(1 ).child(0 ).child(0 ).attr(" href" )
117
+ if (KnowledgeDao .countByUrl(url) == 0 ) {
118
+ val SegmentFault 文章HTML = CrawlerWebClient .getPageHtmlText(url)
119
+ val SegmentFault 文章Document = Jsoup .parse(SegmentFault 文章HTML )
120
+ val title = 获取SegmentFault 文章标题(SegmentFault 文章Document )
121
+ val content = 获取SegmentFault 文章内容(SegmentFault 文章Document )
122
+ println (title)
123
+ println (url)
124
+ println (content)
125
+
126
+ doSaveKnowledge(
127
+ url = url,
128
+ title = title,
129
+ content = content
130
+ )
131
+ }
132
+ } catch (e: Exception ) {
133
+
112
134
}
135
+
113
136
}
114
137
115
138
}
@@ -133,22 +156,28 @@ class CrawKnowledgeService {
133
156
val 简书专题HTML = CrawlerWebClient .getPageHtmlText(简书专题分页URL )
134
157
val document = Jsoup .parse(简书专题HTML )
135
158
document.getElementsByClass(" content" ).forEach {
136
- val url = getKnowledgeUrl(it)
137
- if (KnowledgeDao .countByUrl(url) == 0 ) {
138
- val 简书文章HTML = CrawlerWebClient .getPageHtmlText(url)
139
- val 简书文章Document = Jsoup .parse(简书文章HTML )
140
- val title = 获取简书文章标题(简书文章Document )
141
- val content = 获取简书文章内容(简书文章Document )
142
- println (title)
143
- println (url)
144
- println (content)
145
-
146
- doSaveKnowledge(
147
- url = url,
148
- title = title,
149
- content = content
150
- )
159
+ try {
160
+ val url = getKnowledgeUrl(it)
161
+ if (KnowledgeDao .countByUrl(url) == 0 ) {
162
+ val 简书文章HTML = CrawlerWebClient .getPageHtmlText(url)
163
+ val 简书文章Document = Jsoup .parse(简书文章HTML )
164
+ val title = 获取简书文章标题(简书文章Document )
165
+ val content = 获取简书文章内容(简书文章Document )
166
+ println (title)
167
+ println (url)
168
+ println (content)
169
+
170
+ doSaveKnowledge(
171
+ url = url,
172
+ title = title,
173
+ content = content
174
+ )
175
+ }
176
+ } catch (e: Exception ) {
177
+
151
178
}
179
+
180
+
152
181
}
153
182
154
183
0 commit comments