Safer fetching

html5lib · jayvdb · Jul 27, 2016 · Jul 27, 2016 · Jul 27, 2016 · Jul 28, 2016
commit 5ee2066b49de79c06c9b3bbf1ae4deaa3e2d6a67
diff --git a/utils/spider.py b/utils/spider.py
@@ -147,9 +147,25 @@ def check_robots(self, urls):
             robotURL = urllib_parse.urlunsplit(robotURL)
             self.robotParser.set_url(robotURL)
             try:
-                self.robotParser.read()
+                resp, content = self.http.request(robotURL, "GET")
             except Exception as e:
-                print('Failed to read {0}: {1}'.format(robotURL, e), file=sys.stderr)
+                print("Failed to fetch {0}: {1}".format(robotURL, e), file=sys.stderr)
+                urls.remove(url)
+                continue
+
+            if resp['status'] == '404':
+                # no robots.txt to check
+                continue
+
+            if resp['status'] not in ('200', '304'):
+                print("Fetch {0} status {1}".format(url, resp['status']), file=sys.stderr)
+                urls.remove(url)
+                continue
+
+            try:
+                self.robotParser.parse(content.decode('utf8'))
+            except Exception as e:
+                print('Failed to parse {0}: {1}'.format(robotURL, e), file=sys.stderr)
                 urls.remove(url)
                 continue