File tree Expand file tree Collapse file tree 3 files changed +99
-0
lines changed Expand file tree Collapse file tree 3 files changed +99
-0
lines changed Original file line number Diff line number Diff line change
1
from selenium import webdriver
from bs4 import BeautifulSoup
import time


# Fetch a JavaScript-rendered exam-paper page with a real browser and save
# its final HTML to adsa.txt.
browser = webdriver.Firefox()
try:
    url = 'http://www.xampaperz.com/1stsemcse.php'
    browser.get(url)
    # Crude fixed wait for client-side rendering to finish.
    # NOTE(review): an explicit WebDriverWait on a page element would be
    # more reliable than a blind sleep — confirm what the page renders.
    time.sleep(10)
    page = browser.page_source
    # Context manager guarantees the file is flushed and closed; the
    # original leaked the handle if write() raised.
    with open("adsa.txt", "w") as f:
        f.write(page.encode("utf-8"))
finally:
    # Always release the browser process, even when the fetch fails —
    # the original left a Firefox instance running on any error.
    browser.quit()
Original file line number Diff line number Diff line change
1
+ import urllib2
2
+ from bs4 import BeautifulSoup
3
+
4
+
5
# Parenthesized so the greeting also works under Python 3; for a single
# argument the output is identical under Python 2.
print("Welcome ! \n ")


# The million dollar base url: index page listing the data-structures articles.
url = "http://www.geeksforgeeks.org/data-structures"


# Variables
filter_stuff = []  # used in function make_things_alright as a temporary array which stores useful links
chapters = []      # This array will store all the useful links from webpage discarding other links
pdf_links = []     # NOTE(review): never written in this file — confirm it is still needed
18
+
19
def get_page(url):
    """Download *url* with a Chrome-ish User-Agent and return the raw HTML.

    The response object is closed explicitly so repeated calls do not leak
    sockets (the original never closed it).
    """
    opener = urllib2.build_opener()
    # Some sites refuse the default Python-urllib User-Agent.
    opener.addheaders = [('User-agent', 'Chrome/5.0')]
    response = opener.open(url)
    try:
        return response.read()
    finally:
        response.close()
25
+
26
def get_next_target(page):
    """Locate the first ``<a href="...">`` link in *page*.

    Returns a ``(url, end_index)`` pair where ``end_index`` is the position
    of the closing quote, or ``(None, 0)`` when no anchor remains.
    """
    anchor_at = page.find('<a href="')
    if anchor_at == -1:
        return None, 0
    open_quote = page.find('"', anchor_at)
    close_quote = page.find('"', open_quote + 1)
    return page[open_quote + 1:close_quote], close_quote
34
new_links = []  # links kept after the index-trimming loop below; written to G.txt
35
def get_all_links(page):
    """Collect every href in *page*, scanning left to right.

    Stops at the first position where get_next_target finds no (non-empty)
    link, and returns the URLs gathered so far.
    """
    found = []
    while True:
        url, after = get_next_target(page)
        if not url:
            return found
        found.append(url)
        # Continue the scan just past the link we consumed.
        page = page[after:]
# Gather every link from the index page, then drop navigation, category and
# social-media links so only article pages remain.
links = get_all_links(get_page(url))
fresh_links = [link for link in links
               if "category" not in link
               and "/data-structures" not in link
               and "fundamentals-of-algorithms" not in link
               and "facebook" not in link]


# Keep only the middle of the list: the first 25 and the trailing 31 entries
# are skipped.  The original kept two counters i and j that always held the
# same value, so a single 1-based index is equivalent.
upper = len(fresh_links) - 31
for pos, link in enumerate(fresh_links, start=1):
    if pos > 25 and pos < upper:
        new_links.append(link)
print(len(new_links))

# Context manager closes G.txt even if a fetch fails mid-way; the original
# leaked the handle on every run.
with open("G.txt", "w") as out:
    for link in new_links:
        page = get_page(link)
        # Slice the article body between the title heading and the first
        # trailing <script> tag, then strip the markup.
        startpoint = page.find('<h1 class="entry-title">')
        print(startpoint)
        endpoint = page.find('<script async src')
        # NOTE(review): no parser passed to BeautifulSoup — bs4 picks
        # whichever is installed, so output can vary between machines.
        soup = BeautifulSoup(page[startpoint:endpoint])
        text = soup.get_text()
        out.write(text.encode("utf-8"))
Original file line number Diff line number Diff line change
1
+ import urllib2
2
+ from bs4 import BeautifulSoup
3
+
4
# Strip the markup from a previously saved HTML dump (ds.txt) and write the
# plain text to read.txt.  Context managers close both handles even on
# error; the original leaked them.
with open("ds.txt", "r") as src, open("read.txt", "w") as dst:
    # NOTE(review): no parser specified — bs4 falls back to the best
    # installed one, which may differ between environments.
    soup = BeautifulSoup(src.read())
    text = soup.get_text()
    dst.write(text.encode("utf-8"))
You can’t perform that action at this time.
0 commit comments