
Commit bc7c4b2

Updates
1 parent d918d2a commit bc7c4b2

3 files changed: +99 -0 lines changed


ques_paper/dry.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
from selenium import webdriver
import time

# Open the question-paper listing in Firefox and let it load fully.
browser = webdriver.Firefox()

url = 'http://www.xampaperz.com/1stsemcse.php'
browser.get(url)
time.sleep(10)  # crude wait for the page to finish rendering

# Dump the rendered HTML to a file for offline parsing.
page = browser.page_source
f = open("adsa.txt", "w")
f.write(page.encode("utf-8"))
f.close()
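
If the fixed time.sleep(10) proves unreliable, Selenium's explicit waits can block only as long as needed. A minimal sketch, assuming that waiting for the <body> tag is an adequate readiness signal for this page:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Firefox()
browser.get('http://www.xampaperz.com/1stsemcse.php')
# Wait at most 10 seconds, returning as soon as <body> is present.
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.TAG_NAME, 'body')))
page = browser.page_source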

ques_paper/geeks.py

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
import urllib2
from bs4 import BeautifulSoup

print "Welcome!\n"

# The million dollar base url
url = "http://www.geeksforgeeks.org/data-structures"


def get_page(url):
    # Fetch a page, sending a browser-like User-Agent so the site
    # serves the normal HTML.
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Chrome/5.0')]
    response = opener.open(url)
    page = response.read()
    return page


def get_next_target(page):
    # Return the next href found in the raw HTML, plus the offset at
    # which scanning should resume ((None, 0) when no link is left).
    start_link = page.find('<a href="')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote


def get_all_links(page):
    # Collect every href on the page by repeated string scanning.
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links


# Keep only the links that look like article pages, discarding
# category, index, and social-media links.
fresh_links = []
links = get_all_links(get_page(url))
for x in links:
    if ("category" not in x and "/data-structures" not in x
            and "fundamentals-of-algorithms" not in x and "facebook" not in x):
        fresh_links.append(x)

# Drop the first 25 and the last 32 entries (header and footer links).
new_links = fresh_links[25:-32]
print len(new_links)

# Scrape each article: keep the HTML between the title heading and the
# first async <script> tag, then strip the markup with BeautifulSoup.
f = open("G.txt", "w")
for x in new_links:
    page = get_page(x)
    startpoint = page.find('<h1 class="entry-title">')
    print startpoint
    endpoint = page.find('<script async src')
    soup = BeautifulSoup(page[startpoint:endpoint])
    text = soup.get_text()
    f.write(text.encode("utf-8"))
f.close()

# for x in fresh_links:
#     soup = BeautifulSoup(urllib2.urlopen(x).read())
#     text = soup.get_text()
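
The manual string scanning in get_next_target/get_all_links could also be done with BeautifulSoup, which is already imported here. A minimal alternative sketch (get_all_links_bs is an illustrative name, not part of the script):

from bs4 import BeautifulSoup

def get_all_links_bs(page):
    # Same result as get_all_links above, letting the parser find the anchors.
    soup = BeautifulSoup(page)
    return [a['href'] for a in soup.find_all('a', href=True)]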

ques_paper/ref.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
from bs4 import BeautifulSoup

# Convert the saved HTML dump into plain text.
f = open("ds.txt", "r")
g = open("read.txt", "w")
soup = BeautifulSoup(f.read())
text = soup.get_text()
g.write(text.encode("utf-8"))
f.close()
g.close()
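
The same conversion can be written with with-blocks so both files are closed even if parsing fails; a minimal equivalent sketch, using codecs for the UTF-8 write:

import codecs
from bs4 import BeautifulSoup

with open("ds.txt") as f, codecs.open("read.txt", "w", encoding="utf-8") as g:
    g.write(BeautifulSoup(f.read()).get_text())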
