from bs4 import BeautifulSoup
import urllib2
import wget

print "Welcome to mathforcollege.com Numerical Method Chapters Downloader \n"

# The million dollar base url: the chapter index everything else is scraped from
url = "http://nm.mathforcollege.com/topics/index.html"

# Variables
filter_stuff = []  # scratch list used by make_things_alright to hold the filtered chapter links
chapters = []      # the useful links from the index page, with all other links discarded
pdf_links = []

def get_page(url):
    # Fetch a page over HTTP and return its HTML as a string.
    f = urllib2.urlopen(url)
    page = f.read()
    f.close()
    return page

def get_next_target(page):
    # Find the next '<a href="' in the page; return the link target and the
    # index of its closing quote, or (None, 0) when there are no more links.
    start_link = page.find('<a href="')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    # Walk through the page, collecting every link target in order.
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links
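
# Note: BeautifulSoup is imported above but never used. A sketch of an
# equivalent extractor built on it is below -- an optional alternative,
# not called anywhere in this script; the name get_all_links_bs is ours.
def get_all_links_bs(page):
    soup = BeautifulSoup(page, 'html.parser')
    return [a['href'] for a in soup.find_all('a', href=True)]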

def find_pdf_links(y):
    # Download only the textbook-chapter PDFs, skipping slides (ppt),
    # problem sets, quizzes and worked examples.
    for plink in y:
        if ('.pdf' in plink and 'http://mathforcollege.com/nm/' in plink
                and 'ppt' not in plink and 'problem' not in plink
                and 'quiz' not in plink and 'example' not in plink):
            wget.download(plink)
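
# wget.download raises on a dead link (e.g. an HTTP error), which would abort
# the whole run. A hedged variant that logs and skips failures instead -- not
# wired into the loop below, shown only as a sketch (assumption: skipping a
# broken chapter link is acceptable):
def safe_download(plink):
    try:
        return wget.download(plink)
    except Exception as e:
        print "\nSkipping " + plink + ": " + str(e)
        return None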

def make_things_alright(y):
    # Keep only the relative links (the chapter pages), make them absolute,
    # and drop external http(s) and mailto links.
    for text in y:
        if 'http:' not in text and 'https:' not in text and 'mailto' not in text:
            filter_stuff.append('http://nm.mathforcollege.com/topics/' + text)
    return filter_stuff

all_links = get_all_links(get_page(url))
chapters = make_things_alright(all_links)  # filtered, absolute chapter URLs

i = 0
for text in chapters:
    i = i + 1
    print "Downloading Chapter " + str(i)  # str() needed: concatenating an int to a string raises TypeError
    find_pdf_links(get_all_links(get_page(text)))

print "Download Complete!"
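
# Usage sketch (assumptions: Python 2, third-party packages installed via
# `pip install beautifulsoup4 wget`; the script filename is hypothetical):
#   python downloader.py
# wget.download saves each chapter PDF into the current working directory.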