Skip to content

Commit 228d378

Browse files
committed
Changed the Gemfile so all the necessary gems are listed. Added the beginnings of a PDF scraper in lib/tasks.
1 parent 488b95b commit 228d378

File tree

3 files changed

+58
-1
lines changed

3 files changed

+58
-1
lines changed

Gemfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ group :assets do
3838
gem 'coffee-rails', '~> 3.2.1'
3939

4040
gem 'compass-rails'
41-
# gem 'zurb-foundation'
41+
gem 'zurb-foundation'
4242

4343
# See https://github.com/sstephenson/execjs#readme for more supported runtimes
4444
# gem 'therubyracer', :platforms => :ruby

lib/.DS_Store

6 KB
Binary file not shown.

lib/tasks/pdfScraper.rb

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
require 'rubygems'
2+
require 'mechanize'
3+
require 'pdf-reader'
4+
require 'open-uri'
5+
require 'json'
6+
7+
#------------- Globals --------------------#
8+
9+
# Regex source strings for the terms to count; '\+' is an escaped literal plus sign.
words = ["addition","division",'\+']
10+
# Placeholder list of sites; only the commented-out crawl loop below iterates over it.
sites = [""]
11+
# Placeholder; not referenced by any other visible code -- presumably meant to
# hold the PDFs discovered by the crawl (TODO confirm).
pdfs = [""]
12+
13+
#------------- Find PDFs ------------------#
14+
15+
#agent = Mechanize.new
16+
17+
#sites.each do |site|
18+
# page = agent.get(site)
19+
20+
# page.links.each do |link|
21+
# TODO: put the find-PDF logic here and populate the pdfs variable
22+
# puts link.text
23+
# end
24+
#end
25+
26+
#-------------- Begin the PDF Parse -------#
27+
28+
# Counts how many times each pattern in +words+ occurs in the text of a PDF.
#
# name  - String handed to PDF::Reader.new to open the PDF. Any non-String
#         value makes the method return nil without opening anything.
# words - Array of regex source strings to count. Each entry is compiled with
#         Regexp.new, so regex metacharacters must be escaped (e.g. '\+' for a
#         literal plus sign). Page text is downcased before matching, so
#         patterns should be lowercase. Defaults to the scraper's original
#         word list.
#
# Prints every word and its total with `p`, then returns a Hash (default
# value 0) mapping each entry of +words+ to its number of occurrences summed
# over all pages, or nil when +name+ is not a String.
def scrapePDF(name, words = ["addition", "division", '\+'])
  return nil unless name.is_a?(String)

  reader = PDF::Reader.new(name)

  # Compile each pattern once instead of once per page.
  patterns = words.map { |word| [word, Regexp.new(word)] }

  # Pre-seed every word so it shows up in the result even with zero hits.
  frequency = Hash.new(0)
  words.each { |word| frequency[word] = 0 }

  reader.pages.each do |page|
    text = page.text.downcase
    patterns.each do |word, pattern|
      # Every word is matched against the whole page. A single shared scanner
      # would leave its position after the previous word's last match and
      # undercount every later word.
      frequency[word] += text.scan(pattern).size
    end
  end

  words.each do |word|
    p word, frequency[word]
  end

  frequency
end

0 commit comments

Comments
 (0)