Skip to content

Commit 1f7cc25

Browse files
committed
Convert to python3
1 parent ed5d728 commit 1f7cc25

14 files changed

+2416
-236
lines changed

Bluto/modules/bluto_logging.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
if not os.path.exists(LOG_DIR):
1111
os.makedirs(LOG_DIR)
12-
os.chmod(LOG_DIR, 0700)
12+
os.chmod(LOG_DIR, 0o700)
1313
open(INFO_LOG_FILE,'a').close()
1414

1515
# set up formatting

Bluto/modules/bluto_logging.py.bak

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#!/usr/bin/python
2+
3+
import logging
4+
import sys
5+
import site
6+
import os
7+
# Log directory (inside the user's home) and the file all log output goes to.
LOG_DIR = os.path.expanduser('~/Bluto/log/')
INFO_LOG_FILE = os.path.join(LOG_DIR, 'bluto-info.log')

# First run: create the log directory, restrict it to its owner and touch the
# log file.  The mode is spelled 0o700 because the bare-zero octal form
# (0700) is a SyntaxError on Python 3; 0o700 is accepted from Python 2.6 on.
if not os.path.exists(LOG_DIR):
    os.makedirs(LOG_DIR)
    os.chmod(LOG_DIR, 0o700)
    open(INFO_LOG_FILE, 'a').close()

# set up formatting
formatter = logging.Formatter('[%(asctime)s] %(module)s: %(message)s')

# set up logging to a file for all levels INFO and higher
fh2 = logging.FileHandler(INFO_LOG_FILE)
fh2.setLevel(logging.INFO)
fh2.setFormatter(formatter)

# create Logger object
mylogger = logging.getLogger('MyLogger')
mylogger.setLevel(logging.INFO)
mylogger.addHandler(fh2)

# create shortcut functions
info = mylogger.info

Bluto/modules/data_mine.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import pdfminer
22
import requests
3-
import urllib2
3+
import urllib.request, urllib.error, urllib.parse
44
import oletools.thirdparty.olefile as olefile
55
import os
66
import traceback
@@ -9,18 +9,18 @@
99
import random
1010
import math
1111
import sys
12-
import Queue
12+
import queue
1313
import time
1414
import threading
1515
import cgi
1616
from termcolor import colored
1717
from pdfminer.pdfparser import PDFParser
1818
from pdfminer.pdfdocument import PDFDocument
1919
from bs4 import BeautifulSoup
20-
from bluto_logging import info, INFO_LOG_FILE
21-
from get_file import get_user_agents
22-
from search import doc_bing, doc_exalead
23-
from general import get_size
20+
from .bluto_logging import info, INFO_LOG_FILE
21+
from .get_file import get_user_agents
22+
from .search import doc_bing, doc_exalead
23+
from .general import get_size
2424

2525

2626

@@ -29,7 +29,7 @@ def action_download(doc_list, docs):
2929
i = 0
3030
download_list = []
3131
initial_count = 0
32-
print 'Gathering Live Documents For Metadata Mining\n'
32+
print('Gathering Live Documents For Metadata Mining\n')
3333
headers = {
3434
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; pl; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 GTB7.1 ( .NET CLR 3.5.30729',
3535
'Referer': 'https://www.google.co.uk/',
@@ -52,7 +52,7 @@ def action_download(doc_list, docs):
5252
code.write(r.content)
5353
code.close()
5454
initial_count += 1
55-
print('\tDownload Count: {}\r'.format(str(initial_count))),
55+
print(('\tDownload Count: {}\r'.format(str(initial_count))), end=' ')
5656
info(str(doc).replace('%20', ' '))
5757
download_list.append(str(doc).replace('%20', ' '))
5858

@@ -66,7 +66,7 @@ def action_download(doc_list, docs):
6666
code.write(r.content)
6767
code.close()
6868
initial_count += 1
69-
print('\tDownload Count: {}\r'.format(str(initial_count))),
69+
print(('\tDownload Count: {}\r'.format(str(initial_count))), end=' ')
7070
download_list.append(str(doc).replace('%20', ' '))
7171
info(str(doc).replace('%20', ' '))
7272
continue
@@ -96,14 +96,14 @@ def action_download(doc_list, docs):
9696
if i < 1:
9797
sys.exit()
9898
data_size = get_size(docs)
99-
print '\tData Downloaded: {}MB'.format(str(math.floor(data_size)))
99+
print('\tData Downloaded: {}MB'.format(str(math.floor(data_size))))
100100
info('Documents Downloaded: {}'.format(initial_count))
101101
return download_list
102102

103103

104104
def doc_search(domain, USERAGENT_F, prox):
105-
q1 = Queue.Queue()
106-
q2 = Queue.Queue()
105+
q1 = queue.Queue()
106+
q2 = queue.Queue()
107107
t1 = threading.Thread(target=doc_bing, args=(domain, USERAGENT_F, prox, q1))
108108
t2 = threading.Thread(target=doc_exalead, args=(domain, USERAGENT_F, prox, q2))
109109
t1.start()

Bluto/modules/data_mine.py.bak

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
import pdfminer
2+
import requests
3+
import urllib2
4+
import oletools.thirdparty.olefile as olefile
5+
import os
6+
import traceback
7+
import time
8+
import re
9+
import random
10+
import math
11+
import sys
12+
import Queue
13+
import time
14+
import threading
15+
import cgi
16+
from termcolor import colored
17+
from pdfminer.pdfparser import PDFParser
18+
from pdfminer.pdfdocument import PDFDocument
19+
from bs4 import BeautifulSoup
20+
from bluto_logging import info, INFO_LOG_FILE
21+
from get_file import get_user_agents
22+
from search import doc_bing, doc_exalead
23+
from general import get_size
24+
25+
26+
27+
def action_download(doc_list, docs):
    """Download every document URL in ``doc_list`` into the ``docs`` folder.

    Args:
        doc_list: iterable of document URLs (spaces are allowed and are
            percent-encoded before the request is made).
        docs: destination directory path; file names are appended to it
            directly, so it is expected to end with a path separator.

    Returns:
        List of the URLs (with ``%20`` shown as spaces) that were downloaded
        and written to disk.

    Raises:
        SystemExit: via ``sys.exit()`` when not a single document could be
            downloaded.
    """
    info('Document Download Started')
    download_list = []
    initial_count = 0
    print('Gathering Live Documents For Metadata Mining\n')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; pl; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 GTB7.1 ( .NET CLR 3.5.30729',
        'Referer': 'https://www.google.co.uk/',
        'Accept-Language': 'en-US,en;q=0.5',
        'Cache-Control': 'no-cache'
    }
    for doc in doc_list:
        doc = doc.replace(' ', '%20')
        # Human-readable form of the URL, used for logging and reporting.
        doc_name = doc.replace('%20', ' ')
        # Reset per iteration so the generic handler never logs the headers
        # of a previous response (or hits an unbound name).
        r = None
        try:
            r = requests.get(doc.encode('utf-8'), headers=headers)
            if r.status_code == 404:
                r.raise_for_status()
            if r.status_code != 200:
                continue

            # Prefer the server-supplied file name; parse_header copes with
            # both quoted and unquoted values.  basename() stops a hostile
            # header from writing outside the ``docs`` folder.
            params = cgi.parse_header(r.headers.get('Content-Disposition', ''))[-1]
            if 'filename' in params:
                filename = os.path.basename(params['filename'])
            else:
                filename = doc_name.split('/')[-1]

            # r.content is raw bytes, so the file must be opened in binary mode.
            with open(docs + filename, 'wb') as code:
                code.write(r.content)

            initial_count += 1
            # Carriage return (no newline) keeps the counter on a single line.
            sys.stdout.write('\tDownload Count: {}\r'.format(initial_count))
            sys.stdout.flush()
            info(doc_name)
            download_list.append(doc_name)

        # The requests exceptions derive from IOError, so they have to be
        # listed before the IOError handler or they would never be reached.
        except requests.exceptions.HTTPError:
            info('Error: File Not Found Server Side: HTTPError')
        except requests.exceptions.ConnectionError:
            info('Error: File Not Found Server Side: ConnectionError')
        except ValueError:
            info('No Filename in header')
        except IOError:
            info('Not Found: {}'.format(doc_name))
        except Exception:
            info('An Unhandled Exception Has Occured, Please Check The Log For Details\n' + INFO_LOG_FILE)
            info(doc_name)
            if r is not None:
                info(r.headers)
            info(traceback.format_exc())

    if initial_count < 1:
        sys.exit()
    data_size = get_size(docs)
    print('\tData Downloaded: {}MB'.format(str(math.floor(data_size))))
    info('Documents Downloaded: {}'.format(initial_count))
    return download_list
102+
103+
104+
def doc_search(domain, USERAGENT_F, prox):
    """Run the Bing and Exalead document searches in parallel.

    Each search runs in its own thread and is handed a queue as its last
    argument, on which it is expected to put its list of results.

    Args:
        domain: target domain, passed through to both searches.
        USERAGENT_F: passed through to both searches unchanged.
        prox: passed through to both searches unchanged.

    Returns:
        A single list holding the Bing results followed by the Exalead
        results.  A search that reported nothing contributes no entries.
    """
    q1 = Queue.Queue()
    q2 = Queue.Queue()
    t1 = threading.Thread(target=doc_bing, args=(domain, USERAGENT_F, prox, q1))
    t2 = threading.Thread(target=doc_exalead, args=(domain, USERAGENT_F, prox, q2))
    t1.start()
    t2.start()
    t1.join()
    t2.join()

    list_d = []
    for engine, results_q in (('Bing', q1), ('Exalead', q2)):
        # Both workers have finished, so an empty queue means that worker
        # ended without reporting; a blocking get() here would hang forever.
        if results_q.empty():
            info('No Document Results Returned By {}'.format(engine))
            continue
        # Treat a None result the same as an empty list.
        list_d += results_q.get() or []
    return list_d
117+
118+
119+
#Extract Author PDF
120+
def _pdf_meta_text(value):
    """Normalise one raw PDF metadata value to plain text ('' if unusable)."""
    # Byte strings (as returned on Python 3) are decoded one byte per
    # character so the clean-up below sees the same data as a Python 2 str.
    if isinstance(value, bytes) and not isinstance(value, str):
        value = value.decode('latin-1')
    try:
        text = re.sub(r'[^0-9a-zA-Z]+', ' ', value)
    except TypeError:
        # Not a string at all (missing value or some other object).
        return ''
    # Values that come out as single characters separated by whitespace
    # (" J o h n") are collapsed back into one word.
    spaced = re.match(r'(\s\w\s+(\w\s+)+\w)', text)
    if spaced:
        return str(spaced.group(1)).replace(' ', '')
    return text


def pdf_read(pdf_file_list):
    """Collect author names and creating software from PDF metadata.

    Args:
        pdf_file_list: iterable of paths to PDF files.

    Returns:
        Tuple ``(user_names, software_list)``: title-cased 'Author' values
        and cleaned 'Creator' values.  Files that cannot be parsed or carry
        no metadata are skipped.
    """
    info('Extracting PDF MetaData')
    software_list = []
    user_names = []
    for filename in pdf_file_list:
        info(filename)
        try:
            # The context manager closes the handle even when parsing fails.
            with open(filename, 'rb') as fp:
                meta = PDFDocument(PDFParser(fp)).info[0]
                # Each key is read independently so that a document without
                # 'Creator' still contributes its 'Author', and vice versa.
                person = _pdf_meta_text(meta.get('Author'))
                software = _pdf_meta_text(meta.get('Creator'))
        except (IndexError, KeyError, TypeError, AttributeError):
            continue
        except pdfminer.pdfparser.PDFSyntaxError:
            continue
        except Exception:
            info('An Unhandled Exception Has Occured, Please Check The Log For Details' + INFO_LOG_FILE)
            continue
        if person:
            user_names.append(str(person).title())
        if software:
            software_list.append(software)
    info('Finished Extracting PDF MetaData')
    return (user_names, software_list)
160+
161+
162+
163+
#Extract Author MS FILES
164+
def _ole_meta_text(value):
    """Normalise one raw OLE metadata value to plain text ('' if unusable)."""
    # Byte strings (as returned on Python 3) are decoded one byte per
    # character so the clean-up below sees the same data as a Python 2 str.
    if isinstance(value, bytes) and not isinstance(value, str):
        value = value.decode('latin-1')
    try:
        text = re.sub(r'[^0-9a-zA-Z]+', ' ', value)
    except TypeError:
        # Field absent (None) or not a string.
        return ''
    # Values that come out as single characters separated by whitespace
    # (" J o h n") are collapsed back into one word.
    spaced = re.match(r'(\s\w\s+(\w\s+)+\w)', text)
    if spaced:
        return str(spaced.group(1)).replace(' ', '')
    return text


def ms_doc(ms_file_list):
    """Collect user names and creating software from MS Office metadata.

    Args:
        ms_file_list: iterable of paths to MS Office files.

    Returns:
        Tuple ``(user_names, software_list)``: title-cased 'author' and
        'last saved by' values, and cleaned 'creating application' values.
        Files that cannot be opened as OLE containers are logged and skipped.
    """
    software_list = []
    user_names = []
    info('Extracting MSDOCS MetaData')
    for filename in ms_file_list:
        data = None
        try:
            data = olefile.OleFileIO(filename)
            meta = data.get_metadata()
            # Fields are cleaned one by one so that a single absent field
            # no longer discards everything else found in the file.
            author = _ole_meta_text(meta.author)
            software = _ole_meta_text(meta.creating_application)
            save_by = _ole_meta_text(meta.last_saved_by)
        except Exception:
            info('Unable To Read MSDOC MetaData: {}'.format(filename))
            continue
        finally:
            if data is not None:
                data.close()

        if author:
            user_names.append(str(author).title())
        if software:
            software_list.append(software)
        if save_by:
            user_names.append(str(save_by).title())
    info('Finished Extracting MSDOC MetaData')
    return (user_names, software_list)
203+
204+
# Module takes in DOMAIN, USERAGENT_F, PROX and a result queue; puts None or (user_names, software_list, download_count, download_list) on the queue
205+
def doc_start(domain, USERAGENT_F, prox, q):
    """Search for, download and mine a domain's public documents.

    Exactly one item is always put on ``q`` so that a consumer waiting on
    the queue cannot block forever.

    Args:
        domain: target domain; its first dot-separated label names the
            download folder under ``~/Bluto/doc/``.
        USERAGENT_F: passed through to the document search unchanged.
        prox: passed through to the document search unchanged.
        q: queue receiving the result: ``None`` when no documents were found
            or downloaded, otherwise the tuple
            ``(user_names, software_list, download_count, download_list)``.
    """
    ms_list_ext = ('.docx', '.pptx', '.xlsx', '.doc', '.xls', '.ppt')
    ms_file_list = []
    pdf_file_list = []
    info('Let The Hunt Begin')
    domain_r = domain.split('.')
    location = os.path.expanduser('~/Bluto/doc/{}/'.format(domain_r[0]))
    if not os.path.exists(location):
        os.makedirs(location)
    info('Data Folder Created ' + location)
    docs = location

    doc_list = doc_search(domain, USERAGENT_F, prox)
    if not doc_list:
        q.put(None)
        return

    # De-duplicate the URLs; sorting keeps the download order deterministic.
    doc_list = sorted(set(doc_list))
    try:
        download_list = action_download(doc_list, docs)
    except SystemExit:
        # action_download() calls sys.exit() when nothing could be fetched.
        # Report "no result" instead of ending silently with an empty queue.
        # NOTE(review): assumes callers want None here, as for an empty search.
        info('No Documents Downloaded')
        q.put(None)
        return
    download_count = len(download_list)

    # Sort everything in the download folder by type (extension match is
    # case-insensitive so that e.g. 'REPORT.PDF' is not missed).
    for root, dirs, files in os.walk(docs):
        for filename in files:
            lowered = str(filename).lower()
            if lowered.endswith(ms_list_ext):
                ms_file_list.append(os.path.join(root, filename))
            if lowered.endswith('.pdf'):
                pdf_file_list.append(os.path.join(root, filename))

    user_names_t = []
    software_list_t = []
    if ms_file_list:
        user_names_ms, software_list_ms = ms_doc(ms_file_list)
        user_names_t += user_names_ms
        software_list_t += software_list_ms
    if pdf_file_list:
        user_names_pdf, software_list_pdf = pdf_read(pdf_file_list)
        user_names_t += user_names_pdf
        software_list_t += software_list_pdf

    user_names = sorted(set(user_names_t))
    software_list = sorted(set(software_list_t))
    info('The Hunt Ended')
    # Put the result even when both lists are empty; skipping the put in
    # that case would leave a consumer of ``q`` waiting indefinitely.
    q.put((user_names, software_list, download_count, download_list))

0 commit comments

Comments
 (0)