import pdfminer
import requests
import oletools.thirdparty.olefile as olefile
import os
import re
import math
import sys
import Queue
import threading
import cgi
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from bluto_logging import info, INFO_LOG_FILE
from search import doc_bing, doc_exalead
from general import get_size


def action_download(doc_list, docs):
    info('Document Download Started')
    i = 0
    download_list = []
    initial_count = 0
    print 'Gathering Live Documents For Metadata Mining\n'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; pl; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 GTB7.1 ( .NET CLR 3.5.30729',
        'Referer': 'https://www.google.co.uk/',
        'Accept-Language': 'en-US,en;q=0.5',
        'Cache-Control': 'no-cache'
    }
    for doc in doc_list:
        doc = doc.replace(' ', '%20')
        try:
            r = requests.get(doc.encode('utf-8'), headers=headers)
            if r.status_code == 404:
                r.raise_for_status()

            if r.status_code == 200:
                # Take the filename from the Content-Disposition header when
                # one is advertised, otherwise fall back to the last segment
                # of the URL path.
                params = cgi.parse_header(r.headers.get('Content-Disposition', ''))[-1]
                if 'filename' not in params:
                    filename = str(doc).replace('%20', ' ').split('/')[-1]
                else:
                    filename_t = re.search('filename="(.*)"', r.headers['content-disposition'])
                    filename = filename_t.group(1)

                # Write in binary mode so PDF and Office payloads are not
                # corrupted on platforms that translate line endings.
                with open(docs + filename, "wb") as code:
                    code.write(r.content)
                i += 1
                initial_count += 1
                print('\tDownload Count: {}\r'.format(str(initial_count))),
                info(str(doc).replace('%20', ' '))
                download_list.append(str(doc).replace('%20', ' '))

        except ValueError:
            info('No Filename in header')
        except AttributeError:
            pass
        except IOError:
            info('Not Found: {}'.format(str(doc).replace('%20', ' ')))
        except requests.exceptions.HTTPError:
            info('Error: File Not Found Server Side: HTTPError')
        except requests.exceptions.ConnectionError:
            info('Error: File Not Found Server Side: ConnectionError')
        except KeyError:
            pass
        except Exception:
            info('An Unhandled Exception Has Occurred, Please Check The Log For Details\n' + INFO_LOG_FILE)
            info(str(doc).replace('%20', ' '))

    if i < 1:
        info('No Documents Were Downloaded')
        sys.exit()
    data_size = get_size(docs)
    print '\tData Downloaded: {}MB'.format(str(math.floor(data_size)))
    info('Documents Downloaded: {}'.format(initial_count))
    return download_list

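
# A minimal sketch (not part of the original module) of how the
# Content-Disposition handling above behaves: cgi.parse_header splits a
# header value into its main value and a dict of parameters. The header
# string below is hypothetical.
def _demo_content_disposition():
    value, params = cgi.parse_header('attachment; filename="report.pdf"')
    # value  -> 'attachment'
    # params -> {'filename': 'report.pdf'}
    return params.get('filename')
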
| 103 | + |
| 104 | +def doc_search(domain, USERAGENT_F, prox): |
| 105 | + q1 = Queue.Queue() |
| 106 | + q2 = Queue.Queue() |
| 107 | + t1 = threading.Thread(target=doc_bing, args=(domain, USERAGENT_F, prox, q1)) |
| 108 | + t2 = threading.Thread(target=doc_exalead, args=(domain, USERAGENT_F, prox, q2)) |
| 109 | + t1.start() |
| 110 | + t2.start() |
| 111 | + t1.join() |
| 112 | + t2.join() |
| 113 | + bing = q1.get() |
| 114 | + exalead = q2.get() |
| 115 | + list_d = bing + exalead |
| 116 | + return list_d |
| 117 | + |
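
# Sketch of the worker/queue contract doc_search depends on: each search
# function is handed a Queue and must put exactly one result list on it,
# so the two q.get() calls above can never block indefinitely. This worker
# is a hypothetical stand-in for doc_bing/doc_exalead, not the real API.
def _demo_search_worker(domain, useragent, prox, q):
    results = ['http://{}/files/example.pdf'.format(domain)]  # placeholder
    q.put(results)
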

#Extract Author PDF
def pdf_read(pdf_file_list):
    info('Extracting PDF MetaData')
    software_list = []
    user_names = []
    for filename in pdf_file_list:
        info(filename)
        try:
            with open(filename, 'rb') as fp:
                parser = PDFParser(fp)
                doc = PDFDocument(parser)
                software = re.sub('[^0-9a-zA-Z]+', ' ', doc.info[0]['Creator'])
                person = re.sub('[^0-9a-zA-Z]+', ' ', doc.info[0]['Author'])
            if person:
                # Detect metadata stored with a space between every
                # character and collapse it back into a single token.
                oddity = re.match(r'(\s\w\s+(\w\s+)+\w)', person)
                if oddity:
                    oddity = str(oddity.group(1)).replace(' ', '')
                    user_names.append(str(oddity).title())
                else:
                    user_names.append(str(person).title())
            if software:
                oddity2 = re.match(r'(\s\w\s+(\w\s+)+\w)', software)
                if oddity2:
                    oddity2 = str(oddity2.group(1)).replace(' ', '')
                    software_list.append(oddity2)
                else:
                    software_list.append(software)
        except IndexError:
            continue
        except pdfminer.pdfparser.PDFSyntaxError:
            continue
        except KeyError:
            continue
        except TypeError:
            continue
        except Exception:
            info('An Unhandled Exception Has Occurred, Please Check The Log For Details\n' + INFO_LOG_FILE)
            continue
    info('Finished Extracting PDF MetaData')
    return (user_names, software_list)

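
# Illustration (hypothetical sample string) of what the 'oddity' regex in
# pdf_read and ms_doc is catching: metadata occasionally comes back with a
# space between every character, and the match lets those runs be collapsed.
def _demo_oddity():
    person = ' J o h n S m i t h'
    oddity = re.match(r'(\s\w\s+(\w\s+)+\w)', person)
    if oddity:
        return oddity.group(1).replace(' ', '')  # -> 'JohnSmith'
    return person
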

#Extract Author MS FILES
def ms_doc(ms_file_list):
    software_list = []
    user_names = []
    info('Extracting MSDOCS MetaData')
    for filename in ms_file_list:
        try:
            data = olefile.OleFileIO(filename)
            meta = data.get_metadata()
            author = re.sub('[^0-9a-zA-Z]+', ' ', meta.author)
            company = re.sub('[^0-9a-zA-Z]+', ' ', meta.company)
            software = re.sub('[^0-9a-zA-Z]+', ' ', meta.creating_application)
            save_by = re.sub('[^0-9a-zA-Z]+', ' ', meta.last_saved_by)
            if author:
                oddity = re.match(r'(\s\w\s+(\w\s+)+\w)', author)
                if oddity:
                    oddity = str(oddity.group(1)).replace(' ', '')
                    user_names.append(str(oddity).title())
                else:
                    user_names.append(str(author).title())
            if software:
                oddity2 = re.match(r'(\s\w\s+(\w\s+)+\w)', software)
                if oddity2:
                    oddity2 = str(oddity2.group(1)).replace(' ', '')
                    software_list.append(oddity2)
                else:
                    software_list.append(software)

            if save_by:
                oddity3 = re.match(r'(\s\w\s+(\w\s+)+\w)', save_by)
                if oddity3:
                    oddity3 = str(oddity3.group(1)).replace(' ', '')
                    user_names.append(str(oddity3).title())
                else:
                    user_names.append(str(save_by).title())

        except Exception:
            # Skip documents whose OLE metadata cannot be read.
            pass
    info('Finished Extracting MSDOC MetaData')
    return (user_names, software_list)

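
# A minimal sketch (hypothetical path) of the olefile calls used above:
# OleFileIO opens the compound document and get_metadata() exposes the
# SummaryInformation fields, which may be None when a field is absent.
def _demo_ole_metadata(path='example.doc'):
    ole = olefile.OleFileIO(path)
    meta = ole.get_metadata()
    ole.close()
    return (meta.author, meta.company, meta.creating_application,
            meta.last_saved_by)
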

#Module takes in DOMAIN, PROX, USERAGENTS and outputs user_names, software_list
def doc_start(domain, USERAGENT_F, prox, q):
    ms_list_ext = ('.docx', '.pptx', '.xlsx', '.doc', '.xls', '.ppt')
    ms_file_list = []
    pdf_file_list = []
    info('Let The Hunt Begin')
    domain_r = domain.split('.')
    if not os.path.exists(os.path.expanduser('~/Bluto/doc/{}'.format(domain_r[0]))):
        os.makedirs(os.path.expanduser('~/Bluto/doc/{}'.format(domain_r[0])))

    location = os.path.expanduser('~/Bluto/doc/{}/'.format(domain_r[0]))
    info('Data Folder Created ' + location)
    docs = location
    doc_list = doc_search(domain, USERAGENT_F, prox)

    if not doc_list:
        q.put(None)
        return
    # Deduplicate before sorting; sorting first and then building a set
    # would throw the ordering away again.
    doc_list = sorted(set(doc_list))
    download_list = action_download(doc_list, docs)
    download_count = len(download_list)

    for root, dirs, files in os.walk(docs):
        for filename in files:
            if str(filename).endswith(ms_list_ext):
                ms_file_list.append(os.path.join(root, filename))
            if str(filename).endswith('.pdf'):
                pdf_file_list.append(os.path.join(root, filename))

    if ms_file_list and pdf_file_list:
        user_names_ms, software_list_ms = ms_doc(ms_file_list)
        user_names_pdf, software_list_pdf = pdf_read(pdf_file_list)
        user_names_t = user_names_ms + user_names_pdf
        software_list_t = software_list_ms + software_list_pdf

    elif ms_file_list:
        user_names_ms, software_list_ms = ms_doc(ms_file_list)
        user_names_t = user_names_ms
        software_list_t = software_list_ms

    elif pdf_file_list:
        user_names_pdf, software_list_pdf = pdf_read(pdf_file_list)
        user_names_t = user_names_pdf
        software_list_t = software_list_pdf
    else:
        user_names_t = []
        software_list_t = []

    info('The Hunt Ended')
    user_names = sorted(set(user_names_t))
    software_list = sorted(set(software_list_t))
    # Always report back through the queue, even when no metadata was
    # recovered, so the caller's q.get() cannot block forever.
    q.put((user_names, software_list, download_count, download_list))
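

# A minimal, hypothetical driver (not part of the original module) showing
# how doc_start reports through the Queue it is handed: it puts either None
# (no documents found) or a (user_names, software_list, download_count,
# download_list) tuple. The domain and proxy values below are placeholders.
if __name__ == '__main__':
    q = Queue.Queue()
    t = threading.Thread(target=doc_start, args=('example.com', None, None, q))
    t.start()
    t.join()
    result = q.get()
    if result is None:
        print 'No documents found'
    else:
        user_names, software_list, download_count, download_list = result
        print 'Users: {}  Software Titles: {}  Downloads: {}'.format(
            len(user_names), len(software_list), download_count)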