+
+
+
+
+
+
+ The Offensive Web Application Penetration Testing Framework.
+
+ Photon Wiki •
+ How To Use •
+ Compatibility •
+ Photon Library •
+ Contribution •
+ Roadmap
+
+
+### Key Features
+
+#### Data Extraction
+Photon can extract the following data while crawling:
+
+- URLs (in-scope & out-of-scope)
+- URLs with parameters (`example.com/gallery.php?id=2`)
+- Intel (emails, social media accounts, amazon buckets etc.)
+- Files (pdf, png, xml etc.)
+- Secret keys (auth/API keys & hashes)
+- JavaScript files & endpoints present in them
+- Strings matching custom regex pattern
+- Subdomains & DNS related data
+
+The extracted information is saved in an organized manner or can be [exported as json](https://github.com/s0md3v/Photon/wiki/Usage#export-formatted-result).
+
+#### Flexible
+Control timeout, delay, add seeds, exclude URLs matching a regex pattern, and other cool stuff.
+The extensive range of [options](https://github.com/s0md3v/Photon/wiki/Usage) provided by Photon lets you crawl the web exactly the way you want.
+
+#### Genius
+Photon's smart thread management & refined logic gives you top-notch performance.
+
+Still, crawling can be resource intensive, but Photon has some tricks up its sleeve. You can fetch URLs archived by [archive.org](https://archive.org/) to be used as seeds with the `--wayback` option.
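+
+For example, a crawl that tunes the thread count and request delay, skips URLs matching a
+pattern, and pulls extra seeds from archive.org could look like the sketch below (flag names
+as documented in the [wiki](https://github.com/s0md3v/Photon/wiki/Usage) — verify with `--help`):
+
+```bash
+# 10 threads, 1 second delay between requests, ignore logout links,
+# and use archived URLs from archive.org as additional seeds
+$ python photon.py -u "https://example.com/" -t 10 -d 1 --exclude="logout" --wayback
+```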
+
+#### Plugins
+- **[wayback](https://github.com/s0md3v/Photon/wiki/Usage#use-urls-from-archiveorg-as-seeds)**
+- **[dnsdumpster](https://github.com/s0md3v/Photon/wiki/Usage#dumping-dns-data)**
+- **[Exporter](https://github.com/s0md3v/Photon/wiki/Usage#export-formatted-result)**
+
+#### Docker
+
+Photon can be launched using a lightweight Python-Alpine (103 MB) Docker image.
+
+```bash
+$ git clone https://github.com/s0md3v/Photon.git
+$ cd Photon
+$ docker build -t photon .
+$ docker run -it --name photon photon:latest -u google.com
+```
+
+To view the results, you can either head over to the local docker volume, which you can find by running `docker inspect photon`, or mount the target loot folder:
+
+```bash
+$ docker run -it --name photon -v "$PWD:/Photon/google.com" photon:latest -u google.com
+```
+
+#### Frequent & Seamless Updates
+Photon is under heavy development; updates that fix bugs, optimize performance & add new features are rolled out regularly.
+
+If you would like to see the features and issues that are being worked on, you can do that on the [Development](https://github.com/s0md3v/Photon/projects/1) project board.
+
+Updates can be checked for & installed with the `--update` option. Photon has seamless update capabilities, which means you can update Photon without losing any of your saved data.
+
+### Contribution & License
+You can contribute in the following ways:
+
+- Report bugs
+- Develop plugins
+- Add more "APIs" for ninja mode
+- Give suggestions to make it better
+- Fix issues & submit a pull request
+
+Please read the [guidelines](https://github.com/s0md3v/Photon/wiki/Guidelines) before submitting a pull request or issue.
+
+Do you want to have a conversation in private? Hit me up on my [twitter](https://twitter.com/s0md3v/), inbox is open :)
+
+**Photon** is licensed under [GPL v3.0 license](https://www.gnu.org/licenses/gpl-3.0.en.html)
diff --git a/core/lib/Photon/core/__init__.py b/core/lib/Photon/core/__init__.py
new file mode 100644
index 00000000..0d75f5e1
--- /dev/null
+++ b/core/lib/Photon/core/__init__.py
@@ -0,0 +1 @@
+"""The Photon core."""
diff --git a/core/lib/Photon/core/colors.py b/core/lib/Photon/core/colors.py
new file mode 100644
index 00000000..f549f64b
--- /dev/null
+++ b/core/lib/Photon/core/colors.py
@@ -0,0 +1,17 @@
+import sys
+
+if sys.platform.lower().startswith(('os', 'win', 'darwin', 'ios')):
+    # Colors shouldn't be displayed on Mac and Windows
+    end = red = white = green = yellow = back = run = bad = good = info = que = ''
+else:
+    white = '\033[97m'
+    green = '\033[92m'
+    red = '\033[91m'
+    yellow = '\033[93m'
+    end = '\033[0m'
+    back = '\033[7;91m'
+    info = '\033[93m[!]\033[0m'
+    que = '\033[94m[?]\033[0m'
+    bad = '\033[91m[-]\033[0m'
+    good = '\033[92m[+]\033[0m'
+    run = '\033[97m[~]\033[0m'
diff --git a/core/lib/Photon/core/config.py b/core/lib/Photon/core/config.py
new file mode 100644
index 00000000..0a54b9ab
--- /dev/null
+++ b/core/lib/Photon/core/config.py
@@ -0,0 +1,27 @@
+"""Configuration options for Photon."""
+
+VERBOSE = False
+
+INTELS = [
+    'facebook.com',
+    'github.com',
+    'instagram.com',
+    'youtube.com',
+]
+
+BAD_TYPES = (
+    'bmp',
+    'css',
+    'csv',
+    'docx',
+    'ico',
+    'jpeg',
+    'jpg',
+    'js',
+    'json',
+    'pdf',
+    'png',
+    'svg',
+    'xls',
+    'xml',
+)
diff --git a/core/lib/Photon/core/flash.py b/core/lib/Photon/core/flash.py
new file mode 100644
index 00000000..0741fbba
--- /dev/null
+++ b/core/lib/Photon/core/flash.py
@@ -0,0 +1,25 @@
+# from __future__ import print_function
+import concurrent.futures
+import sys
+import os
+from os import path
+sys.path.append(os.path.abspath('.'))
+
+from .colors import info
+
+def flash(function, links, thread_count):
+    """Process the URLs, using a thread pool to execute a function."""
+    # Convert links (set) to list
+    links = list(links)
+    threadpool = concurrent.futures.ThreadPoolExecutor(
+        max_workers=thread_count)
+    futures = (threadpool.submit(function, link) for link in links)
+    for i, _ in enumerate(concurrent.futures.as_completed(futures)):
+        if i + 1 == len(links) or (i + 1) % thread_count == 0:
+            print('%s Progress: %i/%i' % (info, i + 1, len(links)))
+    print('')
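+
+# Example (sketch): process a set of links with 10 worker threads, where
+# `fetch(url)` is any callable taking a single URL (e.g. a requester wrapper):
+#
+#     flash(fetch, {'http://example.com/a', 'http://example.com/b'}, 10)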
diff --git a/core/lib/Photon/core/mirror.py b/core/lib/Photon/core/mirror.py
new file mode 100644
index 00000000..8dfebe06
--- /dev/null
+++ b/core/lib/Photon/core/mirror.py
@@ -0,0 +1,39 @@
+import os
+
+
+def mirror(url, response):
+    if response != 'dummy':
+        clean_url = url.replace('http://', '').replace('https://', '').rstrip('/')
+        parts = clean_url.split('?')[0].split('/')
+        root = parts[0]
+        webpage = parts[-1]
+        parts.remove(root)
+        try:
+            parts.remove(webpage)
+        except ValueError:
+            pass
+        prefix = root + '_mirror'
+        try:
+            os.mkdir(prefix)
+        except OSError:
+            pass
+        suffix = ''
+        if parts:
+            for directory in parts:
+                suffix += directory + '/'
+                try:
+                    os.mkdir(prefix + '/' + suffix)
+                except OSError:
+                    pass
+        path = prefix + '/' + suffix
+        trail = ''
+        if '.' not in webpage:
+            trail += '.html'
+        if webpage == root:
+            name = 'index.html'
+        else:
+            name = webpage
+        if len(url.split('?')) > 1:
+            trail += '?' + url.split('?')[1]
+        with open(path + name + trail, 'wb') as out_file:  # binary mode matches the encoded bytes below
+            out_file.write(response.encode('utf-8'))
diff --git a/core/lib/Photon/core/prompt.py b/core/lib/Photon/core/prompt.py
new file mode 100644
index 00000000..0e3e84bf
--- /dev/null
+++ b/core/lib/Photon/core/prompt.py
@@ -0,0 +1,25 @@
+"""Support for an input prompt."""
+import os
+import tempfile
+
+
+def prompt(default=None):
+    """Present the user a prompt."""
+    editor = 'nano'
+    with tempfile.NamedTemporaryFile(mode='r+') as tmpfile:
+        if default:
+            tmpfile.write(default)
+            tmpfile.flush()
+
+        child_pid = os.fork()
+        is_child = child_pid == 0
+
+        if is_child:
+            os.execvp(editor, [editor, tmpfile.name])
+        else:
+            os.waitpid(child_pid, 0)
+            tmpfile.seek(0)
+            return tmpfile.read().strip()
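+
+# Note: prompt() relies on os.fork()/os.execvp(), so it is POSIX-only.
+# Example (sketch): seeds = prompt(default='http://example.com/').split('\n')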
diff --git a/core/lib/Photon/core/regex.py b/core/lib/Photon/core/regex.py
new file mode 100644
index 00000000..570742d8
--- /dev/null
+++ b/core/lib/Photon/core/regex.py
@@ -0,0 +1,240 @@
+import re
+
+# regex taken from https://github.com/InQuest/python-iocextract
+# Reusable end punctuation regex.
+END_PUNCTUATION = r"[\.\?>\"'\)!,}:;\u201d\u2019\uff1e\uff1c\]]*"
+
+# Reusable regex for symbols commonly used to defang.
+SEPARATOR_DEFANGS = r"[\(\)\[\]{}<>\\]"
+
+# Split URLs on some characters that may be valid, but may also be garbage.
+URL_SPLIT_STR = r"[>\"'\),};]"
+
+# Get basic url format, including a few obfuscation techniques, main anchor is the uri scheme.
+GENERIC_URL = re.compile(r"""
+    (
+        # Scheme.
+        [fhstu]\S\S?[px]s?
+        # One of these delimiters/defangs.
+        (?:
+            :\/\/|
+            :\\\\|
+            :?__
+        )
+        # Any number of defang characters.
+        (?:
+            \x20|
+            """ + SEPARATOR_DEFANGS + r"""
+        )*
+        # Domain/path characters.
+        \w
+        \S+?
+        # CISCO ESA style defangs followed by domain/path characters.
+        (?:\x20[\/\.][^\.\/\s]\S*?)*
+    )
+    """ + END_PUNCTUATION + r"""
+    (?=\s|$)
+    """, re.IGNORECASE | re.VERBOSE | re.UNICODE)
+
+# Get some obfuscated urls, main anchor is brackets around the period.
+BRACKET_URL = re.compile(r"""
+    \b
+    (
+        [\.\:\/\\\w\[\]\(\)-]+
+        (?:
+            \x20?
+            [\(\[]
+            \x20?
+            \.
+            \x20?
+            [\]\)]
+            \x20?
+            \S*?
+        )+
+    )
+    """ + END_PUNCTUATION + r"""
+    (?=\s|$)
+    """, re.VERBOSE | re.UNICODE)
+
+# Get some obfuscated urls, main anchor is backslash before a period.
+BACKSLASH_URL = re.compile(r"""
+    \b
+    (
+        [\:\/\\\w\[\]\(\)-]+
+        (?:
+            \x20?
+            \\?\.
+            \x20?
+            \S*?
+        )*?
+        (?:
+            \x20?
+            \\\.
+            \x20?
+            \S*?
+        )
+        (?:
+            \x20?
+            \\?\.
+            \x20?
+            \S*?
+        )*
+    )
+    """ + END_PUNCTUATION + r"""
+    (?=\s|$)
+    """, re.VERBOSE | re.UNICODE)
+
+# Get hex-encoded urls.
+HEXENCODED_URL = re.compile(r"""
+    (
+        [46][86]
+        (?:[57]4)?
+        [57]4[57]0
+        (?:[57]3)?
+        3a2f2f
+        (?:2[356def]|3[0-9adf]|[46][0-9a-f]|[57][0-9af])+
+    )
+    (?:[046]0|2[0-2489a-c]|3[bce]|[57][b-e]|[8-f][0-9a-f]|0a|0d|09|[
+        \x5b-\x5d\x7b\x7d\x0a\x0d\x20
+    ]|$)
+    """, re.IGNORECASE | re.VERBOSE)
+
+# Get urlencoded urls.
+URLENCODED_URL = re.compile(r"""
+    (s?[hf]t?tps?%3A%2F%2F\w[\w%-]*?)(?:[^\w%-]|$)
+    """, re.IGNORECASE | re.VERBOSE)
+
+# Get base64-encoded urls.
+B64ENCODED_URL = re.compile(r"""
+    (
+        # b64re '([hH][tT][tT][pP][sS]|[hH][tT][tT][pP]|[fF][tT][pP])://'
+        # Modified to ignore whitespace.
+        (?:
+            [\x2b\x2f-\x39A-Za-z]\s*[\x2b\x2f-\x39A-Za-z]\s*[\x31\x35\x39BFJNRVZdhlptx]\s*[Gm]\s*[Vd]\s*[FH]\s*[A]\s*\x36\s*L\s*y\s*[\x2b\x2f\x38-\x39]\s*|
+            [\x2b\x2f-\x39A-Za-z]\s*[\x2b\x2f-\x39A-Za-z]\s*[\x31\x35\x39BFJNRVZdhlptx]\s*[Io]\s*[Vd]\s*[FH]\s*[R]\s*[Qw]\s*[O]\s*i\s*\x38\s*v\s*[\x2b\x2f-\x39A-Za-z]\s*|
+            [\x2b\x2f-\x39A-Za-z]\s*[\x2b\x2f-\x39A-Za-z]\s*[\x31\x35\x39BFJNRVZdhlptx]\s*[Io]\s*[Vd]\s*[FH]\s*[R]\s*[Qw]\s*[Uc]\s*[z]\s*o\s*v\s*L\s*[\x2b\x2f-\x39w-z]\s*|
+            [\x2b\x2f-\x39A-Za-z]\s*[\x30\x32EGUWkm]\s*[Z]\s*[\x30U]\s*[Uc]\s*[D]\s*o\s*v\s*L\s*[\x2b\x2f-\x39w-z]\s*|
+            [\x2b\x2f-\x39A-Za-z]\s*[\x30\x32EGUWkm]\s*[h]\s*[\x30U]\s*[Vd]\s*[FH]\s*[A]\s*\x36\s*L\s*y\s*[\x2b\x2f\x38-\x39]\s*|
+            [\x2b\x2f-\x39A-Za-z]\s*[\x30\x32EGUWkm]\s*[h]\s*[\x30U]\s*[Vd]\s*[FH]\s*[B]\s*[Tz]\s*[O]\s*i\s*\x38\s*v\s*[\x2b\x2f-\x39A-Za-z]\s*|
+            [RZ]\s*[ln]\s*[R]\s*[Qw]\s*[O]\s*i\s*\x38\s*v\s*[\x2b\x2f-\x39A-Za-z]\s*|
+            [Sa]\s*[FH]\s*[R]\s*[\x30U]\s*[Uc]\s*[D]\s*o\s*v\s*L\s*[\x2b\x2f-\x39w-z]\s*|
+            [Sa]\s*[FH]\s*[R]\s*[\x30U]\s*[Uc]\s*[FH]\s*[M]\s*\x36\s*L\s*y\s*[\x2b\x2f\x38-\x39]\s*
+        )
+        # Up to 260 characters (pre-encoding, reasonable URL length).
+        [A-Za-z0-9+/=\s]{1,357}
+    )
+    (?=[^A-Za-z0-9+/=\s]|$)
+    """, re.VERBOSE)
+
+# Get some valid obfuscated ip addresses.
+IPV4 = re.compile(r"""
+    (?:^|
+        (?![^\d\.])
+    )
+    (?:
+        (?:[1-9]?\d|1\d\d|2[0-4]\d|25[0-5])
+        [\[\(\\]*?\.[\]\)]*?
+    ){3}
+    (?:[1-9]?\d|1\d\d|2[0-4]\d|25[0-5])
+    (?:(?=[^\d\.])|$)
+    """, re.VERBOSE)
+
+# Experimental IPv6 regex, will not catch everything but should be sufficient for now.
+IPV6 = re.compile(r"""
+    \b(?:[a-f0-9]{1,4}:|:){2,7}(?:[a-f0-9]{1,4}|:)\b
+    """, re.IGNORECASE | re.VERBOSE)
+
+# Capture email addresses including common defangs.
+EMAIL = re.compile(r"""
+    (
+        [a-z0-9_.+-]+
+        [\(\[{\x20]*
+        (?:@|\Wat\W)
+        [\)\]}\x20]*
+        [a-z0-9-]+
+        (?:
+            (?:
+                (?:
+                    \x20*
+                    """ + SEPARATOR_DEFANGS + r"""
+                    \x20*
+                )*
+                \.
+                (?:
+                    \x20*
+                    """ + SEPARATOR_DEFANGS + r"""
+                    \x20*
+                )*
+                |
+                \W+dot\W+
+            )
+            [a-z0-9-]+?
+        )+
+    )
+    """ + END_PUNCTUATION + r"""
+    (?=\s|$)
+    """, re.IGNORECASE | re.VERBOSE | re.UNICODE)
+
+MD5 = re.compile(r"(?:[^a-fA-F\d]|\b)([a-fA-F\d]{32})(?:[^a-fA-F\d]|\b)")
+SHA1 = re.compile(r"(?:[^a-fA-F\d]|\b)([a-fA-F\d]{40})(?:[^a-fA-F\d]|\b)")
+SHA256 = re.compile(r"(?:[^a-fA-F\d]|\b)([a-fA-F\d]{64})(?:[^a-fA-F\d]|\b)")
+SHA512 = re.compile(
+    r"(?:[^a-fA-F\d]|\b)([a-fA-F\d]{128})(?:[^a-fA-F\d]|\b)")
+
+# YARA regex.
+YARA_PARSE = re.compile(r"""
+    (?:^|\s)
+    (
+        (?:
+            \s*?import\s+?"[^\r\n]*?[\r\n]+|
+            \s*?include\s+?"[^\r\n]*?[\r\n]+|
+            \s*?//[^\r\n]*[\r\n]+|
+            \s*?/\*.*?\*/\s*?
+        )*
+        (?:
+            \s*?private\s+|
+            \s*?global\s+
+        )*
+        rule\s*?
+        \w+\s*?
+        (?:
+            :[\s\w]+
+        )?
+        \s+\{
+        .*?
+        condition\s*?:
+        .*?
+        \s*\}
+    )
+    (?:$|\s)
+    """, re.MULTILINE | re.DOTALL | re.VERBOSE)
+
+CREDIT_CARD = re.compile(r"[0-9]{4}[ ]?[-]?[0-9]{4}[ ]?[-]?[0-9]{4}[ ]?[-]?[0-9]{4}")
+
+rintels = [(GENERIC_URL, "GENERIC_URL"),
+           (BRACKET_URL, "BRACKET_URL"),
+           (BACKSLASH_URL, "BACKSLASH_URL"),
+           (HEXENCODED_URL, "HEXENCODED_URL"),
+           (URLENCODED_URL, "URLENCODED_URL"),
+           (B64ENCODED_URL, "B64ENCODED_URL"),
+           (IPV4, "IPV4"),
+           (IPV6, "IPV6"),
+           (EMAIL, "EMAIL"),
+           (MD5, "MD5"),
+           (SHA1, "SHA1"),
+           (SHA256, "SHA256"),
+           (SHA512, "SHA512"),
+           (YARA_PARSE, "YARA_PARSE"),
+           (CREDIT_CARD, "CREDIT_CARD")]
+
+
+rscript = re.compile(r'<(script|SCRIPT).*(src|SRC)=([^\s>]+)')
+rhref = re.compile(r'<[aA].*(href|HREF)=([^\s>]+)')
+rendpoint = re.compile(r'[\'"](/.*?)[\'"]|[\'"](http.*?)[\'"]')
+rentropy = re.compile(r'[\w-]{16,45}')
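+
+# Example (sketch): run every intel pattern over a fetched response body:
+#
+#     for pattern, name in rintels:
+#         for match in pattern.findall(response_text):
+#             print(name, match)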
diff --git a/core/lib/Photon/core/requester.py b/core/lib/Photon/core/requester.py
new file mode 100644
index 00000000..9711c18e
--- /dev/null
+++ b/core/lib/Photon/core/requester.py
@@ -0,0 +1,78 @@
+import random
+import time
+
+import requests
+from requests.exceptions import TooManyRedirects
+
+
+SESSION = requests.Session()
+SESSION.max_redirects = 3
+
+def requester(
+        url,
+        main_url=None,
+        delay=0,
+        cook=None,
+        headers=None,
+        timeout=10,
+        host=None,
+        proxies=[None],
+        user_agents=[None],
+        failed=None,
+        processed=None
+    ):
+    """Handle the requests and return the response body."""
+    cook = cook or set()
+    headers = headers or set()
+    user_agents = user_agents or ['Photon']
+    failed = failed or set()
+    processed = processed or set()
+    # Mark the URL as crawled
+    processed.add(url)
+    # Pause/sleep the program for specified time
+    time.sleep(delay)
+
+    def make_request(url):
+        """Default request"""
+        final_headers = headers or {
+            'Host': host,
+            # Selecting a random user-agent
+            'User-Agent': random.choice(user_agents),
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip',
+            'DNT': '1',
+            'Connection': 'close',
+        }
+        try:
+            response = SESSION.get(
+                url,
+                cookies=cook,
+                headers=final_headers,
+                verify=False,
+                timeout=timeout,
+                stream=True,
+                proxies=random.choice(proxies)
+            )
+        except TooManyRedirects:
+            return 'dummy'
+
+        if 'text/html' in response.headers.get('content-type', '') or \
+           'text/plain' in response.headers.get('content-type', ''):
+            if response.status_code != 404:
+                return response.text
+            else:
+                response.close()
+                failed.add(url)
+                return 'dummy'
+        else:
+            response.close()
+            return 'dummy'
+
+    return make_request(url)
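+
+# Example (sketch): fetch a page with a 1-second delay and a random UA;
+# returns the body as text, or the string 'dummy' on failure:
+#
+#     html = requester('https://example.com/', host='example.com',
+#                      user_agents=['Photon'], delay=1)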
+ """ + print('%s Checking for updates' % run) + # Changes must be separated by ; + changes = '''major bug fixes;removed ninja mode;dropped python < 3.2 support;fixed unicode output;proxy support;more intels''' + latest_commit = requester('https://raw.githubusercontent.com/s0md3v/Photon/master/core/updater.py', host='raw.githubusercontent.com') + # Just a hack to see if a new version is available + if changes not in latest_commit: + changelog = re.search(r"changes = '''(.*?)'''", latest_commit) + # Splitting the changes to form a list + changelog = changelog.group(1).split(';') + print('%s A new version of Photon is available.' % good) + print('%s Changes:' % info) + for change in changelog: # print changes + print('%s>%s %s' % (green, end, change)) + + current_path = os.getcwd().split('/') # if you know it, you know it + folder = current_path[-1] # current directory name + path = '/'.join(current_path) # current directory path + choice = input('%s Would you like to update? [Y/n] ' % que).lower() + + if choice != 'n': + print('%s Updating Photon' % run) + os.system('git clone --quiet https://github.com/s0md3v/Photon %s' + % (folder)) + os.system('cp -r %s/%s/* %s && rm -r %s/%s/ 2>/dev/null' + % (path, folder, path, path, folder)) + print('%s Update successful!' % good) + else: + print('%s Photon is up to date!' % good) diff --git a/core/lib/Photon/core/user-agents.txt b/core/lib/Photon/core/user-agents.txt new file mode 100644 index 00000000..dc25d833 --- /dev/null +++ b/core/lib/Photon/core/user-agents.txt @@ -0,0 +1,18 @@ +Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 5.0) Opera 7.02 Bork-edition [en] +Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1.0.3705) +Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729) +Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html) +Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0) +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9 +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246 +Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; FSL 7.0.7.01001) +Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1 +Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1 +Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.02 +Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36 +Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727) +Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0 +Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36 +Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1 +Mozilla/5.0 (X11; U; Linux x86_64; de; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8 +Opera/9.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.01 diff --git a/core/lib/Photon/core/utils.py b/core/lib/Photon/core/utils.py new file mode 100644 index 00000000..3c6a7fe4 --- /dev/null +++ b/core/lib/Photon/core/utils.py @@ -0,0 +1,208 @@ +import requests +import math +import os.path +import re +import argparse +import sys +sys.path.append(os.path.abspath('.')) + +import tld + +from .colors import info +from .config 
diff --git a/core/lib/Photon/core/utils.py b/core/lib/Photon/core/utils.py
new file mode 100644
index 00000000..3c6a7fe4
--- /dev/null
+++ b/core/lib/Photon/core/utils.py
@@ -0,0 +1,208 @@
+import requests
+import math
+import os.path
+import re
+import argparse
+import sys
+sys.path.append(os.path.abspath('.'))
+
+import tld
+
+from .colors import info
+
+from .config import VERBOSE, BAD_TYPES
+
+from urllib.parse import urlparse
+
+
+def regxy(pattern, response, supress_regex, custom):
+    """Extract a string based on regex pattern supplied by user."""
+    try:
+        matches = re.findall(r'%s' % pattern, response)
+        for match in matches:
+            verb('Custom regex', match)
+            custom.add(match)
+    except Exception:
+        supress_regex = True
+
+
+def is_link(url, processed, files):
+    """
+    Determine whether or not a link should be crawled
+    A url should not be crawled if it
+        - Is a file
+        - Has already been crawled
+
+    Args:
+        url: str Url to be processed
+        processed: list[str] List of urls that have already been crawled
+
+    Returns:
+        bool If `url` should be crawled
+    """
+    if url not in processed:
+        if url.startswith('#') or url.startswith('javascript:'):
+            return False
+        is_file = url.endswith(BAD_TYPES)
+        if is_file:
+            files.add(url)
+            return False
+        return True
+    return False
+
+
+def remove_regex(urls, regex):
+    """
+    Parse a list for non-matches to a regex.
+
+    Args:
+        urls: iterable of urls
+        regex: string regex to be parsed for
+
+    Returns:
+        list of strings not matching regex
+    """
+
+    if not regex:
+        return urls
+
+    # To avoid iterating over the characters of a string
+    if not isinstance(urls, (list, set, tuple)):
+        urls = [urls]
+
+    try:
+        non_matching_urls = [url for url in urls if not re.search(regex, url)]
+    except TypeError:
+        return []
+
+    return non_matching_urls
+
+
+def writer(datasets, dataset_names, output_dir):
+    """Write the results."""
+    for dataset, dataset_name in zip(datasets, dataset_names):
+        if dataset:
+            filepath = output_dir + '/' + dataset_name + '.txt'
+            with open(filepath, 'w+') as out_file:
+                joined = '\n'.join(dataset)
+                out_file.write(str(joined.encode('utf-8').decode('utf-8')))
+                out_file.write('\n')
+
+
+def timer(diff, processed):
+    """Return the passed time."""
+    # Changes seconds into minutes and seconds
+    minutes, seconds = divmod(diff, 60)
+    try:
+        # Finds average time taken by requests
+        time_per_request = diff / float(len(processed))
+    except ZeroDivisionError:
+        time_per_request = 0
+    return minutes, seconds, time_per_request
+
+
+def entropy(string):
+    """Calculate the entropy of a string."""
+    entropy = 0
+    for number in range(256):
+        result = float(string.encode('utf-8').count(
+            chr(number).encode())) / len(string.encode('utf-8'))
+        if result != 0:
+            entropy = entropy - result * math.log(result, 2)
+    return entropy
+
+
+def xml_parser(response):
+    """Extract links from .xml files."""
+    # Regex for extracting URLs
+    return re.findall(r'<loc>(.*?)</loc>', response)
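+
+# Example (sketch): pull sitemap entries, then drop unwanted matches:
+#
+#     urls = remove_regex(xml_parser(sitemap_text), r'\.(png|jpg)$')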
" - self.assertSoupEquals(markup) - - soup = self.soup(markup) - comment = soup.find(text="foobar") - self.assertEqual(comment.__class__, Comment) - - # The comment is properly integrated into the tree. - foo = soup.find(text="foo") - self.assertEqual(comment, foo.next_element) - baz = soup.find(text="baz") - self.assertEqual(comment, baz.previous_element) - - def test_preserved_whitespace_in_pre_and_textarea(self): - """Whitespace must be preserved inand