diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..aaffc10 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Andre Fritsche + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..5181ee0 --- /dev/null +++ b/README.md @@ -0,0 +1,75 @@ +# Pastebin Scraper + +The scraper here is based on the one from [six519](https://github.com/six519/PastebinPython). Thank you very much for giving us this present :-) + +## Installation + +### Via Github + +``` +cd /your/desired/directory +clone https://github.com/six519/PastebinPython.git +pip install -r requirements.txt +``` + +### Via pypi + +In this case you have to import the corresponding classes and methods to use it. I will try and make it somehow more usable this way. Just wanted to play around with pypi at first. 
+ + ``` +pip install pastebin-python-scraper +``` + +## Usage + +This fork of the PastebinPython project downloads all pastebin entries ... or well at least it starts downloading as much as it can. + +The results will be saved within a MongoDB collection. Another script will be triggered to identify keywords, which have to be provided up front. + +The second step results in new collections where each keyword gets one new collection and all found pastebin entries will be copied there. + +### 1. pastebin_scrape.py + +For this one to work well you need an API key. I bought lifetime access to the pastebin API a while ago for 29,99 USD. It doesn't make you poor. + +You will also need to update your Scraping IP, in order to make it work: [Change Scraping IP](https://pastebin.com/doc_scraping_api) + + python pastebin_scrape.py -v 1 \ # verbose mode + -db 1 \ # save to DB (without this, nothing will be saved) + -api <YOUR_API_KEY> \ + -mongodbhost <HOST> \ # default: localhost + -mongodbport <PORT> # default: 27017 + +### 2. pastebin_analyze.py + +Once step 1 has finished a cycle you want to analyze whatever there is which is of interest to you. So start writing a keyword list. One row, one keyword. + +When you have finished, start the analyze module: + + python pastebin_analyze.py -f <KEYWORD_FILE> \ + -mongodbhost <HOST> \ # default: localhost + -mongodbport <PORT> # default: 27017 + +Finally it will create collections for all of the keywords it found and copy the pastebin into that collection. There might also be empty collections. Sometimes you +just can't find anything you are searching for. + +### Access Data via Flask API + +Finally you can either write yourself a clean data retriever or you can use this Flask API implementation here: + +``` +# start it in debug mode first! +python pastebin_api.py -d \ + -mongodbhost <HOST> \ + -mongodbport <PORT> +``` + +Well there is only one API method. Grab yourself a browser or use curl: + +``` +http://localhost:5000/api/getpastebins/<keyword> +``` + +The result should be a nice JSON document collection. 
Maybe too large to handle for a browser. Anyway this is just intended for demonstration reasons. + +If you want to use that data somehow, you might find the JSON format handy and start to parse it for your own purpose. \ No newline at end of file diff --git a/keywords_example.txt b/keywords_example.txt new file mode 100644 index 0000000..edae0b2 --- /dev/null +++ b/keywords_example.txt @@ -0,0 +1,7 @@ +ip +malware +glock +android +ios +lenovo + diff --git a/pastebin_python/__init__.py b/pastebin_python/__init__.py index b2f7c81..300b6cd 100644 --- a/pastebin_python/__init__.py +++ b/pastebin_python/__init__.py @@ -1,14 +1,14 @@ -"""A complete pastebin.com API wrapper for Python - -.. moduleauthor:: Ferdinand Silva +""" +A complete pastebin.com API wrapper for Python. +Fork from Ferdinand Silva. """ -from pastebin import PastebinPython +from pastebin_python.pastebin import PastebinPython -__version__ = "1.2" -__app_name__ = "pastebin_python" +__version__ = "1.2.1" +__app_name__ = "pastebin_python_scraper" __description__ = "A complete pastebin.com API wrapper for Python" -__author__ = "Ferdinand Silva" -__author_email__ = "ferdinandsilva@ferdinandsilva.com" -__app_url__ = "http://ferdinandsilva.com/" -__download_url__ = "https://github.com/six519/PastebinPython" \ No newline at end of file +__author__ = "André Fritsche" +__author_email__ = "github@andresilaghi.com" +__app_url__ = "https://www.andresilaghi.com/" +__download_url__ = "https://github.com/ihgalis/PastebinPython" diff --git a/pastebin_python/pastebin.py b/pastebin_python/pastebin.py index a95d33d..44e609d 100644 --- a/pastebin_python/pastebin.py +++ b/pastebin_python/pastebin.py @@ -8,9 +8,9 @@ import re import requests from xml.dom.minidom import parseString -from pastebin_options import OPTION_PASTE, OPTION_LIST, OPTION_TRENDS, OPTION_DELETE, OPTION_USER_DETAILS -from pastebin_constants import PASTEBIN_API_POST_URL, PASTEBIN_API_LOGIN_URL, PASTEBIN_RAW_URL -from 
def scrapeMostRecent(self):
    """
    Fetch the listing of the most recent public pastes from the scraping API.

    You will need to have an API Key and a whitelisted IP configured on
    pastebin.com (https://pastebin.com/api_scraping_faq)

    :return: the raw response body exactly as ``__processRequest`` returns it
             (undecoded bytes), or None when pastebin answered with
             "Bad API request" or "No pastes found"
    """
    scrape_endpoint = PASTEBIN_URL_SCRAPE + "/api_scraping.php"
    print("Scraping ... on: " + scrape_endpoint)

    try:
        return self.__processRequest('GET', scrape_endpoint, None)
    except PastebinBadRequestException:
        print("PastebinBadRequest")
    except PastebinNoPastesException:
        # __processRequest raises this for a "No pastes found" answer; treat it
        # like any other empty poll so a long-running caller is not killed by it.
        print("PastebinNoPastes")

    return None
def _load_keywords(path):
    """Return the distinct, non-empty keywords listed one per row in *path*, in file order."""
    with open(path, "r") as keyword_file:
        stripped_rows = (row.strip() for row in keyword_file)
        # dict.fromkeys drops duplicates but keeps the first-seen order; blank
        # rows are skipped because "" is not a valid collection name.
        return list(dict.fromkeys(word for word in stripped_rows if word))


def _matching_keywords(document, keywords):
    """Return the keywords that appear as a space-separated token in title, user or content."""
    tokens = set()
    for field in ("title", "user", "content"):
        value = document.get(field)
        # Only text can be tokenised; a missing field or a non-string
        # placeholder value is skipped instead of crashing on .split().
        if isinstance(value, str):
            tokens.update(value.split(' '))

    return [keyword for keyword in keywords if keyword in tokens]


def main():
    """
    Sort the scraped pastebins into one MongoDB collection per keyword.

    Reads the connection settings and the keyword file path from the
    module-level ``args`` dictionary (keys ``mongodbhost``, ``mongodbport``
    and ``f``). Every keyword gets its own collection in the ``scrape``
    database; each document of ``scrape.pastebins`` whose title, user or
    content contains a keyword as a whole space-separated token is copied
    into that keyword's collection unless a document with the same ``key``
    is already stored there.

    :return: None
    """

    client = MongoClient(str(args['mongodbhost']), int(args['mongodbport']))
    db = client.scrape
    logger.info("MongoDB Connection created")

    keywords = _load_keywords(args['f'])

    # Ask the server once instead of once per keyword.
    existing_collections = set(db.list_collection_names())
    for keyword in keywords:
        if keyword not in existing_collections:
            db.create_collection(keyword)
            logger.info("MongoDB Collection new: " + keyword)

    for document in db.pastebins.find({}):
        for keyword in _matching_keywords(document, keywords):
            target = db[keyword]

            # Check whether the pastebin has been added already
            if target.find_one({"key": document['key']}) is None:
                logger.info("Entry found for key: " + str(document['key']))
                target.insert_one(document)
@app.errorhandler(404)
def not_found(error):
    """
    Answer requests for unknown pages with a JSON error body.

    :param error: the error object handed in by Flask (not used)
    :return: response carrying ``{"error": "Notfound"}`` and HTTP status 404
    """
    return make_response(jsonify({'error': 'Notfound'}), 404)


@app.route('/')
def get_index():
    """
    Describe the API when the root URL is requested.

    :return: JSON object with a single ``basic_info`` list holding the API
             version, the service name and the author
    """

    basic_info = [
        {
            # Use the module-level version constant so the reported version
            # cannot drift from it.
            'api': api_version,
            'name': 'PastebinPython Flask Accessing API',
            'author': 'Andre Fritsche / ihgalis'
        }
    ]

    return jsonify({'basic_info': basic_info})
# Seconds to wait between two polls of the scraping endpoint.
# NOTE(review): 60 is a conservative choice, roughly the pace the loop had
# when it slept one second per downloaded paste -- confirm against the
# pastebin scraping API guidance before lowering it.
_POLL_PAUSE_SECONDS = 60


def call_scrape_url(url, timeout=30):
    """
    Download one paste from its scrape URL and return it as text.

    :param url: The URL which should be called
    :param timeout: seconds to wait for the server before giving up
    :return: The content of the requested pastebin decoded as UTF-8
             (undecodable bytes are dropped), or None when the download failed
    """

    try:
        # The context manager closes the response even when read() fails.
        with urllib.request.urlopen(urllib.request.Request(url), timeout=timeout) as response:
            raw_content = response.read()
    except OSError as download_error:
        # urllib's URLError/HTTPError and socket timeouts are all OSError
        # subclasses; one failed paste must not stop the scraping loop.
        logger.error("Download of %s failed (%s) ... Jumping to next element.", url, download_error)
        return None

    return raw_content.decode(encoding='utf-8', errors='ignore')


def _identity_hash(paste):
    """Return the SHA-256 hex digest over the paste's metadata, used to recognise stored pastes."""
    fields = ('date', 'expire', 'full_url', 'key', 'scrape_url', 'size', 'syntax', 'title', 'user')
    joined = "".join(str(paste[field]) for field in fields)
    return hashlib.sha256(joined.encode()).hexdigest()


def main(args):
    """
    Poll the pastebin scraping API forever and store every new paste.

    :param args: arguments from argparse as a dictionary; reads the keys
                 ``v``, ``db``, ``api``, ``mongodbhost`` and ``mongodbport``
    :return: None
    """

    # argparse hands over strings ("1") while the defaults are ints (0), so
    # compare the textual form to cover both.
    logger.propagate = str(args['v']) == "1"
    save_to_db = str(args['db']) == "1"

    logger.info("Start Pastebin Analyzer")

    pbin = PastebinPython(api_dev_key=args['api'])

    # Fall back to the documented defaults instead of connecting to host 'None'.
    mongo_host = args.get('mongodbhost') or "localhost"
    mongo_port = int(args.get('mongodbport') or 27017)
    client = MongoClient(str(mongo_host), mongo_port)
    db = client.scrape
    logger.info("MongoDB Connection created")

    while True:
        try:
            data = pbin.scrapeMostRecent()

            if data:
                # Iterate through list (standard: 50 latest pastebins)
                for paste in json.loads(data.decode('utf8')):
                    paste['identityhash'] = _identity_hash(paste)

                    # Look the hash up before downloading, so already stored
                    # pastes cost neither a request nor the one-second pause.
                    if save_to_db and db.pastebins.find_one({"identityhash": paste['identityhash']},
                                                            {"identityhash": 1}):
                        logger.info("Iteam already scraped: " + paste['scrape_url'])
                        continue

                    paste['content'] = call_scrape_url(paste['scrape_url'])
                    logger.info("Downloading content of " + paste['scrape_url'])
                    time.sleep(1)

                    if save_to_db:
                        logger.info("Item added to db: " + paste['scrape_url'])
                        db.pastebins.insert_one(paste)

            else:
                logger.debug("No data arrived.")

        except PastebinBadRequestException:
            # Warning level, so the failure is visible with the INFO logging setup.
            logger.warning("Pastebin Bad Request - You're doing it wrong")

        except json.decoder.JSONDecodeError:
            logger.warning("JSON Decoding Error ... 'You can't always get what you want!'")
        else:
            logger.debug("No exception")
        finally:
            logger.info("End of Session!")

        # Pause after every cycle, including empty and failed ones, so the
        # loop never hammers the API.
        time.sleep(_POLL_PAUSE_SECONDS)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Pastebin Analyzer")

    parser.add_argument('-db',
                        help="If this is set entries are being added into the DB.\n"
                             "0 = no DB entries\n"
                             "1 = all Pastebin entries are written to DB",
                        default=0)

    parser.add_argument('-v',
                        help="Verbose mode.",
                        default=0)

    parser.add_argument('-api',
                        help="Pastebin API Key for Scraping.",
                        required=True)

    parser.add_argument('-mongodbhost',
                        help="A string with the URL to your MongoDB Server.",
                        default="localhost")

    parser.add_argument('-mongodbport',
                        help="THe port to which your MongoDB listens.",
                        default=27017)

    args = vars(parser.parse_args())

    main(args)
long_description=long_description, + long_description_content_type="text/markdown", author=pastebin_python.__author__, author_email=pastebin_python.__author_email__, - packages=['pastebin_python'], + packages=['pastebin_python', 'pastebin_python_scraper'], url=pastebin_python.__app_url__, + install_requires=[ + 'pymongo', + 'argparse', + 'logging', + 're', + 'time', + 'sys', + 'pprint', + 'flask', + 'bson', + 'requests' + ], + python_requires='>=3.6', classifiers=( - 'Development Status :: 4 - Beta', + 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'Natural Language :: English', 'Programming Language :: Python', - 'License :: Freeware', + 'License :: OSI Approved :: MIT License', ), download_url=pastebin_python.__download_url__, -) \ No newline at end of file +) diff --git a/sonar-project.properties b/sonar-project.properties new file mode 100644 index 0000000..a916374 --- /dev/null +++ b/sonar-project.properties @@ -0,0 +1,2 @@ +sonar.exclusions=doc/* +sonar.exclusions=doc/** \ No newline at end of file