
Commit 57b33c5 (1 parent: c94718a)

Added the function to scrape the most recent pastebins and created the first step: basic scraping. Still missing the duplicate check.

6 files changed (+217, -7 lines)

README.md

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
# Pastebin Scraper

The scraper here is based on the one from [six519](https://github.com/six519/PastebinPython). Thank you very much for giving us this present :-)

## Installation

Simply clone it:

```
cd /your/desired/directory
git clone https://github.com/six519/PastebinPython.git
cd PastebinPython
pip install -r requirements.txt
```

## Usage

This fork of the PastebinPython project downloads all pastebin entries ... or at least it starts downloading as many as it can.

The results are saved in a MongoDB collection. A second script is then triggered to identify keywords, which have to be provided up front.

The second step produces one new collection per keyword, and all matching pastebin entries are copied into it.

The third step is the accumulator. It identifies specific words (similar to step 2) but also special patterns like email addresses, Bitcoin addresses, URLs, IP addresses etc.
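
The accumulator itself is not part of this commit. As a rough, hypothetical sketch of the kind of pattern matching described for step 3 (the regexes below are illustrative assumptions, not the project's actual implementation):

```
import re

# Illustrative patterns only; the real accumulator may use different ones.
PATTERNS = {
    "email": re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+"),
    "ipv4": re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
    "url": re.compile(r"https?://\S+"),
    "bitcoin": re.compile(r"\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b"),
}

def accumulate(content):
    """Return every special pattern found in one pastebin's content."""
    return {name: pattern.findall(content) for name, pattern in PATTERNS.items()}
```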

### 1. pastebin_scrape.py

For this one to work well you need an API key. I bought lifetime access to the pastebin API a while ago for 29.99 USD. It doesn't make you poor.

You will also need to update your scraping IP in order to make it work: [Change Scraping IP](https://pastebin.com/doc_scraping_api)

`python pastebin_scrape.py -v 1 -db 1 -api <YOUR_PASTE_BIN_API_KEY>`

### 2. pastebin_analyze.py

### 3. pastebin_accumulate.py

### Access Data via Flask API

Finally, you can either write yourself a clean data retriever or use this Flask API implementation:

```
# start it in debug and verbose mode first!
python pastebin_api.py -d -v
```

Well, there is only one API method. Grab yourself a browser or use curl:

```
http://localhost:5000/api/getpastebins/<keyword>
```

The result should be a nice JSON document collection, possibly too large for a browser to handle. Anyway, this is only intended for demonstration purposes.

If you want to use that data somehow, you might find the JSON format handy and can start parsing it for your own purposes.
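
As a quick illustration (not part of this commit), the JSON returned by the Flask endpoint could be consumed like this; the keyword `bitcoin` is just a placeholder and the exact response shape depends on what the scraper stored:

```
import requests

# Hypothetical example: fetch all stored pastebins matching one keyword.
response = requests.get("http://localhost:5000/api/getpastebins/bitcoin")
pastebins = response.json()

# 'scrape_url' and 'content' are the fields the scraper writes to MongoDB;
# the API's exact response shape may differ.
for entry in pastebins:
    print(entry.get("scrape_url"), len(entry.get("content") or ""))
```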

pastebin_python/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 .. moduleauthor:: Ferdinand Silva <[email protected]>
 
 """
-from pastebin import PastebinPython
+from pastebin_python.pastebin import PastebinPython
 
 __version__ = "1.2"
 __app_name__ = "pastebin_python"

pastebin_python/pastebin.py

Lines changed: 23 additions & 5 deletions
@@ -8,9 +8,9 @@
 import re
 import requests
 from xml.dom.minidom import parseString
-from pastebin_options import OPTION_PASTE, OPTION_LIST, OPTION_TRENDS, OPTION_DELETE, OPTION_USER_DETAILS
-from pastebin_constants import PASTEBIN_API_POST_URL, PASTEBIN_API_LOGIN_URL, PASTEBIN_RAW_URL
-from pastebin_exceptions import PastebinBadRequestException, PastebinNoPastesException, PastebinFileException, PastebinHTTPErrorException
+from pastebin_python.pastebin_options import OPTION_PASTE, OPTION_LIST, OPTION_TRENDS, OPTION_DELETE, OPTION_USER_DETAILS
+from pastebin_python.pastebin_constants import PASTEBIN_API_POST_URL, PASTEBIN_API_LOGIN_URL, PASTEBIN_RAW_URL, PASTEBIN_URL_SCRAPE
+from pastebin_python.pastebin_exceptions import PastebinBadRequestException, PastebinNoPastesException, PastebinFileException, PastebinHTTPErrorException
 
 
 class PastebinPython(object):
@@ -150,9 +150,9 @@ def __processRequest(self, method, url, data):
         req = self.api_session.request(method, url, data=data)
 
         response = req.content
-        if re.search('^Bad API request', response):
+        if re.search('^Bad API request', response.decode('utf-8')):
             raise PastebinBadRequestException(response)
-        elif re.search('^No pastes found', response):
+        elif re.search('^No pastes found', response.decode('utf-8')):
             raise PastebinNoPastesException
 
         return response
@@ -394,3 +394,21 @@ def getPasteRawOutput(self, api_paste_key):
             retMsg = str(e)
 
         return retMsg.decode('utf-8')
+
+    def scrapeMostRecent(self):
+        """
+        Returns the most recent Pastebin posts. You will need to have an API key and a whitelisted IP
+        configured on pastebin.com (https://pastebin.com/api_scraping_faq)
+
+        :return: str
+        """
+        try:
+            print("Scraping ... on: " + PASTEBIN_URL_SCRAPE + "/api_scraping.php")
+            data = self.__processRequest('GET',
+                                          PASTEBIN_URL_SCRAPE + "/api_scraping.php",
+                                          None)
+            return data
+        except PastebinBadRequestException as e:
+            retMsg = str(e)
+            print("PastebinBadRequest")
+            return None
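
A minimal, hypothetical usage sketch of the new scrapeMostRecent() method (the API key is a placeholder, and your scraping IP must already be whitelisted on pastebin.com):

```
import json
from pastebin_python.pastebin import PastebinPython

pbin = PastebinPython(api_dev_key="<YOUR_PASTE_BIN_API_KEY>")

raw = pbin.scrapeMostRecent()  # raw response bytes, or None on a bad request
if raw is not None:
    pastes = json.loads(raw.decode("utf-8"))
    for paste in pastes:
        print(paste["scrape_url"])
```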

pastebin_python/pastebin_constants.py

Lines changed: 2 additions & 1 deletion
@@ -5,7 +5,8 @@
 .. moduleauthor:: Ferdinand Silva <[email protected]>
 
 """
-PASTEBIN_URL = "http://pastebin.com/" #: The pastebin.com base url
+PASTEBIN_URL_SCRAPE = "https://scrape.pastebin.com"
+PASTEBIN_URL = "https://pastebin.com/" #: The pastebin.com base url
 PASTEBIN_RAW_URL = "%s%s" % (PASTEBIN_URL, "raw.php?i=%s")
 PASTEBIN_API_URL = "%s%s" % (PASTEBIN_URL, "api/") #: The pastebin.com API base URL
 PASTEBIN_API_POST_URL = "%s%s" % (PASTEBIN_API_URL, "api_post.php") #: The pastebin.com API POST URL

pastebin_scrape.py

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
from pastebin_python.pastebin import PastebinPython
from pastebin_python.pastebin_exceptions import PastebinBadRequestException
from pymongo import MongoClient
from copy import deepcopy

import json
import urllib.request
import argparse
import logging
import time

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S')

logger = logging.getLogger(__name__)


def call_scrape_url(url):
    """
    Does all the URL calling: requests the given pastebin URL and returns its decoded content.

    :param url: The URL which should be called
    :return: The content of the requested pastebin, or None on a decoding error
    """

    try:
        request = urllib.request.Request(url)
        result = urllib.request.urlopen(request)
        result_text = result.read()
        text_encoded = result_text.decode(encoding='utf-8', errors='ignore')

        return text_encoded

    except json.decoder.JSONDecodeError:
        logger.error("JSON Decoding Error ... Jumping to next element.")
        return None


def main(args):
    """
    Regular main method: starts the entire process and interprets the arguments.

    :param args: arguments from argparse
    :return: None
    """

    if args['v'] == 0:
        logger.propagate = False
    elif args['v'] == 1:
        logger.propagate = True

    logger.info("Start Pastebin Analyzer")

    api_key = args['api']
    pbin = PastebinPython(api_dev_key=api_key)

    client = MongoClient(str(args['mongodbhost']), int(args['mongodbport']))
    db = client.scrape
    logger.info("MongoDB Connection created")

    while True:
        try:
            data = pbin.scrapeMostRecent()

            if data:
                json_data = data.decode('utf8')  # .replace("'", '"')
                final_data = json.loads(json_data)

                # Iterate through the list of scraped pastes
                for x in final_data:

                    # Pre-create the content key-value pair
                    x['content'] = 0

                    copy_of_x = deepcopy(x)
                    for key, value in copy_of_x.items():

                        if key == "scrape_url":

                            # value is the scrape_url of this paste
                            text_encoded = call_scrape_url(value)
                            time.sleep(1)

                            logger.info("Downloading content of " + value)

                            # Add content
                            x['content'] = text_encoded

                    ## TODO: Add some identity check

                    # DB save mode: write the entry when -db 1 was passed
                    if args['db'] == 1:
                        db.pastebins.insert_one(x)
            else:
                logger.debug("No data arrived.")

        except PastebinBadRequestException:
            logger.debug("Pastebin Bad Request - You're doing it wrong")

        except json.decoder.JSONDecodeError:
            logger.debug("JSON Decoding Error ... 'You can't always get what you want!'")
            continue
        else:
            logger.debug("No exception")
        finally:
            logger.info("End of Session!")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Pastebin Analyzer")

    parser.add_argument('-db',
                        type=int,
                        help="If this is set, entries are added to the DB.\n"
                             "0 = no DB entries\n"
                             "1 = all Pastebin entries are written to DB",
                        default=0)

    parser.add_argument('-v',
                        type=int,
                        help="Verbose mode.",
                        default=0)

    parser.add_argument('-api',
                        help="Pastebin API Key for Scraping.",
                        required=True)

    parser.add_argument('-mongodbhost',
                        help="A string with the URL to your MongoDB server.",
                        default="localhost")

    parser.add_argument('-mongodbport',
                        type=int,
                        help="The port to which your MongoDB listens.",
                        default=27017)

    args = vars(parser.parse_args())

    main(args)
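
The commit message notes that the duplicate check is still missing. One hypothetical way to fill the "## TODO: Add some identity check" gap, assuming each scraped entry carries a unique `key` field, would be a unique index in MongoDB (a sketch, not part of this commit):

```
from pymongo import MongoClient, errors

client = MongoClient("localhost", 27017)
db = client.scrape

# One-time setup: let MongoDB reject duplicate paste keys.
db.pastebins.create_index("key", unique=True)

def insert_if_new(entry):
    """Insert a scraped paste unless its 'key' is already stored."""
    try:
        db.pastebins.insert_one(entry)
        return True
    except errors.DuplicateKeyError:
        return False
```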

requirements.txt

Whitespace-only changes.
