Skip to content

Commit 25e0401

Browse files
committed
Added the identityhash in order to make it possible to identify items which have already been scraped
1 parent 57b33c5 commit 25e0401

File tree

1 file changed

+17
-7
lines changed

1 file changed

+17
-7
lines changed

pastebin_scrape.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import argparse
99
import logging
1010
import time
11+
import hashlib
1112

1213
logging.basicConfig(level=logging.INFO,
1314
format='%(asctime)s %(levelname)s %(message)s',
@@ -68,31 +69,40 @@ def main(args):
6869
json_data = data.decode('utf8') # .replace("'", '"')
6970
final_data = json.loads(json_data)
7071

71-
# Iterate through list
72+
# Iterate through list (standard: 50 latest pastebins)
7273
for x in final_data:
7374

7475
# Pre-create the content key-value pair
7576
x['content'] = 0
7677

78+
tohash = str(x['date']) + str(x['expire']) + str(x['full_url'] + str(x['key']) + str(x['scrape_url']) + str(x['size']) + str(x['syntax']) + str(x['title']) + str(x['user']))
79+
hobject = hashlib.sha256(tohash.encode())
80+
hash_string = str(hobject.hexdigest())
81+
x['identityhash'] = hash_string
82+
7783
copy_of_x = deepcopy(x)
7884
for key, value in copy_of_x.items():
7985

8086
if key == "scrape_url":
8187

8288
# value = scrape_url
8389
text_encoded = call_scrape_url(value)
84-
time.sleep(1)
85-
86-
logger.info("Downloading content of " + value)
8790

8891
# Add content
8992
x['content'] = text_encoded
9093

91-
## TODO: Add some identity check
94+
logger.info("Downloading content of " + value)
95+
time.sleep(1)
9296

93-
# DB Save mode args['db'] == 2
9497
if args['db'] == "1":
95-
db.pastebins.insert_one(x)
98+
99+
# save only if the hash is not found in the db
100+
if db.pastebins.find_one({ "identityhash": x['identityhash'] },{ "identityhash": 1}):
101+
logger.info("Iteam already scraped: " + x['scrape_url'])
102+
else:
103+
logger.info("Item added to db: " + x['scrape_url'])
104+
db.pastebins.insert_one(x)
105+
96106
else:
97107
logger.debug("No data arrived.")
98108

0 commit comments

Comments
 (0)