|
8 | 8 | import argparse
|
9 | 9 | import logging
|
10 | 10 | import time
|
| 11 | +import hashlib |
11 | 12 |
|
12 | 13 | logging.basicConfig(level=logging.INFO,
|
13 | 14 | format='%(asctime)s %(levelname)s %(message)s',
|
@@ -68,31 +69,40 @@ def main(args):
|
68 | 69 | json_data = data.decode('utf8') # .replace("'", '"')
|
69 | 70 | final_data = json.loads(json_data)
|
70 | 71 |
|
71 |
| - # Iterate through list |
| 72 | + # Iterate through list (standard: 50 latest pastebins) |
72 | 73 | for x in final_data:
|
73 | 74 |
|
74 | 75 | # Pre-create the content key-value pair
|
75 | 76 | x['content'] = 0
|
76 | 77 |
|
| 78 | +            tohash = str(x['date']) + str(x['expire']) + str(x['full_url']) + str(x['key']) + str(x['scrape_url']) + str(x['size']) + str(x['syntax']) + str(x['title']) + str(x['user']) |
| 79 | + hobject = hashlib.sha256(tohash.encode()) |
| 80 | + hash_string = str(hobject.hexdigest()) |
| 81 | + x['identityhash'] = hash_string |
| 82 | + |
77 | 83 | copy_of_x = deepcopy(x)
|
78 | 84 | for key, value in copy_of_x.items():
|
79 | 85 |
|
80 | 86 | if key == "scrape_url":
|
81 | 87 |
|
82 | 88 | # value = scrape_url
|
83 | 89 | text_encoded = call_scrape_url(value)
|
84 |
| - time.sleep(1) |
85 |
| - |
86 |
| - logger.info("Downloading content of " + value) |
87 | 90 |
|
88 | 91 | # Add content
|
89 | 92 | x['content'] = text_encoded
|
90 | 93 |
|
91 |
| - ## TODO: Add some identity check |
| 94 | + logger.info("Downloading content of " + value) |
| 95 | + time.sleep(1) |
92 | 96 |
|
93 |
| - # DB Save mode args['db'] == 2 |
94 | 97 | if args['db'] == "1":
|
95 |
| - db.pastebins.insert_one(x) |
| 98 | + |
| 99 | + # save only if the hash is not found in the db |
| 100 | + if db.pastebins.find_one({ "identityhash": x['identityhash'] },{ "identityhash": 1}): |
| 101 | +                logger.info("Item already scraped: " + x['scrape_url']) |
| 102 | + else: |
| 103 | + logger.info("Item added to db: " + x['scrape_url']) |
| 104 | + db.pastebins.insert_one(x) |
| 105 | + |
96 | 106 | else:
|
97 | 107 | logger.debug("No data arrived.")
|
98 | 108 |
|
|
0 commit comments