#!/usr/bin/env python3
2+ 
3+ """ 
4+ Write a function that returns a list of all the duplicate files.  
5+ 
6+ the first item is the duplicate file 
7+ the second item is the original file 
8+ For example: 
9+ 
10+   [('/tmp/parker_is_dumb.mpg', '/home/parker/secret_puppy_dance.mpg'), 
11+  ('/home/trololol.mov', '/etc/apache2/httpd.conf')] 
12+ You can assume each file was only duplicated once. 
13+ """ 
14+ 
15+ import  os 
16+ import  hashlib 
17+ 
def find_duplicate_files(starting_directory):
    """Walk starting_directory and return (duplicate, original) pairs.

    Two files are considered copies of each other when their sampled
    hashes match; of each matching pair, the file with the later
    modification time is treated as the duplicate.
    """
    duplicates = []
    hash_to_file = {}  # sample hash -> (mtime, path) of the presumed original
    paths_to_visit = [starting_directory]

    while paths_to_visit:
        path = paths_to_visit.pop()

        # Directories are expanded onto the stack; only plain files get hashed.
        if os.path.isdir(path):
            paths_to_visit.extend(
                os.path.join(path, entry) for entry in os.listdir(path)
            )
            continue

        digest = sample_hash_file(path)
        mtime = os.path.getmtime(path)

        previous = hash_to_file.get(digest)
        if previous is None:
            # First time we see this content: remember it as the original.
            hash_to_file[digest] = (mtime, path)
        else:
            previous_mtime, previous_path = previous
            if mtime > previous_mtime:
                # Current file was edited later, so it is the copy.
                duplicates.append((path, previous_path))
            else:
                # Current file is older: it becomes the original and the
                # previously-seen file is the copy.
                duplicates.append((previous_path, path))
                hash_to_file[digest] = (mtime, path)

    return duplicates
51+ 
52+ 
def sample_hash_file(path):
    """Return a SHA-512 hex digest built from samples of the file at *path*.

    Files smaller than three samples' worth of bytes are hashed in full;
    larger files are hashed from three evenly spaced 4000-byte samples
    (start, middle, end) so huge files need not be read entirely.
    """
    num_bytes_to_read_per_sample = 4000
    total_bytes = os.path.getsize(path)
    hasher = hashlib.sha512()

    with open(path, 'rb') as file:

        if total_bytes < num_bytes_to_read_per_sample * 3:
            # Small file: hash the whole contents in one go.
            hasher.update(file.read())
        else:
            # Gap between consecutive samples. Use floor division so the
            # seek offset stays an int — true division yields a float,
            # which file.seek() rejects on Python 3.
            num_bytes_between_samples = (
                (total_bytes - num_bytes_to_read_per_sample * 3) // 2
            )

            for offset_multiplier in range(3):
                start_of_sample = (
                    offset_multiplier
                    * (num_bytes_to_read_per_sample + num_bytes_between_samples)
                )
                file.seek(start_of_sample)
                sample = file.read(num_bytes_to_read_per_sample)
                hasher.update(sample)

    return hasher.hexdigest()