You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
	
	
		
			69 lines
		
	
	
		
			2.7 KiB
		
	
	
	
		
			Python
		
	
		
		
			
		
	
	
			69 lines
		
	
	
		
			2.7 KiB
		
	
	
	
		
			Python
		
	
|   
											11 months ago
										 | import os | ||
|  | from funcs import generate_phash  # Assuming this function computes the pHash and returns a string | ||
|  | import imagehash | ||
|  | 
 | ||
def get_files(directory):
    """Return the paths of all files found under ``directory``, recursively.

    Each entry is the walked folder joined with the file name, listed in the
    order ``os.walk`` visits them. Directories themselves are not included.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
    ]
|  | 
 | ||
# Build a path -> pHash mapping for a collection of image files
def compute_phashes(image_paths):
    """Map each path in ``image_paths`` to its perceptual hash object.

    ``generate_phash`` is expected to return the hash as a hex string, which
    is parsed with ``imagehash.hex_to_hash``. A path whose processing raises
    is reported on stdout and left out of the result, so one unreadable file
    does not abort the whole run.

    Returns a dict keyed by image path.
    """
    hashes_by_path = {}
    for image_path in image_paths:
        try:
            # Hash the file, then turn the hex string into a comparable hash object
            hashes_by_path[image_path] = imagehash.hex_to_hash(
                generate_phash(image_path)
            )
        except Exception as e:
            print(f"Error processing {image_path}: {e}")
    return hashes_by_path
|  | 
 | ||
# Collect candidate image files from 'ready_to_upload' and 'sorted'.
# Every file that is not an .mp4 video is treated as an image.
ready_images = get_files('ready_to_upload')
ready_images = [image for image in ready_images if not image.lower().endswith('.mp4')]

sorted_images = get_files('sorted')
sorted_images = [image for image in sorted_images if not image.lower().endswith('.mp4')]

# Compute pHashes for images in 'ready_to_upload'
print("Computing pHashes for 'ready_to_upload' images...")
ready_image_phashes = compute_phashes(ready_images)

# Compute pHashes for images in 'sorted'
print("Computing pHashes for 'sorted' images...")
sorted_image_phashes = compute_phashes(sorted_images)

# Prepare the 'already_processed' directory
os.makedirs('already_processed', exist_ok=True)

# Maximum Hamming distance at which two images count as duplicates
threshold = 5  # Adjust this value as needed

# Move every 'sorted' image that has a near-identical match in
# 'ready_to_upload' into 'already_processed', keeping its sub-folder layout.
for sorted_image, sorted_phash in sorted_image_phashes.items():
    for ready_image, ready_phash in ready_image_phashes.items():
        # Subtracting two hashes gives their Hamming distance
        try:
            distance = sorted_phash - ready_phash
        except TypeError as e:
            print(f"Error comparing hashes for {sorted_image} and {ready_image}: {e}")
            continue

        if distance <= threshold:
            # Swap only the leading 'sorted' directory for 'already_processed'.
            # A plain str.replace would also rewrite "sorted" wherever it
            # appears in sub-folder or file names (e.g. "unsorted/a.jpg").
            newpath = os.path.join(
                'already_processed',
                os.path.relpath(sorted_image, 'sorted'),
            )
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            print(f"Moving {sorted_image} (duplicate of {ready_image}) to 'already_processed'")
            os.rename(sorted_image, newpath)
            break  # One match is enough; the file has been moved
    else:
        # Inner loop finished without a break: nothing matched
        print(f"No duplicate found for {sorted_image}")