You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

69 lines
2.7 KiB
Python

11 months ago
import os
from funcs import generate_phash # Assuming this function computes the pHash and returns a string
import imagehash
def get_files(directory):
    """Return the path of every file found under ``directory``, recursively.

    Each entry is the walked folder joined with the file name, so paths are
    prefixed with ``directory`` exactly as it was passed in. Entries appear
    in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
    ]
# Function to compute pHashes for a list of image paths
def compute_phashes(image_paths):
    """Build a mapping of image path -> perceptual hash.

    For every path, ``generate_phash`` is called (presumably returning the
    hash as a hex string -- confirm against ``funcs``) and the result is
    parsed with ``imagehash.hex_to_hash``. A path whose processing raises
    any exception is reported on stdout and left out of the result, so one
    bad file does not abort the whole run.
    """
    hashes_by_path = {}
    for image_path in image_paths:
        try:
            # Hash and parse in one step; a failure in either stage is
            # handled the same way below.
            hashes_by_path[image_path] = imagehash.hex_to_hash(
                generate_phash(image_path)
            )
        except Exception as e:
            print(f"Error processing {image_path}: {e}")
    return hashes_by_path
# Script body: find files under 'sorted' whose perceptual hash is within
# `threshold` of any file under 'ready_to_upload', and move them into
# 'already_processed', keeping their sub-directory layout.

# Collect candidate files from both trees. Only .mp4 files are filtered out
# here; anything else is handed to compute_phashes, which reports and skips
# files it cannot process.
ready_images = get_files('ready_to_upload')
ready_images = [image for image in ready_images if not image.lower().endswith('.mp4')]
sorted_images = get_files('sorted')
sorted_images = [image for image in sorted_images if not image.lower().endswith('.mp4')]

# Compute pHashes for images in 'ready_to_upload'
print("Computing pHashes for 'ready_to_upload' images...")
ready_image_phashes = compute_phashes(ready_images)

# Compute pHashes for images in 'sorted'
print("Computing pHashes for 'sorted' images...")
sorted_image_phashes = compute_phashes(sorted_images)

# Prepare the 'already_processed' directory
os.makedirs('already_processed', exist_ok=True)

# Maximum Hamming distance at which two images count as duplicates
threshold = 5  # Adjust this value as needed

# Find and move duplicates
for sorted_image, sorted_phash in sorted_image_phashes.items():
    duplicate_found = False
    for ready_image, ready_phash in ready_image_phashes.items():
        # Subtracting two ImageHash objects gives their Hamming distance
        try:
            distance = sorted_phash - ready_phash
        except TypeError as e:
            # Hashes that cannot be compared are reported and skipped
            print(f"Error comparing hashes for {sorted_image} and {ready_image}: {e}")
            continue

        if distance <= threshold:
            # Build the destination from the path relative to the 'sorted'
            # root. A plain str.replace('sorted', ...) would also rewrite
            # every other occurrence of "sorted" in sub-directory or file
            # names (e.g. 'sorted/unsorted/sorted_1.jpg').
            relative_path = os.path.relpath(sorted_image, 'sorted')
            newpath = os.path.join('already_processed', relative_path)
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            print(f"Moving {sorted_image} (duplicate of {ready_image}) to 'already_processed'")
            os.rename(sorted_image, newpath)
            duplicate_found = True
            break  # One match is enough; the file has already been moved

    if not duplicate_found:
        print(f"No duplicate found for {sorted_image}")