from moviepy.editor import VideoFileClip
import os, cv2, hashlib, requests
from PIL import Image
import numpy as np
import imagehash

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}
proxies = {
    "http": "http://yehyuxsl-rotate:4tl5bvrwkz5e@p.webshare.io:80/",
    "https": "http://yehyuxsl-rotate:4tl5bvrwkz5e@p.webshare.io:80/",
}


def generate_phash(image_path):
    """Return the perceptual hash of an image as a hex string, or False on error."""
    try:
        image = Image.open(image_path)
        return str(imagehash.phash(image))
    except Exception as e:
        print(f"Error generating phash for {image_path}: {e}")
        return False


def cleanEmptyFolders(path):
    # Walk bottom-up so empty children are removed before their parents are checked.
    for root, dirs, files in os.walk(path, topdown=False):
        if not os.listdir(root):
            os.rmdir(root)


def get_files(directory):
    """Recursively collect every file path under a directory."""
    files = []
    for root, dirs, filenames in os.walk(directory):
        for filename in filenames:
            files.append(os.path.join(root, filename))
    return files


def compare_images(image_path1, image_path2):
    # Load the images in grayscale
    img1 = cv2.imread(image_path1, cv2.IMREAD_GRAYSCALE)
    img2 = cv2.imread(image_path2, cv2.IMREAD_GRAYSCALE)
    if img1 is None or img2 is None:
        print("Error loading images!")
        return False  # Or you could raise an exception

    # Initialize SIFT detector
    sift = cv2.SIFT_create()

    # Find keypoints and descriptors with SIFT
    kp1, des1 = sift.detectAndCompute(img1, None)
    kp2, des2 = sift.detectAndCompute(img2, None)

    # Bail out if either image produced no descriptors
    if des1 is None or des2 is None:
        return False

    # FLANN-based matcher (KD-tree index)
    index_params = dict(algorithm=1, trees=5)
    search_params = dict(checks=50)
    flann = cv2.FlannBasedMatcher(index_params, search_params)

    # Match descriptor vectors using KNN
    matches = flann.knnMatch(des1, des2, k=2)

    # Apply Lowe's ratio test (stringent 0.6 ratio)
    good = []
    for pair in matches:
        if len(pair) < 2:
            continue
        m, n = pair
        if m.distance < 0.6 * n.distance:
            good.append(m)

    # Minimum number of good matches before estimating a homography
    MIN_MATCH_COUNT = 15  # Adjust this threshold as needed
    if len(good) > MIN_MATCH_COUNT:
        # Extract locations of the good matches
        src_pts = np.float32([kp1[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
        dst_pts = np.float32([kp2[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)

        # Find the homography and count the inliers that agree with it
        M, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
        if mask is None:
            return False
        matchesMask = mask.ravel().tolist()
        return np.sum(matchesMask) > 10  # Enough points agree on the homography
    return False


def remove_empty_folders(dir_path):
    import shutil

    def is_folder_empty(folder_path):
        return len(os.listdir(folder_path)) == 0

    num_folder = 0
    for root, dirs, files in os.walk(dir_path, topdown=False):
        for dir_name in dirs:
            folder = os.path.join(root, dir_name)
            if not os.path.isdir(folder):
                continue
            # Skip hidden/system folders such as $RECYCLE.BIN and System Volume Information
            if '$' in dir_name or '$' in folder:
                print(f"Skipping system folder: {folder}")
                continue
            if 'system volume information' in folder.lower():
                print(f"Skipping system folder: {folder}")
                continue
            if is_folder_empty(folder) or dir_name.lower() == '__pycache__':
                shutil.rmtree(folder)
                print(f"Removed empty folder: {folder}")
                num_folder += 1
    return num_folder


def download_file(url, filePath):
    try:
        response = requests.get(url, stream=True, headers=headers)
        response.raise_for_status()

        # Make sure the destination directory exists
        directory = os.path.dirname(filePath)
        if directory and not os.path.exists(directory):
            os.makedirs(directory)

        # Stream the response body to disk in 8 KB chunks
        with open(filePath, "wb") as out_file:
            for chunk in response.iter_content(chunk_size=8192):
                out_file.write(chunk)
        print(f"Downloaded {filePath}")
    except Exception as e:
        print(f"Failed to download {url}. Error: {e}")


def determine_post_type(filepath):
    """Classify media as 'stories' or 'posts' based on its aspect ratio."""
    width, height = get_media_dimensions(filepath)
    if 0 in (width, height):
        return False
    aspect_ratio = width / height
    # Roughly 9:16 portrait media is treated as story material
    if 0.5 < aspect_ratio < 0.6:
        return 'stories'
    return 'posts'


def get_media_type(filename):
    """Return 'image', 'video', or None based on the file extension."""
    image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif",
                        ".svg", ".eps", ".raw", ".cr2", ".nef", ".orf", ".sr2", ".heic",
                        ".indd", ".ai", ".psd"}
    video_extensions = {".mp4", ".mov"}
    filetype_dict = {"image": image_extensions, "video": video_extensions}
    extension = os.path.splitext(filename.lower())[1]  # Extension, lower-cased
    for filetype, extensions in filetype_dict.items():
        if extension in extensions:
            return filetype
    return None


def get_video_duration(file_path):
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return 0
    video_types = {".mp4", ".mov", ".mkv"}
    extension = os.path.splitext(file_path.lower())[1]
    if extension not in video_types:
        print(f"File is not a video: {file_path}")
        return 0
    try:
        with VideoFileClip(file_path) as video:
            duration = video.duration
            # Treat zero-length metadata as a one-second clip
            if duration == 0:
                duration = 1
            return duration
    except Exception as e:
        print(f"Error getting duration for {file_path}: {e}")
        return 0


def get_video_dimensions(video_path):
    cap = cv2.VideoCapture(video_path)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap.release()
    return width, height


def get_media_dimensions(media_path):
    """Return (width, height) for an image or video file."""
    if get_media_type(media_path) == 'video':
        return get_video_dimensions(media_path)
    with Image.open(media_path) as img:
        return img.size


def get_video_data(video_path):
    """Return duration, width and height of a video, defaulting to zeros on error."""
    data = {'duration': 0, 'width': 0, 'height': 0}
    try:
        with VideoFileClip(video_path) as video:
            data['duration'] = video.duration
            data['width'] = video.size[0]
            data['height'] = video.size[1]
    except Exception as e:
        print(f"Error getting video data for {video_path}: {e}")
    return data


def calculate_file_hash(file_path, hash_func='sha256'):
    """Hash a file in 8 KB chunks and return the hex digest."""
    h = hashlib.new(hash_func)
    with open(file_path, 'rb') as file:
        chunk = file.read(8192)
        while chunk:
            h.update(chunk)
            chunk = file.read(8192)
    return h.hexdigest()