"""Upload locally downloaded Snapchat media to object storage and the DB.

Files are expected to be named ``username~timestamp~snap_id.ext``. When run
as a script, every such file found under ``directory`` is uploaded, recorded
in the ``media`` table and then deleted locally.
"""
from datetime import datetime
import os
import config
import funcs
import cv2
from uuid import uuid4

# Folder scanned for media files when the module is run as a script.
directory = 'snapchat'


def _parse_post_date(timestamp):
    """Convert the timestamp part of a filename into a ``datetime``.

    Anything after the first ``-`` is dropped before parsing. An empty
    timestamp yields the current time. Raises ``ValueError``,
    ``OverflowError`` or ``OSError`` if the value is not a usable epoch.
    """
    if '-' in timestamp:
        timestamp = timestamp.split('-')[0]
    if not timestamp:
        return datetime.now()
    return datetime.fromtimestamp(int(timestamp))


def UploadMedia(media):
    """Upload one media file and insert its row into the ``media`` table.

    Relies on the module-level globals ``existing_files``, ``obj_storage``,
    ``newCursor`` and ``newDB`` that the ``__main__`` section sets up.

    Args:
        media: dict with the keys ``username``, ``timestamp``, ``filepath``
            and ``media_id`` (as produced by ``get_media_data``).

    Returns:
        True when the file was uploaded or recognised as a duplicate,
        False when it was skipped (bad timestamp or thumbnail failure).
    """
    username = media['username']
    timestamp = media['timestamp']
    filepath = media['filepath']
    filename = os.path.basename(filepath)
    media_id = media['media_id']

    thumbnail_url = None
    phash = None

    if filename in existing_files:
        print('Duplicate file detected. Removing...')
        os.remove(filepath)
        return True

    # NOTE(review): the message says "Removing" but this branch leaves the
    # file on disk. Kept as-is because deleting would be destructive;
    # confirm which behaviour is intended.
    if media_id in existing_files:
        print('Duplicate file detected. Removing...')
        return True

    media_type = funcs.get_media_type(filename)
    file_hash = funcs.calculate_file_hash(filepath)

    try:
        post_date = _parse_post_date(timestamp)
    except (ValueError, OverflowError, OSError) as exc:
        # One badly named file must not abort the whole batch.
        print(f'Invalid timestamp {timestamp!r} in {filename}. Skipping... ({exc})')
        return False

    width, height = funcs.get_media_dimensions(filepath)
    duration = funcs.get_video_duration(filepath)

    if media_type == 'image':
        phash = funcs.generate_phash(filepath)
    elif media_type == 'video':
        thumb_path = None
        try:
            thumb_path = generate_thumbnail(filepath)
            # this might be a problem in case of duplicate hashes
            obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg')
            thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{file_hash}.jpg"
            phash = funcs.generate_phash(thumb_path)
        except Exception as exc:
            # Exception (not a bare except) so Ctrl-C / SystemExit still work.
            print('Error generating thumbnail. Skipping...')
            print(f'Reason: {exc!r}')
            return False
        finally:
            # Always clean up the temporary thumbnail, even on failure.
            if thumb_path and os.path.exists(thumb_path):
                os.remove(thumb_path)

    server_path = f'media/snaps/{username}/{filename}'
    file_url = f"https://storysave.b-cdn.net/{server_path}"

    # NOTE: the storage upload is the slow step of this function.
    obj_storage.PutFile(filepath, server_path)

    query = (
        "INSERT IGNORE INTO media (username, media_type, media_url, width, height, "
        "post_type, date, hash, filename, duration, thumbnail, phash, platform) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )
    values = (
        username, media_type, file_url, width, height, 'story', post_date,
        file_hash, filename, duration, thumbnail_url, phash, 'snapchat',
    )

    newCursor.execute(query, values)
    newDB.commit()
    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')

    # The local copy is only removed once upload and DB insert succeeded.
    os.remove(filepath)

    return True


def generate_thumbnail(filepath):
    """Write the first frame of a video to ``temp/<uuid>.jpg``.

    Args:
        filepath: path of the video file to read.

    Returns:
        Path of the JPEG that was written.

    Raises:
        RuntimeError: if no frame could be read or the image could not be
            written.
    """
    thumb_path = f'temp/{uuid4()}.jpg'
    # cv2.imwrite does not create missing directories; it just returns False.
    os.makedirs(os.path.dirname(thumb_path), exist_ok=True)

    cap = cv2.VideoCapture(filepath)
    try:
        ret, frame = cap.read()
    finally:
        # Release the capture handle even if reading raised.
        cap.release()

    if not ret:
        raise RuntimeError(f'Could not read a frame from {filepath}')
    if not cv2.imwrite(thumb_path, frame):
        raise RuntimeError(f'Could not write thumbnail to {thumb_path}')
    return thumb_path


def get_media_data(filepath):
    """Parse ``username~timestamp~snap_id.ext`` out of a file path.

    Args:
        filepath: path whose basename is split on ``~``.

    Returns:
        A dict with ``username``, ``timestamp``, ``filepath`` and
        ``media_id`` (the third part without its extension), or False when
        the basename has fewer than three ``~``-separated parts.
    """
    filename = os.path.basename(filepath)
    parts = filename.split('~')
    if len(parts) < 3:
        return False

    username = parts[0]
    timestamp = parts[1]
    snap_id = os.path.splitext(parts[2])[0]

    return {
        'username': username,
        'timestamp': timestamp,
        'filepath': filepath,
        'media_id': snap_id,
    }


def get_media(folder_path):
    """Walk ``folder_path`` recursively and collect parseable media files.

    Returns:
        A list of dicts from ``get_media_data``; files whose names do not
        match the expected pattern are left out.
    """
    medias = []
    for root, _dirs, files in os.walk(folder_path):
        for filename in files:
            data = get_media_data(os.path.join(root, filename))
            if data:
                medias.append(data)
    return medias


def dump(folder_path):
    """Upload every media file found under ``folder_path``."""
    for media in get_media(folder_path):
        UploadMedia(media)


def process_snap_ids(filenames):
    """Extract the unique snap ids from ``username~timestamp~snap_id.ext`` names.

    Names with fewer than three ``~``-separated parts are skipped instead
    of raising ``IndexError``.

    Args:
        filenames: iterable of filename strings.

    Returns:
        A list of distinct snap ids in order of first appearance.
    """
    # A dict keeps insertion order and gives O(1) de-duplication.
    snap_ids = {}
    for filename in filenames:
        parts = filename.split('~')
        if len(parts) < 3:
            continue
        snap_ids[os.path.splitext(parts[2])[0]] = None
    return list(snap_ids)


if __name__ == '__main__':
    print('Starting processing...')

    # A missing directory is treated the same as an empty one.
    if not os.path.isdir(directory) or not os.listdir(directory):
        print('No files to process. Exiting...')
        raise SystemExit(0)

    newDB, newCursor = config.gen_connection()

    obj_storage = config.get_storage()

    newCursor.execute("SELECT filename FROM media WHERE filename IS NOT NULL AND platform = 'snapchat'")
    stored_filenames = [row[0] for row in newCursor.fetchall()]
    # Despite the name, this holds snap ids; a set keeps the membership
    # checks in UploadMedia O(1).
    existing_files = set(process_snap_ids(stored_filenames))

    dump(directory)

    print("Processing completed.")