diff --git a/config.py b/config.py
index 9043c1a..1385f7d 100644
--- a/config.py
+++ b/config.py
@@ -10,7 +10,7 @@ def gen_connection():
     print("Connecting to database")
     newDB = mysql.connector.connect(host=host, user=username, password=password, database=database, port=port)
     print("Connected to database")
-    return newDB, newDB.cursor()
+    return newDB, newDB.cursor(dictionary=True)
 
 def get_storage():
     from BunnyCDN.Storage import Storage
diff --git a/storysave_scanner.py b/scanner.py
similarity index 82%
rename from storysave_scanner.py
rename to scanner.py
index 2a4d535..c66a5e3 100644
--- a/storysave_scanner.py
+++ b/scanner.py
@@ -1,7 +1,6 @@
 from watchdog.events import FileSystemEventHandler
 from watchdog.observers import Observer
 import shutil
-import time
 import os
 
 from funcs import get_media_dimensions
@@ -13,20 +12,6 @@
 os.makedirs(stories_dir, exist_ok=True)
 os.makedirs(posts_dir, exist_ok=True)
 
-def wait_for_complete(file_path, timeout=10):
-    prev_size = -1
-    for _ in range(timeout * 2):  # check every 0.5 sec
-        try:
-            size = os.path.getsize(file_path)
-        except FileNotFoundError:
-            return False
-        if size == prev_size:
-            return True
-        prev_size = size
-        time.sleep(0.5)
-    return False
-
-
 def is_story(width, height, tolerance=0.02):
     if width == 0 or height == 0:
         return False
@@ -50,16 +35,13 @@ class DownloadHandler(FileSystemEventHandler):
 
     def process_file(self, file_path):
         file = os.path.basename(file_path)
+        # Ignore incomplete or weird temp names
        if "crdownload" in file or file.count("~") != 3:
             return
 
         if not os.path.exists(file_path):
             return
 
-        if not wait_for_complete(file_path):
-            print(f"File {file_path} did not stabilize. Skipping.")
-            return
-
         post_type = determine_post_type(file_path)
         if post_type == "posts":
             dest_dir = posts_dir
@@ -91,6 +73,13 @@ class DownloadHandler(FileSystemEventHandler):
 if __name__ == "__main__":
     download_path = os.path.join(os.path.expanduser("~"), "Downloads")
     event_handler = DownloadHandler()
+
+    # Initial scan for files already in Downloads
+    for f in os.listdir(download_path):
+        full_path = os.path.join(download_path, f)
+        if os.path.isfile(full_path):
+            event_handler.process_file(full_path)
+
     observer = Observer()
     observer.schedule(event_handler, download_path, recursive=False)
     observer.start()
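Note on the routing above: determine_post_type decides between the stories and posts folders, and the is_story helper it leans on screens by aspect ratio. The hunk only shows the zero-dimension guard, so the following is a sketch under the assumption that stories are matched against a ~9:16 portrait ratio; the real comparison may differ.

    # Sketch only -- assumes is_story() compares against a 9:16 portrait ratio.
    # The actual is_story() body is not visible in the hunk above.
    def is_story_sketch(width, height, tolerance=0.02):
        if width == 0 or height == 0:
            return False
        return abs((width / height) - (9 / 16)) <= tolerance

    print(is_story_sketch(1080, 1920))  # True: canonical story dimensions
    print(is_story_sketch(1080, 1350))  # False: 4:5 feed post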
diff --git a/storysave_dump.py b/storysave_dump.py
index b62f109..d512758 100644
--- a/storysave_dump.py
+++ b/storysave_dump.py
@@ -12,20 +12,20 @@ directory = 'media'
 os.makedirs(temp_directory, exist_ok=True)
 
 media_types = {
-    'stories' : 'story',
-    'posts' : 'post',
-    'profile' : 'profile'
+    'stories': 'story',
+    'posts': 'post',
+    'profile': 'profile'
 }
 
 for media_type, _ in media_types.items():
     os.makedirs(os.path.join(directory, media_type), exist_ok=True)
 
-existing_media_ids = {}
-
+existing_media_ids = set()
 UPLOAD_CUSTOM = False
 
 CACHE_FILE = os.path.join(temp_directory, 'existing_media_ids.json')
 CACHE_TTL = timedelta(hours=48)
 
+
 def UploadMedia(media):
     username = media['username']
     user_id = media['user_id']
@@ -37,12 +37,12 @@ def UploadMedia(media):
     post_type = media['post_type']
     thumbnail_url = None
     phash = None
-
+
     if media_id and media_id in existing_media_ids:
         print('Duplicate file detected. Removing...')
         os.remove(filepath)
         return True
-
+
     file_size = os.path.getsize(filepath)
     filename = os.path.basename(filepath)
     file_extension = os.path.splitext(filename)[1].lower()
@@ -56,14 +56,16 @@ def UploadMedia(media):
         print(f'Error determining media type for {filename}. Skipping...')
         return False
 
-    try:post_date = datetime.fromtimestamp(int(timestamp))
-    except:post_date = datetime.fromtimestamp(os.path.getctime(filepath))
+    try:
+        post_date = datetime.fromtimestamp(int(timestamp))
+    except (ValueError, TypeError, OSError):
+        post_date = datetime.fromtimestamp(os.path.getctime(filepath))
 
     width, height = funcs.get_media_dimensions(filepath)
     if 0 in (width, height):
         print(f'Error getting dimensions for {filename}. Skipping...')
         return False
-
+
     duration = funcs.get_video_duration(filepath)
 
     if media_type == 'image':
@@ -71,7 +73,7 @@ def UploadMedia(media):
     elif media_type == 'video':
         try:
             thumb_path = generate_thumbnail(filepath)
-            obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg') # this might be a problem in case of duplicate hashes
+            obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg')
             thumbnail_url = f"https://cdn.altpins.com/thumbnails/{file_hash}.jpg"
             phash = funcs.generate_phash(thumb_path)
             os.remove(thumb_path)
@@ -81,18 +83,17 @@ def UploadMedia(media):
 
     custom_filename = media_id if media_id else file_hash
     newFilename = f'{custom_filename}{file_extension}'
-
     server_path = f'media/{post_type}/{username}/{newFilename}'
     file_url = f"https://cdn.altpins.com/{server_path}"
-
+
     obj_storage.PutFile(filepath, server_path)
 
     if highlight_id:
-        newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)", (highlight_id, user_id, media_id))
+        newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)",
+                          (highlight_id, user_id, media_id))
         newDB.commit()
         print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')
 
-
     query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform, file_size) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
     values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, platform, file_size)
@@ -104,14 +105,13 @@ def UploadMedia(media):
     print(f'File: {filename}')
     print(f'URL: {file_url}')
     print(f'Pin URL: https://altpins.com/pin/{newCursor.lastrowid}')
-    print("="*100)
+    print("=" * 100)
 
     os.remove(filepath)
-
-    existing_media_ids.add(media_id)
     return newCursor.lastrowid
 
+
 def generate_thumbnail(filepath):
     thumb_path = os.path.join(temp_directory, f'{uuid4()}.jpg')
     cap = cv2.VideoCapture(filepath)
@@ -120,16 +120,16 @@ def generate_thumbnail(filepath):
     cap.release()
     return thumb_path
 
+
 def get_user_id(username):
     username = username.lower()
     if username in existing_users:
         return existing_users[username]
-    return None
+
 
 def get_media_data(filepath):
     filename = os.path.basename(filepath)
-
     parts = filename.split('~')
     if len(parts) != 4:
         return False
@@ -141,7 +141,7 @@ def get_media_data(filepath):
     platform = 'instagram'
 
     highlight_id = user_id.replace('highlight', '') if 'highlight' in user_id else None
-
+
     if user_id.isdigit():
         user_id = int(user_id)
     else:
@@ -152,17 +152,17 @@ def get_media_data(filepath):
     else:
         media_id = None
 
-    data = {'username': username, 'timestamp': timestamp, 'media_id': media_id, 'user_id': user_id, 'filepath': filepath, 'highlight_id': highlight_id, 'platform': platform}
-
+    data = {'username': username, 'timestamp': timestamp, 'media_id': media_id, 'user_id': user_id,
+            'filepath': filepath, 'highlight_id': highlight_id, 'platform': platform}
     return data
 
+
 def get_media():
     medias = []
     failed_medias = []
 
     for media_type, post_type in media_types.items():
         media_folder_path = os.path.join(directory, media_type)
-
         if not os.path.exists(media_folder_path):
             continue
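Note: get_media_data above expects exactly four '~'-separated fields per filename, the same invariant the scanner enforces with file.count("~") != 3. The field order is only partly visible in the hunks, so this parser is a sketch; the assumed layout username~timestamp~media_id~user_id.ext is an illustration, not confirmed by the diff.

    import os

    # Sketch only -- the field order is an assumption; only the 4-field
    # split is confirmed by the code above.
    def parse_storysave_name(filepath):
        parts = os.path.basename(filepath).split('~')
        if len(parts) != 4:
            return None
        username, timestamp, media_id, last = parts
        user_id = os.path.splitext(last)[0]  # strip the file extension
        return username, timestamp, media_id, user_id

    print(parse_storysave_name('alice~1700000000~123456789~42.jpg'))
    # ('alice', '1700000000', '123456789', '42')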
@@ -172,26 +172,23 @@ def get_media():
             if not data:
                 failed_medias.append(filepath)
                 continue
-
             data['post_type'] = post_type
             medias.append(data)
-
+
     return medias, failed_medias
 
+
 def get_custom_media(failed_medias):
     medias = []
-
     for media_type, post_type in media_types.items():
         folder_path = os.path.join(directory, media_type)
-
         user_dirs = [d for d in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, d))]
+
         for username in user_dirs:
             user_folder_path = os.path.join(folder_path, username)
-
             for filename in os.listdir(user_folder_path):
                 if filename.startswith('.'):
                     continue
-
                 filepath = os.path.join(user_folder_path, filename)
                 if not filepath in failed_medias:
                     continue
@@ -199,7 +196,7 @@ def get_custom_media(failed_medias):
                 user_id = get_user_id(username)
                 timestamp = int(os.path.getctime(filepath))
                 media_id = os.path.splitext(filename)[0]
-
+
                 if media_id.isdigit():
                     media_id = int(media_id)
                     if media_id < 10000000:
@@ -217,40 +214,35 @@ def get_custom_media(failed_medias):
                     "highlight_id": None,
                     "post_type": post_type
                 }
-
                 medias.append(data)
-
     return medias
 
+
 def save_highlight_data(highlights):
     filename = f'{uuid4()}.json'
     filepath = os.path.join('highlight_data', filename)
     with open(filepath, 'w') as f:
         json.dump(highlights, f)
 
+
 def dump_instagram():
     medias, failed_medias = get_media()
     medias = clean_dupes(medias)
     failed_medias = get_custom_media(failed_medias)
-
+
     medias.sort(key=lambda x: (x['username'].lower(), x['timestamp']))
 
-    # Update new user ids and existing user ids
     new_user_ids = {}
     for media in medias:
         user_id = media['user_id']
         username = media['username']
-
         if not media['user_id']:
             continue
-
         if username in existing_users:
             continue
-
         existing_users[username] = user_id
         new_user_ids[username] = user_id
 
-    # Assign user ids
     for media in medias:
         if media['user_id']:
             continue
@@ -262,13 +254,12 @@ def dump_instagram():
         if not media['highlight_id']:
             continue
         highlights.append({
-            "media_id": media["media_id"],
-            "user_id": media["user_id"],
-            "highlight_id": media['highlight_id'],
-            "username": media['username'],
-        })
+                "media_id": media["media_id"],
+                "user_id": media["user_id"],
+                "highlight_id": media['highlight_id'],
+                "username": media['username'],
+            })
 
-    # save highlights data into folder highlight_Data
     if highlights:
         save_highlight_data(highlights)
@@ -280,85 +271,97 @@ def dump_instagram():
     for media in failed_medias:
         pinid = UploadMedia(media)
 
+
 def clean_dupes(medias):
     removed_count = 0
     new_medias = []
     for media in medias:
         media_id = media['media_id']
         filepath = media['filepath']
-
         if not media_id:
             print(f'Invalid media_id for file {filepath}. Skipping...')
             continue
-
         if media_id in existing_media_ids:
             removed_count += 1
             print(f'Found duplicate file {filepath}. Removing...')
             os.remove(filepath)
             continue
-
         if re.search(r'\(\d+\)', filepath):
             removed_count += 1
             print(f'Found duplicate file {filepath}. Removing...')
             os.remove(filepath)
             continue
-
         new_medias.append(media)
-
     print(f'Removed {removed_count} duplicate files.')
     return new_medias
 
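Note: the re.search(r'\(\d+\)', filepath) check in clean_dupes above flags the " (1)", " (2)" suffixes that browsers append when the same file is downloaded twice. A quick illustration of what the pattern matches:

    import re

    # Browser-style duplicate names contain a parenthesized counter.
    for name in ['photo.jpg', 'photo (1).jpg', 'clip (12).mp4']:
        print(name, bool(re.search(r'\(\d+\)', name)))
    # photo.jpg False / photo (1).jpg True / clip (12).mp4 True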
+
+# -------------------- CACHE SYSTEM --------------------
+
 def get_cached_data():
     if not os.path.exists(CACHE_FILE):
         print('No cache file found. Generating new cache…')
-        return None, None
-
+        return None, None, None
     try:
         with open(CACHE_FILE, 'r') as f:
-            cache_data = json.load(f)
-
-        timestamp = datetime.fromisoformat(cache_data.get('timestamp', ''))
-        if datetime.now() - timestamp < CACHE_TTL:
-            print('Using cached data…')
-            return set(tuple(x) for x in cache_data.get('existing_media_ids', [])), cache_data.get('existing_users', {})
+            cache = json.load(f)
+        media_ids = set(cache.get('media_ids', []))
+        users = {k.lower(): v for k, v in cache.get('existing_users', {}).items()}
+        last_id = cache.get('last_id', 0)
+        return media_ids, users, last_id
     except Exception as e:
         print(f"Cache read error: {e}")
+        return None, None, None
 
-    return None, None
 
-def save_cached_data(existing_media_ids, existing_users):
+def save_cached_data(media_ids, existing_users, last_id):
     with open(CACHE_FILE, 'w') as f:
-        json.dump({'timestamp': datetime.now().isoformat(), 'existing_media_ids': list(existing_media_ids), 'existing_users': existing_users}, f)
+        json.dump({
+            'timestamp': datetime.now().isoformat(),
+            'media_ids': list(media_ids),
+            'existing_users': existing_users,
+            'last_id': last_id
+        }, f)
+
+
+def get_user_ids(cur):
+    cur.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL AND platform='instagram'")
+    rows = cur.fetchall()
+    return {user['username'].lower(): user['user_id'] for user in rows}
+
 
-def get_existing_medias(newCursor):
-    existing_media_ids, existing_users = get_cached_data()
+def get_existing_media_ids(cur):
+    cur.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform='instagram' AND status='public'")
+    rows = cur.fetchall()
+    media_ids = {row['media_id'] for row in rows}
+    last_id = max((row['id'] for row in rows), default=0)
+    return media_ids, last_id
 
-    if existing_media_ids and existing_users:
-        newest_id = max(existing_media_ids, key=lambda x: x[0])[0]
-        existing_media_ids = {image[1] for image in existing_media_ids}
 
+def get_existing_medias(cur):
+    media_ids, users, last_id = get_cached_data()
 
-        newCursor.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform = 'instagram' AND status = 'public' AND id > %s ORDER BY id DESC", (newest_id,))
-        new_media_ids = {image[1] for image in newCursor.fetchall()}
+    if not media_ids or not users:
+        print('Cold cache → pulling full data...')
+        media_ids, last_id = get_existing_media_ids(cur)
+        users = get_user_ids(cur)
+        save_cached_data(media_ids, users, last_id)
+        return media_ids, users
 
-        for media_id in new_media_ids:
-            existing_media_ids.add(media_id)
-
-        return existing_media_ids, existing_users
-
-    print('Getting existing files and users...')
-    newCursor.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform = 'instagram' AND status = 'public';")
-    existing_media_ids = {image for image in newCursor.fetchall()}
+    cur.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform='instagram' AND status='public' AND id > %s ORDER BY id ASC", (last_id,))
+    rows = cur.fetchall()
 
-    print('Getting existing users...')
-    newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL AND platform = 'instagram'")
-    existing_users = {user[0].lower(): user[1].lower() for user in newCursor.fetchall()}
+    for r in rows:
+        media_ids.add(r['media_id'])
+        last_id = max(last_id, r['id'])
 
-    cache_file = os.path.join(temp_directory, 'existing_media_ids.json')
-    with open(cache_file, 'w') as f:
-        json.dump({'timestamp': datetime.now().isoformat(), 'existing_media_ids': list(existing_media_ids), 'existing_users': existing_users}, f)
-
-    return existing_media_ids, existing_users
+    if rows:
+        save_cached_data(media_ids, users, last_id)
+
+    return media_ids, users
+
+
+# -------------------- MAIN --------------------
 
 if __name__ == '__main__':
     print('Starting processing...')
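Note on the cache rewrite above: keeping last_id in the JSON cache turns a warm start into a delta query (id > last_id) instead of a full re-read of the media table. A rough trace of the warm path, using made-up rows in place of the cursor results:

    # Made-up rows standing in for cur.fetchall(); shows last_id advancing.
    media_ids, last_id = {'m1', 'm2'}, 1000            # loaded from the cache
    rows = [{'id': 1001, 'media_id': 'm3'},            # "... AND id > 1000"
            {'id': 1002, 'media_id': 'm4'}]

    for r in rows:
        media_ids.add(r['media_id'])
        last_id = max(last_id, r['id'])

    print(sorted(media_ids), last_id)   # ['m1', 'm2', 'm3', 'm4'] 1002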
@@ -366,16 +369,11 @@
     if not funcs.get_files(directory):
         print('No files to process. Exiting...')
         exit()
-
-    newDB, newCursor = config.gen_connection()
+    newDB, newCursor = config.gen_connection()
     obj_storage = config.get_storage()
 
     existing_media_ids, existing_users = get_existing_medias(newCursor)
-
     dump_instagram()
-
-    print("Processing completed.")
-
-    # for mediatype, _ in media_types.items():
-    #     funcs.clean_empty_folders(os.path.join(directory, mediatype))
\ No newline at end of file
+    print("Processing completed.")
\ No newline at end of file
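Note: the cursor(dictionary=True) change in config.py is what the dump script's rewrite depends on. With mysql.connector, a dictionary cursor returns each row as a dict keyed by column name, which is why the new queries read row['media_id'] and user['username'] where the old code used user[0]. A minimal illustration, assuming conn is an already-open mysql.connector connection:

    # 'conn' is assumed to be an existing mysql.connector connection.
    cur = conn.cursor(dictionary=True)
    cur.execute("SELECT id, media_id FROM media LIMIT 1")
    row = cur.fetchone()
    print(row['id'], row['media_id'])   # dict access; row[0] would be a KeyError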