diff --git a/.gitignore b/.gitignore
index 9c5186c..7d8b184 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,18 @@ facebook/
 media/
 cache/
 temp/
-*.pyc
\ No newline at end of file
+*.pyc
+/old_altpins_cache
+/__pycache__
+/STORAGE_IMPORTED
+/STREAM_VIDEOS
+/STREAM_VIDEOS_IMPORTED
+/STORAGE
+/other
+/Sort
+/images
+/sortlater
+/videos
+/duplicates
+/ready_to_upload
+/archive
diff --git a/config.py b/config.py
index 4c51f7c..aa6981a 100644
--- a/config.py
+++ b/config.py
@@ -1,20 +1,8 @@
 import mysql.connector
-
-altpins_username = "xantorn"
-altpins_password = "AVNS_lGiLOVTTyGMtoOoRn5Q"
-altpins_host = "archivebate-db-do-user-13308724-0.b.db.ondigitalocean.com"
-altpins_port = 25060
-altpins_database = "altpins"
-altpins_sslmode = "REQUIRED"
-
-def altpins_gen_connection():
-    print("Connecting to database")
-    newDB = mysql.connector.connect(host=altpins_host, user=altpins_username, password=altpins_password, database=altpins_database, port=altpins_port)
-    print("Connected to database")
-    return newDB, newDB.cursor()
+from BunnyCDN.Storage import Storage
 
 username = "doadmin"
-password = "AVNS_KNXK1IjScgTCe09gI9F"
+password = "AVNS_2qeFJuiGRpBQXkJjlA6"
 host = "storysave-do-user-13308724-0.c.db.ondigitalocean.com"
 port = 25060
 database = "storysave"
@@ -25,3 +13,6 @@ def gen_connection():
     newDB = mysql.connector.connect(host=host, user=username, password=password, database=database, port=port)
     print("Connected to database")
     return newDB, newDB.cursor()
+
+def get_storage():
+    return Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
\ No newline at end of file
diff --git a/emptyfolders.py b/emptyfolders.py
new file mode 100644
index 0000000..4b6a80d
--- /dev/null
+++ b/emptyfolders.py
@@ -0,0 +1,14 @@
+import os
+
+def remove_empty_folders(folder):
+    for root, dirs, files in os.walk(folder, topdown=False):
+        for name in dirs:
+            dirpath = os.path.join(root, name)
+            if not os.listdir(dirpath):
+                print(f"Removing empty folder {dirpath}")
+                os.rmdir(dirpath)
+
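+# Walking bottom-up (topdown=False) visits leaf folders first, so a chain of
+# nested empty folders collapses in a single pass.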
+folder = 'media'
+remove_empty_folders(folder)
\ No newline at end of file
diff --git a/find_duplicate_videos_improved.py b/find_duplicate_videos_improved.py
new file mode 100644
index 0000000..4ef7a37
--- /dev/null
+++ b/find_duplicate_videos_improved.py
@@ -0,0 +1,84 @@
+import os
+import config
+import cv2
+import imagehash
+from PIL import Image
+
+def generate_thumbnail_phash(filepath, hash_size=8):
+    cap = cv2.VideoCapture(filepath)
+    ret, frame = cap.read()
+    cap.release()
+
+    if not ret:
+        print(f"Error reading frame from {filepath}")
+        return None
+
+    # Resize frame to a standard size
+    standard_size = (320, 240)
+    resized_frame = cv2.resize(frame, standard_size, interpolation=cv2.INTER_AREA)
+
+    # Convert OpenCV image (BGR) to PIL Image (RGB)
+    image_rgb = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
+    pil_image = Image.fromarray(image_rgb)
+
+    # Compute pHash
+    phash = imagehash.phash(pil_image, hash_size=hash_size)
+
+    return phash
+
+def are_phashes_duplicates(phash1, phash2, threshold=5):
+    # Compute Hamming distance between the pHashes
+    try:
+        distance = phash1 - phash2
+    except TypeError as e:
+        print(f"Error comparing pHashes: {e}")
+        return False
+
+    return distance <= threshold
+
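+# With hash_size=8 a pHash is 64 bits, so threshold=5 treats two frames as
+# duplicates when at most 5 of the 64 bits differ (~92% agreement).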
+def get_media_by_phash(phash, username, existing_medias, threshold=5):
+    for media in existing_medias:
+        existing_phash_str = media[1]
+        existing_username = media[2]
+        if existing_username != username:
+            continue
+
+        # Convert stored phash string to ImageHash object
+        existing_phash = imagehash.hex_to_hash(existing_phash_str)
+
+        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
+            return media
+    return None
+
+# Database connection
+db, cursor = config.gen_connection()
+
+# Fetch existing videos with pHashes
+cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['video'])
+existing_medias = cursor.fetchall()
+
+users = os.listdir('videos')
+for username in users:
+    user_videos_path = os.path.join('videos', username)
+    if not os.path.isdir(user_videos_path):
+        continue
+
+    videos = os.listdir(user_videos_path)
+    for video in videos:
+        print(f'Processing {video}...')
+        filepath = os.path.join(user_videos_path, video)
+
+        phash = generate_thumbnail_phash(filepath, hash_size=8)
+        if phash is None:
+            continue
+
+        duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
+        if duplicate_media:
+            print(f'Duplicate url found: https://altpins.com/pin/{duplicate_media[0]}')
+            print(f'Duplicate video path: {filepath}')
+            newpath = filepath.replace('videos', 'duplicates')
+            os.makedirs(os.path.dirname(newpath), exist_ok=True)
+            os.rename(filepath, newpath)
+            print(f'Moved {video} to duplicates/')
\ No newline at end of file
diff --git a/find_duplicates_by_phash.py b/find_duplicates_by_phash.py
new file mode 100644
index 0000000..a0cd501
--- /dev/null
+++ b/find_duplicates_by_phash.py
@@ -0,0 +1,79 @@
+import os
+import config
+import imagehash
+from PIL import Image
+from funcs import get_files
+
+def generate_image_phash(filepath, hash_size=8):
+    try:
+        # Open the image using PIL
+        pil_image = Image.open(filepath)
+
+        # Compute pHash using the imagehash library
+        phash = imagehash.phash(pil_image, hash_size=hash_size)
+        return phash
+    except Exception as e:
+        print(f"Error processing image {filepath}: {e}")
+        return None
+
+def are_phashes_duplicates(phash1, phash2, threshold=5):
+    try:
+        # Compute the Hamming distance between the pHashes
+        distance = phash1 - phash2
+        return distance <= threshold
+    except TypeError as e:
+        print(f"Error comparing pHashes: {e}")
+        return False
+
+def get_media_by_phash(phash, username, existing_medias, threshold=6):
+    for media in existing_medias:
+        existing_phash_str = media[1]
+        existing_username = media[2]
+
+        if existing_username != username:
+            continue # Only compare with the same user's media
+
+        # Convert stored pHash string to ImageHash object
+        existing_phash = imagehash.hex_to_hash(existing_phash_str)
+
+        # Check if the current pHash is a duplicate
+        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
+            return media
+    return None
+
+# Database connection
+db, cursor = config.gen_connection()
+
+# Fetch existing media with pHashes (assuming media are images, adjust media_type if needed)
+cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['image'])
+existing_medias = cursor.fetchall()
+
+# Go through the 'sorted' folder where each subfolder is a username
+users = os.listdir('sorted')
+
+for username in users:
+    user_images_path = os.path.join('sorted', username)
+    if not os.path.isdir(user_images_path):
+        continue # Skip non-directory files
+
+    # Get all images for the current user
+    images = get_files(user_images_path)
+
+    for filepath in images:
+        image_filename = os.path.basename(filepath)
+        print(f'Processing {image_filename}...')
+
+        # Generate pHash for the image
+        phash = generate_image_phash(filepath, hash_size=8)
+        if phash is None:
+            continue # Skip this image if there's an issue
+
+        # Check if the image is a duplicate of any in the database
+        duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
+        if duplicate_media:
+            print(f'Duplicate found: https://altpins.com/pin/{duplicate_media[0]}')
+            print(f'Duplicate image path: {filepath}')
+            newpath = filepath.replace('sorted', 'duplicates')
+            os.makedirs(os.path.dirname(newpath), exist_ok=True)
+            os.rename(filepath, newpath)
+            print(f'Moved {image_filename} to duplicates/')
\ No newline at end of file
diff --git a/find_static_videos.py b/find_static_videos.py
new file mode 100644
index 0000000..eae0e77
--- /dev/null
+++ b/find_static_videos.py
@@ -0,0 +1,77 @@
+import cv2, os
+import imagehash
+from PIL import Image
+from funcs import get_files
+
+def is_static_video_phash_optimized(video_path, frame_sample_rate=30, hash_size=16, hamming_threshold=1):
+    """
+    Determines if a video is static using perceptual hashing (pHash) by comparing consecutive frames.
+
+    Parameters:
+    - video_path: Path to the video file.
+    - frame_sample_rate: Number of frames to skip between comparisons.
+    - hash_size: Size of the hash; larger values increase sensitivity.
+    - hamming_threshold: Maximum Hamming distance between consecutive frames to consider the video static.
+
+    Returns:
+    - True if the video is static, False otherwise.
+    """
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        print("Error: Cannot open video file.")
+        return False
+
+    ret, frame = cap.read()
+    if not ret:
+        print("Error: Cannot read video frames.")
+        cap.release()
+        return False
+
+    # Convert first frame to PIL Image and compute hash
+    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+    pil_image = Image.fromarray(frame_rgb)
+    previous_hash = imagehash.phash(pil_image, hash_size=hash_size)
+
+    is_static = True
+
+    current_frame_number = 1
+
+    while True:
+        # Skip frames according to the sample rate
+        cap.set(cv2.CAP_PROP_POS_FRAMES, current_frame_number)
+        ret, frame = cap.read()
+        if not ret:
+            break
+
+        # Convert frame to PIL Image and compute hash
+        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        pil_image = Image.fromarray(frame_rgb)
+        current_hash = imagehash.phash(pil_image, hash_size=hash_size)
+
+        # Compute Hamming distance between hashes
+        hamming_distance = previous_hash - current_hash
+
+        if hamming_distance > hamming_threshold:
+            is_static = False
+            break
+
+        # Update the previous hash
+        previous_hash = current_hash
+
+        # Move to the next frame according to the sample rate
+        current_frame_number += frame_sample_rate
+
+    cap.release()
+    return is_static
+
+
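+# cap.set(CAP_PROP_POS_FRAMES, n) forces a seek per sample; for long videos a
+# sequential cap.read() loop hashing every Nth frame is usually faster.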
+directory = 'videos'
+
+files = get_files(directory)
+
+for video_file in files:
+    if video_file.endswith('.mp4'):
+        if is_static_video_phash_optimized(video_file):
+            print("The video is static: " + video_file)
\ No newline at end of file
diff --git a/fix_filepaths.py b/fix_filepaths.py
new file mode 100644
index 0000000..3dffccc
--- /dev/null
+++ b/fix_filepaths.py
@@ -0,0 +1,40 @@
+import os, json
+from PIL import Image
+import imagehash
+
+def find_file(filename, directory):
+    filename = filename.lower().split('.')[0]
+    for root, dirs, files in os.walk(directory):
+        for file in files:
+            if filename in file.lower():
+                return os.path.join(root, file)
+    return None
+
+def generate_phash(image_path):
+    image = Image.open(image_path)
+    return str(imagehash.phash(image))
+
+count = 0
+
+cacheDir = 'sorted'
+dataPath = 'pins.json'
+
+os.makedirs(cacheDir, exist_ok=True)
+
+medias = json.load(open(dataPath))
+
+for item in medias:
+    count += 1
+
+    filepath = item['filepath']
+    if os.path.exists(filepath):
+        continue
+
+    newfilepath = find_file(os.path.basename(filepath), cacheDir)
+    if newfilepath:
+        print(f"Found file {newfilepath} for {filepath}")
+        item['filepath'] = newfilepath
+
+
+with open(dataPath, 'w') as f:
+    json.dump(medias, f)
\ No newline at end of file
diff --git a/funcs.py b/funcs.py
index f1ef80a..fc04370 100644
--- a/funcs.py
+++ b/funcs.py
@@ -1,22 +1,123 @@
 from moviepy.editor import VideoFileClip
-import os, cv2, hashlib
+import os, cv2, hashlib, requests
 from PIL import Image
+import numpy as np
+import imagehash
 
+headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}
+proxies = {"http": "http://yehyuxsl-rotate:4tl5bvrwkz5e@p.webshare.io:80/", "https": "http://yehyuxsl-rotate:4tl5bvrwkz5e@p.webshare.io:80/"}
 
-def get_video_dimensions(video_path):
-    cap = cv2.VideoCapture(video_path)
-    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    cap.release()
-    return width, height
+def generate_phash(image_path):
+    try:
+        image = Image.open(image_path)
+        return str(imagehash.phash(image))
+    except Exception:
+        return False
+
+def cleanEmptyFolders(path):
+    for root, dirs, fs in os.walk(path, topdown=False):
+        for d in dirs:
+            dirpath = os.path.join(root, d)
+            if not os.listdir(dirpath):
+                os.rmdir(dirpath)
+
+def get_files(directory):
+    files = []
+    for root, dirs, filenames in os.walk(directory):
+        for filename in filenames:
+            files.append(os.path.join(root, filename))
+    return files
+
+def compare_images(image_path1, image_path2):
+    # Load the images in grayscale
+    img1 = cv2.imread(image_path1, cv2.IMREAD_GRAYSCALE)
+    img2 = cv2.imread(image_path2, cv2.IMREAD_GRAYSCALE)
+
+    if img1 is None or img2 is None:
+        print("Error loading images!")
+        return False # Or you could raise an exception
+
+    # Initialize SIFT detector
+    sift = cv2.SIFT_create()
+
+    # Find keypoints and descriptors with SIFT
+    kp1, des1 = sift.detectAndCompute(img1, None)
+    kp2, des2 = sift.detectAndCompute(img2, None)
+
+    # Check if descriptors are None
+    if des1 is None or des2 is None:
+        return False
+
+    # FLANN parameters
+    index_params = dict(algorithm=1, trees=5)
+    search_params = dict(checks=50)
+
+    # FLANN based matcher
+    flann = cv2.FlannBasedMatcher(index_params, search_params)
+
+    # Matching descriptor vectors using KNN algorithm
+    matches = flann.knnMatch(des1, des2, k=2)
+
+    # Apply ratio test
+    good = []
+    for m, n in matches:
+        if m.distance < 0.6 * n.distance: # More stringent ratio
+            good.append(m)
+
+    # Minimum number of matches
+    MIN_MATCH_COUNT = 15 # Adjust this threshold as needed
+
+    if len(good) > MIN_MATCH_COUNT:
+        # Extract location of good matches
+        src_pts = np.float32([kp1[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
+        dst_pts = np.float32([kp2[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)
+
+        # Find homography
+        M, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
+        matchesMask = mask.ravel().tolist()
+
+        if np.sum(matchesMask) > 10: # Check if enough points agree on homography
+            return True
+        else:
+            return False
+    else:
+        return False
+
+def download_file(url, filePath):
+    try:
+        response = requests.get(url, stream=True, headers=headers)
+        response.raise_for_status()
+
+        directory = os.path.dirname(filePath)
+
+        if not os.path.exists(directory):
+            os.makedirs(directory)
+
+        with open(filePath, "wb") as out_file:
+            for chunk in response.iter_content(chunk_size=8192):
+                out_file.write(chunk)
+        print(f"Downloaded {filePath}")
+    except Exception as e:
+        print(f"Failed to download {url}. Error: {e}")
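+# The module-level proxies dict above is not used by download_file; pass
+# proxies=proxies to requests.get to route downloads through the rotating proxy.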
 
 def determine_post_type(filepath, mediatype):
     if mediatype == 'image':
-        with Image.open(filepath) as img:
-            width, height = img.size
+        try:
+            with Image.open(filepath) as img:
+                width, height = img.size
+        except Exception:
+            print(f"Error opening image {filepath}")
+            return False
     elif mediatype == 'video':
         width, height = get_video_dimensions(filepath)
-    
+    else:
+        return False
+
+    if 0 in (width, height):
+        return False
+
     aspect_ratio = width / height
     if aspect_ratio > 0.5 and aspect_ratio < 0.6:
         return 'stories'
@@ -43,6 +144,24 @@ def get_video_duration(file_path):
     except Exception as e:
         print(f"Error getting duration for {file_path}: {e}")
         return 0
+
+def get_video_dimensions(video_path):
+    cap = cv2.VideoCapture(video_path)
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    cap.release()
+    return width, height
+
+def get_video_data(video_path):
+    data = {'duration': 0, 'width': 0, 'height': 0}
+    try:
+        with VideoFileClip(video_path) as video:
+            data['duration'] = video.duration
+            data['width'] = video.size[0]
+            data['height'] = video.size[1]
+    except Exception as e:
+        print(f"Error getting video data for {video_path}: {e}")
+    return data
 
 def calculate_file_hash(file_path, hash_func='sha256'):
     h = hashlib.new(hash_func)
@@ -51,4 +170,4 @@
     while chunk:
         h.update(chunk)
         chunk = file.read(8192)
-    return h.hexdigest()
+    return h.hexdigest()
\ No newline at end of file
diff --git a/old/bunny.py b/old/bunny.py
new file mode 100644
index 0000000..4330df6
--- /dev/null
+++ b/old/bunny.py
@@ -0,0 +1,143 @@
+import requests, hashlib, os, json
+
+access_key = "471cd2e1-a943-4c61-ae69ddc6c2c2-c36d-4737"
+video_library_id = 125094
+
+def create_video(title):
+    url = f"https://video.bunnycdn.com/library/{video_library_id}/videos"
+
+    payload = json.dumps({"title": title})
+    headers = {
+        "accept": "application/json",
+        "content-type": "application/*+json",
+        "AccessKey": access_key
+    }
+
+    response = requests.post(url, data=payload, headers=headers)
+
+    return response
+
+def generate_signature(library_id, api_key, expiration_time, video_id):
+    signature = hashlib.sha256((library_id + api_key + str(expiration_time) + video_id).encode()).hexdigest()
+    return signature
+
+def upload_video_process(file_path, video_id):
+    url = f"https://video.bunnycdn.com/library/{video_library_id}/videos/{video_id}"
+
+    headers = {"accept": "application/json", "AccessKey": access_key}
+
+    with open(file_path, "rb") as file:
+        file_data = file.read()
+
+    response = requests.put(url, headers=headers, data=file_data)
+
+    return response.status_code
+
+def upload_video(file_path, title=None):
+    video_item = create_video(title)
+    if video_item.status_code != 200:
+        return False
+
+    video_id = video_item.json()['guid']
+    upload_video_process(file_path, video_id)
+
+    return {
+        "embed_link": f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/playlist.m3u8",
+        "animated_thumbnail": f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/preview.webp",
+        "default_thumbnail": f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/thumbnail.jpg",
+    }
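+# upload_video returns the playback/thumbnail URLs on success and False when
+# the create call fails; the b-cdn.net host is tied to this video library.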
+
+def upload_video_recurbate(videoInfo):
+    title = f"{videoInfo['username']} {videoInfo['platform']}"
+    video_item = create_video(title)
+    if video_item.status_code != 200:
+        return False
+
+    video_id = video_item.json()['guid']
+    upload_video_process(videoInfo['filename'], video_id)
+
+    videoInfo["embed_link"] = f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/playlist.m3u8"
+    videoInfo["animated_thumbnail"] = f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/preview.webp"
+    videoInfo["default_thumbnail"] = f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/thumbnail.jpg"
+
+    return True
+
+def delete_video(video_id):
+    video_id = video_id.replace('https://vz-58ca89f1-986.b-cdn.net/', '').replace('/playlist.m3u8', '')
+
+    url = f"https://video.bunnycdn.com/library/{video_library_id}/videos/{video_id}"
+
+    headers = {"accept": "application/json", "AccessKey": access_key}
+
+    response = requests.delete(url, headers=headers)
+
+    return response.status_code
+
+def list_videos():
+    url = f"https://video.bunnycdn.com/library/{video_library_id}/videos"
+
+    params = {
+        "page": 1,
+        "itemsPerPage": 1000,
+        "orderBy": "date"
+    }
+
+    headers = {"accept": "application/json", "AccessKey": access_key}
+
+    videos = []
+    while True:
+        response = requests.get(url, headers=headers, params=params)
+
+        data = response.json()
+        videos += data['items']
+
+        if len(videos) == data['totalItems']:
+            return videos
+
+        params['page'] += 1
+
+def get_heatmap(video_id):
+    url = "https://video.bunnycdn.com/library/libraryId/videos/videoId/heatmap"
+    url = url.replace('libraryId', str(video_library_id)).replace('videoId', str(video_id))
+
+    headers = {"accept": "application/json", "AccessKey": access_key}
+
+    response = requests.get(url, headers=headers).json()
+
+    return response
+
+def get_video(video_id):
+    url = "https://video.bunnycdn.com/library/libraryId/videos/videoId"
+    url = url.replace('libraryId', str(video_library_id)).replace('videoId', str(video_id))
+
+    headers = {"accept": "application/json", "AccessKey": access_key}
+
+    response = requests.get(url, headers=headers).json()
+
+    return response
+
+
+def download_video(video_id, directory):
+    download_url = f'https://storage.bunnycdn.com/vz-dd4ea005-7c2/{video_id}/'
+
+    params = {'download': '', 'accessKey': '5b1766f7-c1ab-463f-b05cce6f1f2e-1190-4c09'}
+
+    video_response = requests.get(download_url, params=params)
+
+    if video_response.status_code == 200:
+        content_disposition = video_response.headers.get('Content-Disposition')
+        if content_disposition:
+            filename = content_disposition.split('filename=')[1].strip('"')
+            ext = filename.split('.')[-1]
+
+            filename = f'{video_id}.{ext}'
+            filePath = os.path.join(directory, filename)
+
+            with open(filePath, 'wb') as video_file:
+                video_file.write(video_response.content)
+            print(f'Video downloaded successfully as {filePath}')
+    else:
+        print('Failed to download video', video_response.status_code, video_response.text)
diff --git a/old/cleanup.py b/old/cleanup.py
new file mode 100644
index 0000000..edbde81
--- /dev/null
+++ b/old/cleanup.py
@@ -0,0 +1,23 @@
+import json
+
+with open('bunny_data/missing_videos.json', 'r') as f:
+    missing_videos = json.load(f)
+
+with open('bunny_data/allVideos.json', 'r') as f:
+    all_videos = json.load(f)
+
+all_videos_guids = {video['guid'] for video in all_videos}
+
+for video in missing_videos:
+    if video['guid'] in all_videos_guids:
+        video['imported'] = True
+
+combined_data = {
+    "missing_videos": missing_videos,
+    "all_videos": all_videos
+}
+
+with open('bunny_data/combined_videos.json', 'w') as f:
+    json.dump(combined_data, f, indent=4)
+
+print("Combined data has been written to bunny_data/combined_videos.json")
diff --git a/old/concat.py b/old/concat.py
new file mode 100644
index 0000000..691e3b1
--- /dev/null
+++ b/old/concat.py
@@ -0,0 +1,16 @@
+import os, json
+
+
+pins = open('db_pins.json', 'r')
+pins = json.load(pins)
+
+importedPins = open('db_pins_imported.json', 'r')
+importedPins = json.load(importedPins)
+
+allPins = pins + importedPins
+print(len(allPins))
+
+finalPins = open('allPins.json', 'r')
+finalPins = json.load(finalPins)
+
+print(len(finalPins))
\ No newline at end of file
diff --git a/old/dump_facebook.py b/old/dump_facebook.py
new file mode 100644
index 0000000..a62e4be
--- /dev/null
+++ b/old/dump_facebook.py
@@ -0,0 +1,112 @@
+from BunnyCDN.Storage import Storage
+import os, uuid, config, funcs, cv2
+from datetime import datetime
+from PIL import Image
+
+def dump_facebook(folder_path):
+    for filename in os.listdir(folder_path):
+        if os.path.isdir(os.path.join(folder_path, filename)):
+            continue
+
+        username = filename.split("'")[0]
+
+        filepath = os.path.join(folder_path, filename)
+
+        mediatype = funcs.get_media_type(filename)
+        post_type = funcs.determine_post_type(filepath, mediatype)
+
+        upload_file(username=username, media_type=mediatype, filepath=filepath, post_type=post_type)
+
+    for folder in os.listdir(folder_path):
+        if os.path.isdir(os.path.join(folder_path, folder)):
+            username = folder
+
+            for filename in os.listdir(os.path.join(folder_path, folder)):
+                filepath = os.path.join(folder_path, folder, filename)
+
+                mediatype = funcs.get_media_type(filename)
+                post_type = funcs.determine_post_type(filepath, mediatype)
+
+                upload_file(username=username, media_type=mediatype, filepath=filepath, post_type=post_type)
+
+def upload_file(filepath, username, media_type='image', post_type='story', timestamp=None, user_id=None):
+    filename = os.path.basename(filepath)
+    file_extension = os.path.splitext(filename)[1].lower()
+
+    file_hash = funcs.calculate_file_hash(filepath)
+
+    if file_hash in existing_files:
+        print('Duplicate file detected. Removing...')
+        os.remove(filepath)
+        return False
+
+    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0
+
+    if "FB_IMG" in filename: media_id = filename.split("_")[2].split(".")[0]
+    else: media_id = uuid.uuid4().hex
+
+    dirtype = funcs.determine_post_type(filepath, media_type)
+    server_path = os.path.join('media', dirtype, username, f'{media_id}{file_extension}')
+
+    obj_storage.PutFile(filepath, server_path)
+
+    file_url = f"https://storysave.b-cdn.net/{server_path}"
+
+    if media_type == 'image':
+        with Image.open(filepath) as img:
+            width, height = img.size
+    else:
+        width, height = funcs.get_video_dimensions(filepath)
+
+    thumbnail_url = None
+    if media_type == 'video':
+        thumbPath = f'temp/{media_id}.jpg'
+        cap = cv2.VideoCapture(filepath)
+        ret, frame = cap.read()
+        cv2.imwrite(thumbPath, frame)
+        cap.release()
+        obj_storage.PutFile(thumbPath, f'thumbnails/{media_id}.jpg')
+        thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{media_id}.jpg"
+
+    post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()
+
+    if post_type == 'stories':
+        post_type = 'story'
+    else:
+        post_type = 'post'
+
+    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, user_id, platform, hash, filename, duration, thumbnail) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
+    values = (username, media_type, file_url, width, height, post_type, post_date, user_id, 'facebook', file_hash, filename, duration, thumbnail_url)
+
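+    # INSERT IGNORE leaves cursor.rowcount at 0 when the row already exists,
+    # so the local file is only removed after a genuinely new row is inserted.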
+    try:
+        newCursor.execute(query, values)
+        newDB.commit()
+        print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
+    except Exception as e:
+        print(f"Database error: {e}")
+        return False
+
+    try:
+        if newCursor.rowcount > 0:
+            os.remove(filepath)
+    except Exception as e:
+        print(f"Failed to remove local file {filepath}: {e}")
+
+    return True
+
+
+if __name__ == '__main__':
+    print('Starting processing...')
+
+    newDB, newCursor = config.gen_connection()
+
+    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+    newCursor.execute("SELECT hash FROM media WHERE platform='facebook' AND hash IS NOT NULL")
+    existing_files = [image[0] for image in newCursor.fetchall()]
+
+    dump_facebook('facebook/')
+
+    print("Processing completed.")
\ No newline at end of file
diff --git a/old/dump_tiktok.py b/old/dump_tiktok.py
new file mode 100644
index 0000000..ca5cc1f
--- /dev/null
+++ b/old/dump_tiktok.py
@@ -0,0 +1,67 @@
+from BunnyCDN.Storage import Storage
+import os, uuid, config, funcs
+from datetime import datetime
+from PIL import Image
+
+def dump_tiktok(folder_path):
+    for folder in os.listdir(folder_path):
+        if os.path.isdir(os.path.join(folder_path, folder)):
+            username = folder
+
+            for filename in os.listdir(os.path.join(folder_path, folder)):
+                filepath = os.path.join(folder_path, folder, filename)
+
+                upload_file(username=username, filepath=filepath)
+
+def upload_file(filepath, username):
+    filename = os.path.basename(filepath)
+    media_id = filename.split('.')[0]
+
+    file_extension = os.path.splitext(filename)[1].lower()
+
+    media_type = funcs.get_media_type(filename)
+
+    file_hash = funcs.calculate_file_hash(filepath)
+
+    if file_hash in existing_files:
+        print('Duplicate file detected. Removing...')
+        os.remove(filepath)
+        return False
+
+    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0
+
+    width, height = funcs.get_video_dimensions(filepath) if media_type == 'video' else Image.open(filepath).size
+
+
+    dirtype = funcs.determine_post_type(filepath, media_type)
+    server_path = os.path.join('media', dirtype, username, f'{media_id}{file_extension}')
+
+    obj_storage.PutFile(filepath, server_path)
+
+    file_url = f"https://storysave.b-cdn.net/{server_path}"
+
+    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, platform, hash, filename, duration, media_id) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
+    values = (username, media_type, file_url, width, height, 'tiktok', file_hash, filename, duration, media_id)
+
+    newCursor.execute(query, values)
+    newDB.commit()
+    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
+
+    if newCursor.rowcount > 0:
+        os.remove(filepath)
+
+    return True
+
+if __name__ == '__main__':
+    print('Starting processing...')
+
+    newDB, newCursor = config.gen_connection()
+
+    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+    newCursor.execute("SELECT hash FROM media WHERE platform='tiktok' AND hash IS NOT NULL")
+    existing_files = [image[0] for image in newCursor.fetchall()]
+
+    dump_tiktok('tiktok/')
+
+    print("Processing completed.")
\ No newline at end of file
diff --git a/old/editor.py b/old/editor.py
new file mode 100644
index 0000000..e5c9d33
--- /dev/null
+++ b/old/editor.py
@@ -0,0 +1,38 @@
+import os, json
+
+def getMedia(filename, items):
+    for item in items:
+        if filename.split('.')[0] in item['filepath']:
+            return item
+    return None
+
+
+data = json.loads(open('oldpins.json').read())
+files = os.listdir('STORAGE')
+
+count = 0
+for file in files:
+    filepath = f'STORAGE/{file}'
+
+    if os.path.isdir(filepath):
+        continue
+    media = getMedia(file, data)
+    if not media:
+        continue
+
+    username = media['title']
+    filetype = media['type']
+    filetype = 'jpg' if filetype == 'image' else 'mp4'
+    filename = media['filepath'].split('/')[-1] + '.' + filetype
+
+    output = os.path.join('STORAGE', username, filename)
+    os.makedirs(os.path.dirname(output), exist_ok=True)
+    if os.path.exists(output):
+        os.remove(output)
+    output = os.path.join('STORAGE', username, file)
+    os.rename(filepath, output)
+
+    count += 1
+    print(f'File: {file}')
+
+print(f'Total: {count}')
\ No newline at end of file
diff --git a/old/fillhash.py b/old/fillhash.py
new file mode 100644
index 0000000..fad94ae
--- /dev/null
+++ b/old/fillhash.py
@@ -0,0 +1,45 @@
+import funcs, json, os, config
+
+db, newCursor = config.gen_connection()
+
+newCursor.execute("SELECT hash FROM media")
+hashes = [hash[0] for hash in newCursor.fetchall()]
+
+file = 'bunnyVideos.json'
+
+data = json.loads(open(file).read())
+
+for media in data:
+    if media['imported'] == True:
+        if not os.path.exists(media['filepath']):
+            print(f'File {media["filepath"]} does not exist. Skipping...')
+            continue
+
+
+countImported = 0
+countSkipped = 0
+for media in data:
+    filepath = os.path.join('STREAM_VIDEOS_IMPORTED', media['guid'] + '.mp4')
+    if media['imported'] == True:
+        countImported += 1
+        print('File already imported. Skipping...')
+        continue
+
+    countSkipped += 1
+
+    if not os.path.exists(filepath):
+        print(f'File {filepath} does not exist. Skipping...')
+        continue
+
+    hash = funcs.calculate_file_hash(filepath)
+
+    if '67caa15e-390c-4223-b7b9-4d7842f3b443' in filepath:
+        print(f'Skipping known-bad file {filepath}...')
+        continue
+
+    if hash in hashes:
+        print('Duplicate file detected. Removing...')
+
+
+print(f'Imported: {countImported}')
+print(f'Skipped: {countSkipped}')
\ No newline at end of file
diff --git a/old/find_dupes_by_phash.py b/old/find_dupes_by_phash.py
new file mode 100644
index 0000000..7083b74
--- /dev/null
+++ b/old/find_dupes_by_phash.py
@@ -0,0 +1,17 @@
+from funcs import get_files, generate_phash
+import os, config
+
+
+db, cursor = config.gen_connection()
+cursor.execute("SELECT phash FROM media WHERE phash IS NOT NULL;")
+phashes = [x[0] for x in cursor.fetchall()]
+
+files = get_files('images')
+for item in files:
+    phash = generate_phash(item)
+    if phash in phashes:
+        print(item)
+        newpath = item.replace('images', 'duplicates')
+        newdir = os.path.dirname(newpath)
+        os.makedirs(newdir, exist_ok=True)
+        os.rename(item, newpath)
\ No newline at end of file
diff --git a/old/fixes/fix_facebook_missing_uploads.py b/old/fixes/fix_facebook_missing_uploads.py
new file mode 100644
index 0000000..d1e2cbe
--- /dev/null
+++ b/old/fixes/fix_facebook_missing_uploads.py
@@ -0,0 +1,56 @@
+from BunnyCDN.Storage import Storage
+import os, config, requests
+from moviepy.editor import VideoFileClip
+
+def get_media_type(filename):
+    image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".webp"}
+    video_extensions = {".mp4", ".mov"}
+    extension = os.path.splitext(filename.lower())[1]
+    if extension in image_extensions:
+        return 'image'
+    elif extension in video_extensions:
+        return 'video'
+    else:
+        return 'unknown'
+
+def determine_post_type(media_type):
+    # Assuming the post type is directly based on media type.
+    return media_type
+
+def get_video_dimensions(filepath):
+    with VideoFileClip(filepath) as clip:
+        width, height = clip.size
+    return width, height
+
+def download_file(url):
+    local_filename = url.split('/')[-1]
+    # Note: Stream=True to avoid loading the whole file into memory
+    with requests.get(url, stream=True) as r:
+        r.raise_for_status()
+        with open(local_filename, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=8192):
+                f.write(chunk)
+    return local_filename
+
+if __name__ == '__main__':
+    newDB, newCursor = config.gen_connection()
+    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+    posts = open('fucked', 'r')
+
+    for item in posts:
+        username, url = item.strip().split('~')
+        media_id = url.split('/')[-1].split('.')[0]
+        media_type = get_media_type(url)
+
+        query = "INSERT IGNORE INTO media (username, media_type, platform, media_url) VALUES (%s, %s, %s, %s)"
+        values = (username, media_type, 'facebook', url)
+
+        try:
+            newCursor.execute(query, values)
+            newDB.commit()
+            print(f'[{newCursor.rowcount}] records updated.{url}')
+        except Exception as e:
+            print(f"Database error: {e}")
+
+    posts.close()
diff --git a/old/fixes/fixduration.py b/old/fixes/fixduration.py
new file mode 100644
index 0000000..64f507b
--- /dev/null
+++ b/old/fixes/fixduration.py
@@ -0,0 +1,94 @@
+from BunnyCDN.Storage import Storage
+from moviepy.editor import VideoFileClip
+import config
+import hashlib
+import requests
+import os
+
+def file_hash_from_url(url, hash_algo='sha256'):
+    h = hashlib.new(hash_algo)
+
+    response = requests.get(url, stream=True)
+
+    if response.status_code == 200:
+        for chunk in response.iter_content(8192):
+            h.update(chunk)
+        return h.hexdigest()
+    else:
+        raise Exception(f"Failed to download file: Status code {response.status_code}")
+
+def get_video_duration(file_path):
+    """
+    Returns the duration of the video file in seconds.
+
+    :param file_path: Path to the video file
+    :return: Duration in seconds
+    """
+    try:
+        with VideoFileClip(file_path) as video:
+            return video.duration
+    except Exception:
+        return 0
+
+def file_hash(filename, hash_algo='sha256'):
+    """
+    Compute the hash of a file.
+
+    :param filename: Path to the file.
+    :param hash_algo: Hashing algorithm to use (e.g., 'sha256', 'md5').
+    :return: Hexadecimal hash string.
+    """
+    # Create a hash object
+    h = hashlib.new(hash_algo)
+
+    # Open the file in binary mode and read in chunks
+    with open(filename, 'rb') as file:
+        while chunk := file.read(8192):
+            h.update(chunk)
+
+    # Return the hexadecimal digest of the hash
+    return h.hexdigest()
+
+# the hash of the images are different due to optimizer
+
+#obj_storage = Storage('577cb82d-8176-4ccf-935ce0a574bf-fe4c-4012', 'altpins')
+obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+db, cursor = config.gen_connection()
+
+cursor.execute("SELECT id, media_id, media_url FROM media WHERE duration = 0 AND media_type = 'video' AND status != 'deleted';")
+results = cursor.fetchall()
+
+count = 0
+print(f"Found {len(results)} files to process.")
+
+cacheDir = 'cache'
+for result in results:
+    count += 1
+    videoID = result[0]
+    mediaID = result[1]
+    mediaURL = result[2]
+    extension = mediaURL.split('.')[-1]
+
+    serverPath = result[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
+
+    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))
+
+    if os.path.exists(localFilePath):
+        print(f"File already exists: {localFilePath}")
+    else:
+        obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)
+
+    duration = get_video_duration(localFilePath)
+
+    if duration == 0:
+        print(f"Failed to get duration for {localFilePath}")
+        continue
+
+    if duration < 1:
+        duration = 1
+
+    cursor.execute("UPDATE media SET duration = %s WHERE id = %s;", (duration, result[0]))
+    db.commit()
+
+    print(f"[{count}/{len(results)}] {result[1]}: {duration}, {cursor.rowcount}")
\ No newline at end of file
diff --git a/old/fixes/fixhash.py b/old/fixes/fixhash.py
new file mode 100644
index 0000000..1bd374e
--- /dev/null
+++ b/old/fixes/fixhash.py
@@ -0,0 +1,47 @@
+from BunnyCDN.Storage import Storage
+import config
+import hashlib
+import os
+
+def file_hash(filename, hash_algo='sha256'):
+    """
+    Compute the hash of a file.
+
+    :param filename: Path to the file.
+    :param hash_algo: Hashing algorithm to use (e.g., 'sha256', 'md5').
+    :return: Hexadecimal hash string.
+ """ + h = hashlib.new(hash_algo) + + with open(filename, 'rb') as file: + while chunk := file.read(8192): + h.update(chunk) + + return h.hexdigest() + + +#obj_storage = Storage('577cb82d-8176-4ccf-935ce0a574bf-fe4c-4012', 'altpins') +obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave') + +db, cursor = config.gen_connection() + +cursor.execute("SELECT id, media_id, media_url FROM media WHERE hash IS NULL;") +results = cursor.fetchall() + +count = 0 +print(f"Found {len(results)} files to process.") + +for result in results: + count += 1 + serverPath = result[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/') + + localFilePath = os.path.join(os.getcwd(), 'temp', os.path.basename(serverPath)) + if not os.path.exists(localFilePath): + obj_storage.DownloadFile(storage_path=serverPath, download_path=os.path.join(os.getcwd(), 'temp')) + + filehash = file_hash(localFilePath) + + cursor.execute("UPDATE media SET hash = %s WHERE id = %s;", (filehash, result[0])) + db.commit() + + print(f"[{count}/{len(results)}] {result[1]}: {filehash}, {cursor.rowcount}") \ No newline at end of file diff --git a/old/fixes/fixphashes.py b/old/fixes/fixphashes.py new file mode 100644 index 0000000..3c881ff --- /dev/null +++ b/old/fixes/fixphashes.py @@ -0,0 +1,41 @@ +import config, os +from PIL import Image +import imagehash + +def generate_phash(image_path): + image = Image.open(image_path) + return str(imagehash.phash(image)) + +db, cursor = config.gen_connection() + +cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'image' AND phash IS NULL;") +results = cursor.fetchall() + +count = 0 +cacheDir = 'cache' +os.makedirs(cacheDir, exist_ok=True) +print(f"Found {len(results)} files to process.") + + +for result in results: + count += 1 + itemID = result[0] + mediaID = result[1] + mediaURL = result[2] + + serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/') + localFilePath = os.path.join(cacheDir, os.path.basename(serverPath)) + + if not os.path.exists(localFilePath): + print(f"File {localFilePath} does not exist, skipping.") + continue + + try: + phash = generate_phash(localFilePath) + + cursor.execute("UPDATE media SET phash = %s WHERE id = %s", (phash, itemID)) + db.commit() + + print(f"Processed {count}/{len(results)}: {mediaID} with pHash {phash}") + except Exception as e: + print(f"Error processing {mediaID}: {e}") \ No newline at end of file diff --git a/old/fixes/fixresolution.py b/old/fixes/fixresolution.py new file mode 100644 index 0000000..4fa15e9 --- /dev/null +++ b/old/fixes/fixresolution.py @@ -0,0 +1,47 @@ +from BunnyCDN.Storage import Storage +import config, os, funcs +from PIL import Image + +# the hash of the images are different due to optimizer + +#obj_storage = Storage('577cb82d-8176-4ccf-935ce0a574bf-fe4c-4012', 'altpins') +obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave') + +db, cursor = config.gen_connection() + +cursor.execute("SELECT id, media_id, media_url FROM media WHERE width = 0;") +results = cursor.fetchall() + +count = 0 +print(f"Found {len(results)} files to process.") + +cacheDir = 'cache' +for result in results: + count += 1 + videoID = result[0] + mediaID = result[1] + mediaURL = result[2] + extension = mediaURL.split('.')[-1] + + serverPath = result[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/') + + localFilePath = os.path.join(cacheDir, os.path.basename(serverPath)) + + if 
+    if os.path.exists(localFilePath):
+        print(f"File already exists: {localFilePath}")
+    else:
+        obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)
+
+    mediaType = funcs.get_media_type(localFilePath)
+
+    if mediaType == 'image':
+        with Image.open(localFilePath) as img:
+            width, height = img.size
+    elif mediaType == 'video':
+        width, height = funcs.get_video_dimensions(localFilePath)
+    else:
+        continue
+
+    cursor.execute("UPDATE media SET width = %s, height=%s WHERE id = %s;", (width, height, videoID))
+    db.commit()
+
+    print(f"[{count}/{len(results)}] width: {width}, height: {height} {cursor.rowcount}")
\ No newline at end of file
diff --git a/old/fixes/fixthumbnails.py b/old/fixes/fixthumbnails.py
new file mode 100644
index 0000000..94861ca
--- /dev/null
+++ b/old/fixes/fixthumbnails.py
@@ -0,0 +1,66 @@
+from BunnyCDN.Storage import Storage
+import config, os, cv2
+from concurrent.futures import ThreadPoolExecutor
+
+# this script will take a screenshot of the first frame of each video and upload it as a thumbnail to BunnyCDN
+
+obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+db, cursor = config.gen_connection()
+
+cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'video' AND thumbnail IS NULL and status = 'public';")
+results = cursor.fetchall()
+
+count = 0
+print(f"Found {len(results)} files to process.")
+
+cacheDir = 'cache'
+
+def DownloadFile(serverPath, cacheDir):
+    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))
+
+    if os.path.exists(localFilePath):
+        print(f"File already exists: {localFilePath}")
+        return localFilePath
+
+    obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)
+    print(f"Downloaded {serverPath} to {localFilePath}")
+    return localFilePath
+
+def ImportMedias():
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        for video in results:
+            serverPath = video[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
+            executor.submit(DownloadFile, serverPath, cacheDir)
+
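+# ImportMedias() can be called before the loop below to prefetch every video
+# into the cache with 10 parallel downloads; the loop itself downloads files
+# one at a time on demand.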
{localFilePath}") + return localFilePath + +def ImportMedias(results): + with ThreadPoolExecutor(max_workers=10) as executor: + for video in results: + serverPath = video[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/') + executor.submit(DownloadFile, serverPath, cacheDir) + + +obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave') + +db, cursor = config.gen_connection() + +cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'image' AND phash IS NULL;") +results = cursor.fetchall() + + +count = 0 +cacheDir = 'cache' +print(f"Found {len(results)} files to process.") + +ImportMedias(results) \ No newline at end of file diff --git a/old/fixphashes.py b/old/fixphashes.py new file mode 100644 index 0000000..58d430b --- /dev/null +++ b/old/fixphashes.py @@ -0,0 +1,24 @@ +import os, json +from funcs import generate_phash + +count = 0 +cacheDir = 'cache' +dataPath = 'pins.json' + +os.makedirs(cacheDir, exist_ok=True) + +medias = json.load(open(dataPath)) + +for item in medias: + count += 1 + if item['type'] == 'image': + filepath = item['filepath'] + if not os.path.exists(filepath): + print(f"File {filepath} does not exist, skipping.") + continue + phash = generate_phash(filepath) + item['phash'] = phash + print(f"Processed {count}/{len(medias)}: with pHash {phash}") + +with open(dataPath, 'w') as f: + json.dump(medias, f) \ No newline at end of file diff --git a/old/generate_missing_phash_db.py b/old/generate_missing_phash_db.py new file mode 100644 index 0000000..e2e5211 --- /dev/null +++ b/old/generate_missing_phash_db.py @@ -0,0 +1,33 @@ +import config +from funcs import generate_phash + +count = 0 + +storage = config.get_storage() + +db, cursor = config.gen_connection() + +cursor.execute("SELECT id, media_url FROM media WHERE media_type = %s AND phash IS NULL;", ['image']) +medias = cursor.fetchall() + +for item in medias: + count += 1 + + itemID = item[0] + media_url = item[1] + + server_path = media_url.replace('https://storysave.b-cdn.net/', '').replace('\\', '/') + filepath = storage.DownloadFile(server_path, 'temp') + if not filepath: + print(f"Error downloading {server_path}") + continue + + phash = generate_phash(filepath) + if not phash: + print(f"Error generating pHash for {filepath}") + continue + + cursor.execute("UPDATE media SET phash = %s WHERE id = %s", [phash, itemID]) + db.commit() + + print(f"[{cursor.rowcount}] Processed {count}/{len(medias)}: with pHash {phash}") \ No newline at end of file diff --git a/old/generate_missing_phash_db_videos.py b/old/generate_missing_phash_db_videos.py new file mode 100644 index 0000000..a67c1f5 --- /dev/null +++ b/old/generate_missing_phash_db_videos.py @@ -0,0 +1,33 @@ +import config +from funcs import generate_phash + +count = 0 + +storage = config.get_storage() + +db, cursor = config.gen_connection() + +cursor.execute("SELECT id, thumbnail FROM media WHERE media_type = %s AND phash IS NULL AND thumbnail IS NOT NULL;", ['video']) +medias = cursor.fetchall() + +for item in medias: + count += 1 + + itemID = item[0] + media_url = item[1] + + server_path = media_url.replace('https://storysave.b-cdn.net/', '').replace('\\', '/') + filepath = storage.DownloadFile(server_path, 'temp') + if not filepath: + print(f"Error downloading {server_path}") + continue + + phash = generate_phash(filepath) + if not phash: + print(f"Error generating pHash for {filepath}") + continue + + cursor.execute("UPDATE media SET phash = %s WHERE id = %s", [phash, itemID]) + 
+
+for item in medias:
+    count += 1
+
+    itemID = item[0]
+    media_url = item[1]
+
+    server_path = media_url.replace('https://storysave.b-cdn.net/', '').replace('\\', '/')
+    filepath = storage.DownloadFile(server_path, 'temp')
+    if not filepath:
+        print(f"Error downloading {server_path}")
+        continue
+
+    phash = generate_phash(filepath)
+    if not phash:
+        print(f"Error generating pHash for {filepath}")
+        continue
+
+    cursor.execute("UPDATE media SET phash = %s WHERE id = %s", [phash, itemID])
+    db.commit()
+
+    print(f"[{cursor.rowcount}] Processed {count}/{len(medias)}: with pHash {phash}")
\ No newline at end of file
diff --git a/old/old_CLEAN_FROM_OLD_DUPES.py b/old/old_CLEAN_FROM_OLD_DUPES.py
new file mode 100644
index 0000000..0bb75ef
--- /dev/null
+++ b/old/old_CLEAN_FROM_OLD_DUPES.py
@@ -0,0 +1,24 @@
+import config
+
+altpins_db, altpins_cursor = config.altpins_gen_connection()
+db, cursor = config.gen_connection()
+
+altpins_cursor.execute("SELECT id, title, hash, url FROM pins WHERE hash IS NOT NULL;")
+altpins_results = { (row[1], row[2]): (row[0], row[3]) for row in altpins_cursor.fetchall() }
+
+cursor.execute("SELECT id, username, hash, media_url FROM media WHERE hash IS NOT NULL;")
+media_results = { (row[1], row[2]): (row[0], row[3]) for row in cursor.fetchall() }
+
+common_items = set(altpins_results.keys()) & set(media_results.keys())
+
+for title, hash_value in common_items:
+    altpins_id, altpins_url = altpins_results[(title, hash_value)]
+    media_id, media_url = media_results[(title, hash_value)]
+
+    print(f"Found a match for hash {hash_value} with title {title}")
+    print(f"Altpins URL: {altpins_url}")
+    print(f"Media URL: {media_url}")
+
+    altpins_cursor.execute("DELETE FROM pins WHERE id = %s;", [altpins_id])
+    altpins_db.commit()
+    print(f"Deleted pin {altpins_id}. {altpins_cursor.rowcount} rows affected")
\ No newline at end of file
diff --git a/old/old_CREATE_VIDEOS_LIST.py b/old/old_CREATE_VIDEOS_LIST.py
new file mode 100644
index 0000000..ba2f477
--- /dev/null
+++ b/old/old_CREATE_VIDEOS_LIST.py
@@ -0,0 +1,33 @@
+import bunny, json
+
+medias = json.load(open('videos.json', 'r'))
+videoIDS = [media['url'].split('/')[-1] for media in medias]
+
+videos = bunny.list_videos()
+
+with open('allVideos.json', 'w') as f:
+    json.dump(videos, f, indent=4)
+
+missingVideos = []
+for video in videos:
+    if video['guid'] in videoIDS:
+        continue
+    missingVideos.append(video)
+
+datas = []
+for video in missingVideos:
+    data = {
+        'guid': video['guid'],
+        'title': video['title'],
+        'length': video['length'],
+        'width': video['width'],
+        'height': video['height'],
+        'availableResolutions': video['availableResolutions'],
+        'storageSize': video['storageSize'],
+        'hasMP4Fallback': video['hasMP4Fallback'],
+        'category': video['category'],
+    }
+    datas.append(data)
+
+with open('missing_videos.json', 'w') as f:
+    json.dump(datas, f, indent=4)
diff --git a/old/old_DOWNLOAD_STORAGE.py b/old/old_DOWNLOAD_STORAGE.py
new file mode 100644
index 0000000..ad857b5
--- /dev/null
+++ b/old/old_DOWNLOAD_STORAGE.py
@@ -0,0 +1,27 @@
+from BunnyCDN.Storage import Storage
+import os, json
+
+altpins_obj_storage = Storage('577cb82d-8176-4ccf-935ce0a574bf-fe4c-4012', 'altpins')
+obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+medias = json.load(open('db_pins.json', 'r'))
+
+count = 0
+print(f"Found {len(medias)} files to process.")
+
+cacheDir = 'old_altpins_cache'
+for media in medias:
+    count += 1
+    username = media['title']
+    mediaID = media['photo_id']
+    mediaURL = media['url']
+    extension = mediaURL.split('.')[-1]
+
+    serverPath = mediaURL.replace("https://altpins.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
+    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))
+
+    if os.path.exists(localFilePath):
+        continue
+
+    altpins_obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)
+    print(f"Downloaded {count}/{len(medias)}: {localFilePath}")
\ No newline at end of file
diff --git a/old/old_DOWNLOAD_STREAM.py b/old/old_DOWNLOAD_STREAM.py
new file mode 100644
index 0000000..b8ca36f
--- /dev/null
+++ b/old/old_DOWNLOAD_STREAM.py
@@ -0,0 +1,16 @@
+import json, bunny, os
+from concurrent.futures import ThreadPoolExecutor
+
+medias = json.load(open('missing_videos.json', 'r'))
+#videoIDS = [media['url'].split('/')[-1] for media in medias]
+videoIDS = [media['guid'] for media in medias]
+
+with ThreadPoolExecutor(max_workers=10) as executor:
+    for id in videoIDS:
+        filePath = f"MISSING_STREAM_VIDEOS/{id}.zip"
+
+        if os.path.exists(filePath):
+            print(f'Video already exists as {filePath}. Skipping...')
+            continue
+
+        executor.submit(bunny.download_video, id, 'MISSING_STREAM_VIDEOS')
\ No newline at end of file
diff --git a/old/old_IMPORTED_PINS_CLEANUP.py b/old/old_IMPORTED_PINS_CLEANUP.py
new file mode 100644
index 0000000..cf50199
--- /dev/null
+++ b/old/old_IMPORTED_PINS_CLEANUP.py
@@ -0,0 +1,27 @@
+import os, json, config
+
+# Load the data
+pins = json.load(open('db_pins.json', 'r'))
+files = os.listdir('STORAGE_IMPORTED/')
+
+db, cursor = config.gen_connection()
+
+cursor.execute('SELECT hash FROM media WHERE hash IS NOT NULL;')
+existing_hashes = [hash[0] for hash in cursor.fetchall()]
+
+# Drop pins whose hash is already in the database
+pins = [pin for pin in pins if pin['hash'] not in existing_hashes]
+
+alreadyImported = []
+for pin in pins[:]:
+    filepath = pin['filepath']
+    filename = os.path.basename(filepath)
+
+    if filename in files:
+        print(f"Found {filename} in the imported folder.")
+        alreadyImported.append(pin)
+        pins.remove(pin)
+
+# Save to the file
+json.dump(pins, open('db_pins.json', 'w'))
+json.dump(alreadyImported, open('db_pins_imported.json', 'w'))
\ No newline at end of file
diff --git a/old/old_SCAN_MP4.py b/old/old_SCAN_MP4.py
new file mode 100644
index 0000000..b140418
--- /dev/null
+++ b/old/old_SCAN_MP4.py
@@ -0,0 +1,14 @@
+import os, json, bunny
+
+medias = json.load(open('allVideos.json', 'r'))
+mp4Medias = [media for media in medias if media['hasMP4Fallback'] == True]
+
+missing = json.load(open('missing_videos.json', 'r'))
+
+count = 0
+cacheDir = 'old_mp4fallback_cache'
+print(f"Found {len(medias)} files to process.")
+for media in mp4Medias:
+    count += 1
+    filePath = os.path.join(cacheDir, media['guid'] + '.mp4')
+
\ No newline at end of file
diff --git a/old/old_SORT_MISSING.py b/old/old_SORT_MISSING.py
new file mode 100644
index 0000000..d444b27
--- /dev/null
+++ b/old/old_SORT_MISSING.py
@@ -0,0 +1,34 @@
+import os, json, bunny, config
+
+db, cursor = config.gen_connection()
+
+cursor.execute('SELECT media_id FROM media WHERE media_id IS NOT NULL;')
+mediaIDS = cursor.fetchall()
+
+
+
+pins = json.load(open('pins.json', 'r'))
+
+videos = json.load(open('db_videos.json', 'r'))
+pins = json.load(open('db_pins.json', 'r'))
+ids = [video['id'] for video in videos]
+
+pins = [pin for pin in pins if pin['id'] not in ids]
+
+# save to the file
+json.dump(pins, open('db_pins.json', 'w'))
+
+
+medias = json.load(open('allVideos.json', 'r'))
+mp4Medias = [media for media in medias if media['hasMP4Fallback'] == True]
+
+missing = json.load(open('missing_videos.json', 'r'))
+
+count = 0
+cacheDir = 'old_mp4fallback_cache'
+print(f"Found {len(medias)} files to process.")
+for media in mp4Medias:
+    count += 1
+    filePath = os.path.join(cacheDir, media['guid'] + '.mp4')
+
+
\ No newline at end of file
diff --git a/old/old_SORT_PINS.py b/old/old_SORT_PINS.py
new file mode 100644
index 0000000..ee873e9
--- /dev/null
+++ b/old/old_SORT_PINS.py
@@ -0,0 +1,53 @@
+import os, json, funcs
+
+STORAGE_IMPORTED = 'STORAGE_IMPORTED'
+pins = json.load(open('db_pins.json', 'r'))
+
+for pin in pins:
+    filename = pin['url'].split('/')[-1]
+    filepath = os.path.join(STORAGE_IMPORTED, filename)
+    pin['filename'] = filename
+    if not pin['hash']:
+        pin['hash'] = funcs.calculate_file_hash(filepath)
+
+json.dump(pins, open('db_pins.json', 'w'), indent=4)
+
+files = os.listdir(STORAGE_IMPORTED)
+
+for file in files:
+    filepath = os.path.join(STORAGE_IMPORTED, file)
+    fileHash = funcs.calculate_file_hash(filepath)
+    if fileHash not in file:
+        print(f'Renaming {file} to {fileHash}{os.path.splitext(file)[1]}')
+        os.rename(filepath, os.path.join(STORAGE_IMPORTED, fileHash + os.path.splitext(file)[1]))
+
+pins_by_username = {}
+for pin in pins:
+    username = pin['title']
+    if username not in pins_by_username:
+        pins_by_username[username] = []
+    pins_by_username[username].append(pin)
+
+for username, username_pins in pins_by_username.items():
+    username_folder = os.path.join(STORAGE_IMPORTED, username)
+    os.makedirs(username_folder, exist_ok=True)
+    for pin in username_pins:
+        photo_id = pin['photo_id']
+        photo_url = pin['url']
+        fileHash = pin['hash']
+
+        if not fileHash:
+            continue
+
+        extension = photo_url.split('.')[-1]
+        filename = f'{fileHash}.{extension}'
+
+        filePath = os.path.join(STORAGE_IMPORTED, filename)
+        outputPath = os.path.join(STORAGE_IMPORTED, username, filename)
+
+        if os.path.exists(outputPath):
+            print(f'File {outputPath} already exists. Skipping...')
+            continue
+
+        print(f'Moving {photo_url} to {outputPath}')
+        os.rename(filePath, outputPath)
\ No newline at end of file
diff --git a/old/organize.py b/old/organize.py
new file mode 100644
index 0000000..f8c3123
--- /dev/null
+++ b/old/organize.py
@@ -0,0 +1,27 @@
+import os, json
+
+
+
+folderPath = 'STREAM_IMPORTED'
+jsonFile = 'bunnyVideos.json'
+
+data = json.load(open(jsonFile))
+
+for item in data:
+    username = item['title']
+    filepath = os.path.join(folderPath, item['guid'] + '.mp4')
+
+    if username in filepath:
+        continue
+
+    username = item['title']
+    output = os.path.join(folderPath, username, os.path.basename(filepath))
+    os.makedirs(os.path.dirname(output), exist_ok=True)
+    if os.path.exists(filepath):
+        os.rename(filepath, output)
+        item['filepath'] = output
+
+
+# save to file
+with open(jsonFile, 'w') as f:
+    json.dump(data, f, indent=4)
\ No newline at end of file
hashes[username]): + continue + + try: + video_hash = VideoHash(path=video_path) + if username in hashes: + hashes[username].append((video_file, video_hash.hash)) + else: + hashes[username] = [(video_file, video_hash.hash)] + except Exception as e: + print(f"Error processing {video_file}: {str(e)}") + +save_hashes(hashes, 'video_hashes.json') diff --git a/old/organizer_compare.py b/old/organizer_compare.py new file mode 100644 index 0000000..318465e --- /dev/null +++ b/old/organizer_compare.py @@ -0,0 +1,44 @@ +from moviepy.editor import VideoFileClip +import json + +def is_valid_video(file_path): + try: + with VideoFileClip(file_path) as video: + return True + except Exception as e: + print(f"Invalid video {file_path}: {str(e)}") + return False + +def load_hashes(file_path): + try: + with open(file_path, 'r') as file: + return json.load(file) + except FileNotFoundError: + return {} + +def save_hashes(hashes, file_path): + with open(file_path, 'w') as file: + json.dump(hashes, file, indent=4) + +def find_duplicates(video_hashes): + hash_map = {} + for video, v_hash in video_hashes: + if v_hash in hash_map: + hash_map[v_hash].append(video) + else: + hash_map[v_hash] = [video] + + duplicates = {h: vids for h, vids in hash_map.items() if len(vids) > 1} + return duplicates + +hashes = load_hashes('video_hashes.json') +for username, user_hashes in hashes.items(): + print(f"Checking for duplicates in '{username}' videos:") + duplicates = find_duplicates(user_hashes) + if duplicates: + for dup_hash, dup_videos in duplicates.items(): + print(f"Duplicate hash: {dup_hash}") + for vid in dup_videos: + print(f" - {vid}") + else: + print("No duplicates found.") diff --git a/old/organizer_images.py b/old/organizer_images.py new file mode 100644 index 0000000..8efe6f1 --- /dev/null +++ b/old/organizer_images.py @@ -0,0 +1,48 @@ +from videohash import VideoHash +import os + +# Directory containing videos grouped by username +video_directory = '/path/to/videos' +hashes = {} + +for username in os.listdir(video_directory): + user_dir = os.path.join(video_directory, username) + if os.path.isdir(user_dir): + for video_file in os.listdir(user_dir): + if video_file.endswith(('.mp4', '.mkv', '.avi')): # Ensure it's a video file + video_path = os.path.join(user_dir, video_file) + try: + # Calculate the hash for each video + video_hash = VideoHash(path=video_path) + print(f"Hash for {video_file}: {video_hash.hash}") + + # Store hashes in a dictionary + if username in hashes: + hashes[username].append((video_file, video_hash.hash)) + else: + hashes[username] = [(video_file, video_hash.hash)] + except Exception as e: + print(f"Error processing {video_file}: {str(e)}") + +def find_duplicates(hashes): + duplicate_videos = [] + all_hashes = [(user, video, hsh) for user, videos in hashes.items() for video, hsh in videos] + hash_dict = {} + + for user, video, hsh in all_hashes: + if hsh in hash_dict: + hash_dict[hsh].append((user, video)) + else: + hash_dict[hsh] = [(user, video)] + + for videos in hash_dict.values(): + if len(videos) > 1: + duplicate_videos.append(videos) + + return duplicate_videos + +duplicates = find_duplicates(hashes) +for duplicate in duplicates: + print("Duplicate videos found:") + for video_info in duplicate: + print(f"User: {video_info[0]}, Video: {video_info[1]}") diff --git a/old/splitfiles.py b/old/splitfiles.py new file mode 100644 index 0000000..e96d195 --- /dev/null +++ b/old/splitfiles.py @@ -0,0 +1,49 @@ +import os, json + +def get_file_type(filepath): + if filepath.endswith('.jpg') or 
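The comparison scripts above only group exact hash matches. VideoHash values are bit strings, so near-duplicates (re-encodes, slightly trimmed clips) typically differ by a few bits; a Hamming-distance check is a common extension. A sketch, assuming the stored values are the '0b'-prefixed bit strings that videohash exposes as .hash:

def hamming_distance(hash_a, hash_b):
    a = hash_a[2:] if hash_a.startswith('0b') else hash_a
    b = hash_b[2:] if hash_b.startswith('0b') else hash_b
    return sum(x != y for x, y in zip(a, b))

def are_near_duplicates(hash_a, hash_b, threshold=8):
    # threshold is a guess; tune it against known duplicate pairs
    return hamming_distance(hash_a, hash_b) <= threshold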
diff --git a/old/splitfiles.py b/old/splitfiles.py
new file mode 100644
index 0000000..e96d195
--- /dev/null
+++ b/old/splitfiles.py
@@ -0,0 +1,49 @@
+import os, json
+
+def get_file_type(filepath):
+    if filepath.endswith('.jpg') or filepath.endswith('.png'):
+        return 'image'
+    elif filepath.endswith('.mp4'):
+        return 'video'
+    else:
+        return None
+
+def get_files(directory):
+    files = []
+    for root, dirs, filenames in os.walk(directory):
+        for filename in filenames:
+            files.append(os.path.join(root, filename))
+    return files
+
+files = get_files('STORAGE/')
+os.makedirs('images', exist_ok=True)
+os.makedirs('videos', exist_ok=True)
+
+for filepath in files:
+    if not os.path.exists(filepath):
+        print(f"File {filepath} does not exist, skipping.")
+        continue
+
+    # Extract the username from the filepath, assuming the structure STORAGE/{username}/{filename}
+    filepath = filepath.replace('\\', '/')  # Normalize Windows backslashes to forward slashes
+    parts = filepath.split('/')  # Split the normalized path on '/'
+    if len(parts) < 3 or parts[0] != 'STORAGE':  # Check if the structure is valid
+        print(f"Unexpected filepath format: {filepath}")
+        continue
+
+    username = parts[1]  # Extract the username from the second part
+    fileType = get_file_type(filepath)  # Determine the type of the file
+    if not fileType:
+        print(f"Unknown file type for {filepath}")
+        continue
+
+    # fileType can only be 'image' or 'video' at this point
+    if fileType == 'image':
+        newpath = os.path.join('images', username, os.path.basename(filepath))
+    else:
+        newpath = os.path.join('videos', username, os.path.basename(filepath))
+
+    os.makedirs(os.path.dirname(newpath), exist_ok=True)  # Create directory structure if it doesn't exist
+    os.rename(filepath, newpath)  # Move the file to the new location
\ No newline at end of file
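get_file_type() above only recognizes three extensions. A slightly more general variant (a sketch using the standard library, not a drop-in requirement) infers the class from the MIME type:

import mimetypes

def get_file_type(filepath):
    mime, _ = mimetypes.guess_type(filepath)
    if mime is None:
        return None
    if mime.startswith('image/'):
        return 'image'
    if mime.startswith('video/'):
        return 'video'
    return None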

diff --git a/old/templates/index.html b/old/templates/index.html
new file mode 100644
index 0000000..0ce593c
--- /dev/null
+++ b/old/templates/index.html
@@ -0,0 +1,34 @@
+[34 lines of markup stripped during extraction: an "Image Gallery" page template rendered by old/web.py]
diff --git a/old/templates/old_index.html b/old/templates/old_index.html
new file mode 100644
index 0000000..96f0e86
--- /dev/null
+++ b/old/templates/old_index.html
@@ -0,0 +1,84 @@
+[84 lines of markup stripped during extraction: an earlier "Media Gallery" variant of the same template]
diff --git a/old/web.py b/old/web.py
new file mode 100644
index 0000000..c0b4d33
--- /dev/null
+++ b/old/web.py
@@ -0,0 +1,32 @@
+from flask import Flask, render_template, send_from_directory, jsonify, request
+import os
+
+app = Flask(__name__)
+media_dir = 'storysaver'
+MEDIA_PER_PAGE = 20
+
+def get_media_files(start, count):
+    media_files = []
+    for root, dirs, files in os.walk(media_dir):
+        for filename in files:
+            if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.mp4', '.mkv', '.mov')):
+                file_path = os.path.relpath(os.path.join(root, filename), media_dir)
+                media_files.append(file_path)
+    return media_files[start:start + count]
+
+@app.route('/')
+def index():
+    return render_template('index.html')
+
+@app.route('/media/<path:filename>')
+def media(filename):
+    return send_from_directory(media_dir, filename)
+
+@app.route('/load-more')
+def load_more():
+    page = int(request.args.get('page', 0))
+    media_files = get_media_files(page * MEDIA_PER_PAGE, MEDIA_PER_PAGE)
+    return jsonify(media_files)
+
+if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=5000, debug=True)
diff --git a/storysave_api.py b/storysave_api.py
new file mode 100644
index 0000000..2b763ac
--- /dev/null
+++ b/storysave_api.py
@@ -0,0 +1,26 @@
+import requests
+
+def findPost():
+    params = {'av': '17841401225494803','__a': '1','__req': '1','__hs': '19906.HYP:instagram_web_pkg.2.1..0.1','dpr': '1','__ccg': 'UNKNOWN','__rev': '1014609539','__s': 'guk60j:651i2v:pmhu0r','__hsi': '7386834689999716220','__dyn': '7xe5WwlEnwn8K2Wmm1twpUnwgU7S6EdF8aUco38w5ux609vCwjE1xoswaq0yE6u0nS4oaEd86a3a1YwBgao1aU2swbOU2zxe2GewGw9a362W2K0zEnwhEe82mwww4cwJCwLyES1TwTwFwIwbS1LwTwKG1pg2Xwr86C1mwrd6goK3ibxKi2K7ErwYCz8rwHw','__csr': 'igAzIj5OgR5YBHdRtivbkyFv-zJIZE_ykzfahdAydeHCHAAAqyk4pqBgDzeV4-qlbBF29UlCxFpVokDwAyosyV9KWUmx6iu58WqdwSDCDAFwHxi3C00lWy2FG4k583NxW8yFE0bUyxd06lxO5C2a8yFm2u290ejg1JU2Gw2rQ061U','__comet_req': '7','fb_dtsg': 'NAcPDfX2XufdLkctek6zNxz3DWxPW4t-cJzz39QtOQ5KS-_Rq3erT4A:17843708194158284:1719013044','jazoest': '26262','lsd': 'D0zmaX16yIQu_GwDXKTbMc','__spin_r': '1014609539','__spin_b': 'trunk','__spin_t': '1719881474','__jssesw': '1','fb_api_caller_class': 'RelayModern','fb_api_req_friendly_name': 'PolarisProfilePageContentDirectQuery', 'variables': '{"id":"57771591453","render_surface":"PROFILE"}','server_timestamps': 'true','doc_id': '7663723823674585'}
+
+    # The params have to be sent with the request, and the response
+    # body is JSON: the original code did neither.
+    response = requests.get('https://www.instagram.com/graphql/query', params=params)
+    data = response.json()
+
+    posts = data['data']['xdt_api__v1__feed__user_timeline_graphql_connection']['edges']
+    posts = [post['node'] for post in posts]
+
+    # Return the post with the largest available image candidate
+    return max(posts, key=lambda post: max(c['width'] * c['height'] for c in post['image_versions2']['candidates']))
+
+def getHDProfilePicture():
+    url = 'https://www.save-free.com/process'
+
+    zoom_data = {'instagram_url': 'natahalieeee', 'type': 'profile', 'resource': 'zoom'}
+
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', 'Referer': 'https://www.save-free.com/profile-downloader/'}
+
+    # Only the 'zoom' request is needed; the previous 'save' request's
+    # response was overwritten without ever being used.
+    response = requests.post(url, data=zoom_data, headers=headers)
+
+    with open('image.jpg', 'wb') as f:
+        f.write(response.content)
\ No newline at end of file
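A usage sketch for the fixed findPost() above. Note that doc_id, fb_dtsg, lsd, and jazoest in params are session-bound tokens captured from a logged-in browser session; they expire, so they would need refreshing before this returns data:

from storysave_api import findPost

post = findPost()
best = max(post['image_versions2']['candidates'], key=lambda c: c['width'] * c['height'])
print(best['url'], best['width'], best['height'])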
diff --git a/storysave_dump.py b/storysave_dump.py
new file mode 100644
index 0000000..f4d3816
--- /dev/null
+++ b/storysave_dump.py
@@ -0,0 +1,149 @@
+from BunnyCDN.Storage import Storage
+from datetime import datetime
+import os, config, funcs, cv2
+from PIL import Image
+
+
+def UploadMedia(media):
+    media_id = media['media_id']
+    username = media['username']
+    timestamp = media['timestamp']
+    user_id = media['user_id']
+    filepath = media['filepath']
+    highlight_id = media['highlight_id']
+    thumbnail_url = None
+    phash = None
+
+    if media_id and int(media_id) in existing_files:
+        print('Duplicate file detected. Removing...')
+        os.remove(filepath)
+        return True
+
+    filename = os.path.basename(filepath)
+    file_extension = os.path.splitext(filename)[1].lower()
+
+    media_type = funcs.get_media_type(filename)
+
+    post_type = funcs.determine_post_type(filepath, media_type)
+    if not post_type:
+        print(f'Error determining post type for {filename}. Skipping...')
+        return False
+
+    file_hash = funcs.calculate_file_hash(filepath)
+
+    post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()
+
+    width, height = funcs.get_video_dimensions(filepath) if media_type == 'video' else Image.open(filepath).size
+
+    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0  # slower for videos
+
+    if media_type == 'video':
+        try:
+            thumbPath = f'temp/{media_id}.jpg'
+            cap = cv2.VideoCapture(filepath)
+            ret, frame = cap.read()
+            cv2.imwrite(thumbPath, frame)
+            cap.release()
+            obj_storage.PutFile(thumbPath, f'thumbnails/{media_id}.jpg')  # network round-trip
+            thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{media_id}.jpg"
+            phash = funcs.generate_phash(thumbPath)
+            os.remove(thumbPath)
+        except Exception as e:
+            print(f'Error generating thumbnail ({e}). Skipping...')
+            return False
+    elif media_type == 'image':
+        phash = funcs.generate_phash(filepath)
+
+    newFilename = f'{media_id}{file_extension}'
+    server_path = f'media/{post_type}/{username}/{newFilename}'
+
+    file_url = f"https://storysave.b-cdn.net/{server_path}"
+
+    obj_storage.PutFile(filepath, server_path)  # the slowest step: full file upload
+
+    if highlight_id:
+        newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)", (highlight_id, user_id, media_id))
+        newDB.commit()
+        print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')
+
+    post_type = 'story' if post_type == 'stories' else 'post'
+    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
+    values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash)
+
+    newCursor.execute(query, values)
+    newDB.commit()
+    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
+
+    os.remove(filepath)
+
+    return True
+
+def get_user_id(username):
+    username = username.lower()
+    if username in existing_users:
+        return existing_users[username]
+
+    return None
+
+def get_media_data(filepath):
+    filename = os.path.basename(filepath)
+    parts = filename.split('~')
+    if len(parts) < 4:
+        return False
+
+    username = parts[0]
+    timestamp = parts[1]
+    media_id = parts[2]
+    user_id = parts[3].split('_')[-1].split('.')[0]
+
+    highlight_id = user_id.replace('highlight', '') if 'highlight' in user_id else None
+    if highlight_id:
+        user_id = get_user_id(username)
+
+    try:
+        media_id = int(media_id)
+    except ValueError:
+        print(f'Invalid media_id for file {filename}. Skipping...')
+        media_id = None
+
+    data = {'username': username, 'timestamp': timestamp, 'media_id': media_id, 'user_id': user_id, 'filepath': filepath, 'highlight_id': highlight_id}
+
+    return data
+
+def get_media(folder_path):
+    medias = []
+
+    for root, dirs, files in os.walk(folder_path):
+        for filename in files:
+            filepath = os.path.join(root, filename)
+
+            data = get_media_data(filepath)
+            if data:
+                medias.append(data)
+
+    return medias
+
+def dump_instagram(folder_path):
+    medias = get_media(folder_path)
+
+    for media in medias:
+        UploadMedia(media)
+        existing_files.add(media['media_id'])
+
+
+if __name__ == '__main__':
+    print('Starting processing...')
+
+    newDB, newCursor = config.gen_connection()
+
+    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+    newCursor.execute("SELECT media_id FROM media WHERE media_id IS NOT NULL")
+    existing_files = {row[0] for row in newCursor.fetchall()}  # set: O(1) membership checks
+
+    newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
+    existing_users = {user[0].lower(): user[1] for user in newCursor.fetchall()}
+
+    dump_instagram('storysaver/')
+
+    print("Processing completed.")
\ No newline at end of file
diff --git a/storysave_dump_new.py b/storysave_dump_new.py
new file mode 100644
index 0000000..95d8f3e
--- /dev/null
+++ b/storysave_dump_new.py
@@ -0,0 +1,137 @@
+from BunnyCDN.Storage import Storage
+from datetime import datetime
+import os, config, funcs, cv2
+from PIL import Image
+
+
+def UploadMedia(media):
+    media_id = media['media_id']
+    username = media['username']
+    post_date = media['timestamp']
+    user_id = media['user_id']
+    filepath = media['filepath']
+    highlight_id = media['highlight_id']
+    post_type = media['post_type']
+    thumbnail_url = None
+    phash = None
+
+    if media_id and int(media_id) in existing_files:
+        print('Duplicate file detected. Removing...')
+        os.remove(filepath)
+        return True
+
+    filename = os.path.basename(filepath)
+    file_extension = os.path.splitext(filename)[1].lower()
+
+    media_type = funcs.get_media_type(filename)
+
+    file_hash = funcs.calculate_file_hash(filepath)
+
+    width, height = funcs.get_video_dimensions(filepath) if media_type == 'video' else Image.open(filepath).size
+
+    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0  # slower for videos
+
+    if media_type == 'video':
+        try:
+            thumbPath = f'temp/{media_id}.jpg'
+            cap = cv2.VideoCapture(filepath)
+            ret, frame = cap.read()
+            cv2.imwrite(thumbPath, frame)
+            cap.release()
+            obj_storage.PutFile(thumbPath, f'thumbnails/{media_id}.jpg')  # network round-trip
+            thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{media_id}.jpg"
+            phash = funcs.generate_phash(thumbPath)
+            os.remove(thumbPath)
+        except Exception as e:
+            print(f'Error generating thumbnail ({e}). Skipping...')
+            return False
+    elif media_type == 'image':
+        phash = funcs.generate_phash(filepath)
+
+    if media_id:
+        newFilename = f'{media_id}{file_extension}'
+    else:
+        # No Instagram media_id available: fall back to the content hash
+        newFilename = f'{file_hash}{file_extension}'
+
+    server_path = f'media/{post_type}/{username}/{newFilename}'
+
+    file_url = f"https://storysave.b-cdn.net/{server_path}"
+
+    obj_storage.PutFile(filepath, server_path)  # the slowest step: full file upload
+
+    if highlight_id:
+        newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)", (highlight_id, user_id, media_id))
+        newDB.commit()
+        print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')
+
+    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
+    values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash)
+
+    newCursor.execute(query, values)
+    newDB.commit()
+    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
+
+    os.remove(filepath)
+
+    return True
+
+def get_user_id(username):
+    username = username.lower()
+    if username in existing_users:
+        return existing_users[username]
+
+    return None
+
+def get_media():
+    medias = []
+    post_types = {
+        'posts': 'post',
+        'stories': 'story',
+        'profile': 'profile',
+    }
+
+    for post_type in os.listdir('media'):
+        if post_type not in post_types:
+            continue
+        users = os.listdir(f'media/{post_type}')
+        for user in users:
+            user_path = f'media/{post_type}/{user}'
+            for filename in os.listdir(user_path):
+                data = {}
+                filepath = os.path.join(user_path, filename)
+
+                data['post_type'] = post_types[post_type]
+                data['username'] = user
+                if 'com.instagram.android__' in filename:
+                    # Parse the timestamp embedded after 'com.instagram.android__'
+                    # (strptime needs the string itself, not the data dict)
+                    raw_timestamp = filename.split('__')[-1].split('.')[0]
+                    data['timestamp'] = datetime.strptime(raw_timestamp, '%Y%m%d%H%M%S%f')
+                else:
+                    data['timestamp'] = datetime.now()
+                data['filepath'] = filepath
+                data['media_id'] = None
+                data['user_id'] = get_user_id(data['username'])
+                data['highlight_id'] = None
+                medias.append(data)
+
+    return medias
+
+def dump_instagram():
+    medias = get_media()
+
+    for media in medias:
+        UploadMedia(media)
+        existing_files.add(media['media_id'])
+
+
+if __name__ == '__main__':
+    print('Starting processing...')
+
+    newDB, newCursor = config.gen_connection()
+
+    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
+
+    newCursor.execute("SELECT media_id FROM media WHERE media_id IS NOT NULL")
+    existing_files = {row[0] for row in newCursor.fetchall()}  # set: O(1) membership checks
+
+    newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
+    existing_users = {user[0].lower(): user[1] for user in newCursor.fetchall()}
+
+    dump_instagram()
+
+    print("Processing completed.")
\ No newline at end of file
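For reference, the timestamp format the fixed get_media() now parses: fourteen digits of date/time plus microseconds, matched by '%Y%m%d%H%M%S%f' (the sample value below is made up):

from datetime import datetime

ts = datetime.strptime('20240701123456789012', '%Y%m%d%H%M%S%f')
print(ts)  # 2024-07-01 12:34:56.789012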
diff --git a/storysave_scanner.py b/storysave_scanner.py
new file mode 100644
index 0000000..b62f854
--- /dev/null
+++ b/storysave_scanner.py
@@ -0,0 +1,36 @@
+import os, shutil, time
+from watchdog.observers import Observer
+from watchdog.events import FileSystemEventHandler
+
+class DownloadHandler(FileSystemEventHandler):
+    def process_file(self, file_path):
+        file = os.path.basename(file_path)
+        if 'crdownload' not in file and file.count('~') == 3:
+            print(f'Moving {file}...')
+            outputPath = os.path.join('storysaver', file)
+            try:
+                shutil.move(file_path, outputPath)
+            except Exception as e:
+                print(f'Failed to move file: {e}')
+
+    def on_created(self, event):
+        if not event.is_directory and 'crdownload' not in event.src_path:
+            self.process_file(event.src_path)
+
+    def on_moved(self, event):
+        if not event.is_directory and 'crdownload' not in event.dest_path:
+            self.process_file(event.dest_path)
+
+if __name__ == "__main__":
+    downloadPath = os.path.join(os.path.expanduser('~'), 'Downloads')
+    event_handler = DownloadHandler()
+    observer = Observer()
+    observer.schedule(event_handler, downloadPath, recursive=False)
+    observer.start()
+
+    try:
+        while True:
+            time.sleep(1)  # Add a 1-second sleep to reduce CPU usage
+    except KeyboardInterrupt:
+        observer.stop()
+        observer.join()
diff --git a/storysaver.py b/storysaver.py
index e1c102f..e970fe2 100644
--- a/storysaver.py
+++ b/storysaver.py
@@ -50,9 +50,6 @@ def login(force=False):
         else:
             raise FileNotFoundError
     except (FileNotFoundError, json.JSONDecodeError):
-        # username = input("Enter your Instagram username: ")
-        # password = getpass.getpass("Enter your Instagram password: ")
-
         with open("p.enc", "rb") as encrypted_file:
             encrypted_data = encrypted_file.read()
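storysave_scanner.py moves a download as soon as watchdog reports it, which can race a browser that is still flushing the file (the crdownload check only covers Chrome's rename pattern). A common mitigation, sketched here, is to wait until the file size stops changing before moving it:

import os, time

def wait_until_stable(path, checks=3, interval=0.5):
    last_size = -1
    stable = 0
    while stable < checks:
        try:
            size = os.path.getsize(path)
        except OSError:
            return False  # file vanished or is still locked
        stable = stable + 1 if size == last_size else 0
        last_size = size
        time.sleep(interval)
    return True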
Error: {e}") def login(): @@ -67,33 +70,6 @@ def parse_media_data(media_item): return mediaInfo -def download_file(url, filePath): - try: - response = requests.get(url, stream=True, headers=headers) - response.raise_for_status() - - directory = os.path.dirname(filePath) - - if not os.path.exists(directory): - os.makedirs(directory) - - with open(filePath, "wb") as out_file: - for chunk in response.iter_content(chunk_size=8192): - out_file.write(chunk) - print(f"Downloaded {filePath}") - except Exception as e: - print(f"Failed to download {url}. Error: {e}") - - -def upload_to_storage(local_path, server_path): - try: - obj_storage = Storage("345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e", "storysave") - obj_storage.PutFile(local_path, server_path) - print(f"Uploaded to https://storysave.b-cdn.net/{server_path}") - except Exception as e: - print(f"Failed to upload {local_path} to {server_path}. Error: {e}") - - def add_media_to_db(mediaInfo): media_id = mediaInfo["media_id"] user_id = mediaInfo["user_id"] @@ -124,30 +100,22 @@ def add_media_to_db(mediaInfo): print(f"Failed to add media for {username} to the database. Error: {e}") -def insert_highlight_items(media_ids, highlight_id, title, user_id): - try: - db, cursor = config.gen_connection() - - query = "INSERT IGNORE INTO highlights (media_id, highlight_id, title, user_id) VALUES (%s, %s, %s, %s)" - - values = [(media_id, highlight_id, title, user_id) for media_id in media_ids] - cursor.executemany(query, values) - db.commit() - if cursor.rowcount > 0: - print(f"Added {cursor.rowcount} highlight items to the database.") - except Exception as e: - print(f"Failed to add highlight items to the database. Error: {e}") - +def get_profile_picture(client, user_id, username): + mediaInfo = {} + mediaInfo['mediaDir'] = 'profile' + mediaInfo['username'] = username + mediaInfo['user_id'] = user_id + mediaInfo['media_id'] = None + mediaInfo['media_type'] = 'image' + mediaInfo['post_type'] = 'profile' + mediaInfo['media_url'] = client.user_info(user_id).profile_pic_url_hd + mediaInfo['duration'] = 0 + mediaInfo['filename'] = f"{uuid4()}.jpg" -def get_video_dimensions(video_path): - cap = cv2.VideoCapture(video_path) - width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - cap.release() - return width, height + return mediaInfo -def getAllStories(client, user_id, firstImport=False): +def get_all_stories(client, user_id, firstImport=False): stories = client.user_stories(user_id) highlights = client.user_highlights(user_id) @@ -163,7 +131,7 @@ def getAllStories(client, user_id, firstImport=False): return stories -def getAllPosts(client, user_id): +def get_all_posts(client, user_id): posts = client.user_medias(user_id, 36) medias = [] @@ -187,25 +155,13 @@ if __name__ == "__main__": db, cursor = config.gen_connection() - cursor.execute("SELECT instagram_username, instagram_user_id, favorite FROM following ORDER BY id DESC;") + cursor.execute("SELECT instagram_username, instagram_user_id, favorite FROM following ORDER BY favorite DESC;") following = cursor.fetchall() - new_following = [] - for user in following: - username, user_id, favorite = user - - if bool(favorite): - new_following.insert(0, user) - else: - new_following.append(user) - - following = new_following - cursor.execute("SELECT media_id FROM media WHERE media_id IS NOT NULL;") existing_files = [media[0] for media in cursor.fetchall()] continueFromLast = input("Continue from the last user? 
(y/N): ").lower() == "y" - if continueFromLast: cursor.execute("SELECT username FROM media ORDER BY id DESC LIMIT 1;") lastUser = cursor.fetchone() @@ -218,76 +174,56 @@ if __name__ == "__main__": for user in following: - while True: - try: - firstImport = False - username, user_id, isFavorite = user - - if not user_id: - firstImport = True - user_id = client.user_id_from_username(username) - cursor.execute("UPDATE following SET instagram_user_id = %s WHERE instagram_username = %s;", (user_id, username)) - db.commit() - print(f"Updated user ID for {username} to {user_id}") - - #################### profile picture #################### - #profilePath = os.path.join('media', 'profile', username, 'profile.jpg') - #profileURL = client.user_info(user_id).profile_pic_url_hd - #download_file(profileURL, profilePath) - - #fileHash = file_hash(profilePath) - #serverPath = os.path.join(os.path.dirname(profilePath), f"{fileHash}.jpg") - - #upload_to_storage(profilePath, serverPath) - - #mediaInfo = { - # 'username': username, - # 'user_id': user_id, - # 'media_id': None, - # 'media_type': 'image', - # 'post_type': 'profile', - # 'media_url': f"https://storysave.b-cdn.net/{serverPath}", - # 'duration': 0, - # 'hash': fileHash - #} - - #add_media_to_db(mediaInfo) - #################### profile picture #################### - - allStories = getAllStories(client, user_id, firstImport) - allPosts = getAllPosts(client, user_id) - - medias = allStories + allPosts - for media in medias: - mediaInfo = parse_media_data(media) - - mediaType = "stories" if mediaInfo["post_type"] == "story" else "posts" - filePath = os.path.join('media', mediaType, username, mediaInfo['filename']) - - mediaInfo["hash"] = file_hash(filePath) - - download_file(mediaInfo['media_url'], filePath) - - if mediaInfo["media_type"] == "image": - with Image.open(filePath) as img: - mediaInfo["width"], mediaInfo["height"] = img.size - else: - mediaInfo["width"], mediaInfo["height"] = get_video_dimensions(filePath) - mediaInfo["duration"] = get_video_duration(filePath) - - upload_to_storage(filePath, filePath) - add_media_to_db(mediaInfo) - os.remove(filePath) - - existing_files.append(mediaInfo["media_id"]) - - print("=====================================") - break - except Exception as e: - if "login_required" in str(e): - print("Please log in to your account again.") - os.remove("session_data.json") - client = login() + try: + firstImport = False + username, user_id, isFavorite = user + + if not user_id: + firstImport = True + user_id = client.user_id_from_username(username) + cursor.execute("UPDATE following SET instagram_user_id = %s WHERE instagram_username = %s;", (user_id, username)) + db.commit() + print(f"Updated user ID for {username} to {user_id}") + + profile = get_profile_picture(client, user_id, username) + allStories = get_all_stories(client, user_id, firstImport) + allPosts = get_all_posts(client, user_id) + + medias = allStories + allPosts + for mediaInfo in medias: + filePath = os.path.join('media', mediaInfo['mediaDir'], username, mediaInfo['filename']) + + funcs.download_file(mediaInfo['media_url'], filePath) + + mediaInfo["hash"] = funcs.calculate_file_hash(filePath) + + if mediaInfo["media_type"] == "image": + with Image.open(filePath) as img: + mediaInfo["width"], mediaInfo["height"] = img.size else: - print("An unexpected error occurred:", e) - break \ No newline at end of file + mediaInfo["width"], mediaInfo["height"] = funcs.get_video_dimensions(filePath) + mediaInfo["duration"] = funcs.get_video_duration(filePath) + 
+                upload_to_storage(filePath, filePath)
+                add_media_to_db(mediaInfo)
+                os.remove(filePath)
+
+                existing_files.append(mediaInfo["media_id"])
+        except Exception as e:
+            if "login_required" in str(e):
+                print("Please log in to your account again.")
+                os.remove("session_data.json")
+                client = login()
+            else:
+                print("An unexpected error occurred:", e)
+
+
+#https://www.instagram.com/anya_shtril/
+#https://www.instagram.com/anyarodionov/
+#https://www.instagram.com/neomi_hanukayev/
+#https://www.instagram.com/osher_yakir/
+#https://www.instagram.com/m1ry2m_/
+#https://www.instagram.com/4m1t_f1shpot/
+#https://www.instagram.com/yarden.bengigi/
+#https://www.instagram.com/a.roniiiiii/
+#https://www.instagram.com/nonsalemwitch/
\ No newline at end of file
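The commented URLs above read like a to-import list. A hypothetical helper for seeding them into the following table used throughout these scripts (assumes instagram_username carries a unique index, which INSERT IGNORE relies on):

import config

urls = [
    'https://www.instagram.com/anya_shtril/',
    'https://www.instagram.com/anyarodionov/',
]

db, cursor = config.gen_connection()
for url in urls:
    username = url.rstrip('/').split('/')[-1]
    cursor.execute("INSERT IGNORE INTO following (instagram_username) VALUES (%s);", (username,))
db.commit()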