MASS CLEANUP

Branch: main
oscar committed 8 months ago
parent e6ad418ecd
commit 373f3ab661

@ -1,35 +0,0 @@
import os
import json
import gzip
data_dir = 'data'
data_compressed_dir = 'data_compressed'
os.makedirs(data_compressed_dir, exist_ok=True)
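# compress_file reads a JSON file from disk and hands the parsed data to compress_data,
# which writes it back out as gzip-compressed UTF-8 JSON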
def compress_file(filepath, output_file):
with open(filepath, 'r') as f:
data = json.load(f)
compress_data(data, output_file)
return output_file
def compress_data(data, output_file):
with gzip.open(output_file, 'wb') as f:
f.write(json.dumps(data).encode('utf-8'))
return output_file
data_files = os.listdir(data_dir)
for file in data_files:
if not file.endswith('.json'):
continue
filepath = f'{data_dir}/{file}'
output_file = f'{data_compressed_dir}/{file}.gz'
output_file = compress_file(filepath, output_file)
if output_file:
print(f'Compressed {file} to {output_file}')
os.remove(filepath)
else:
print(f'Failed to compress {file}')
print('Data compression completed')

@ -1,137 +0,0 @@
from BunnyCDN.Storage import Storage
from PIL import Image
import os, uuid, cv2, config
import hashlib
def clean_empty_folders(directory):
for foldername, subfolders, filenames in os.walk(directory, topdown=False):
for subfolder in subfolders:
folder_path = os.path.join(foldername, subfolder)
if not os.listdir(folder_path):
os.rmdir(folder_path)
print(f"Removed empty folder: {folder_path}")
def calculate_file_hash(file_path, hash_func='sha256'):
h = hashlib.new(hash_func)
with open(file_path, 'rb') as file:
while chunk := file.read(8192):
h.update(chunk)
return h.hexdigest()
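# StorySave filenames are expected to look like "<username>~<timestamp>~<media_id>_<user_id>.<ext>";
# the two extractors below pull those pieces apart and return None values for anything they cannot parse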
def extract_file_info(filename):
try:
username = filename.split("~")[0]
timestamp = filename.split("~")[1]
user_id = filename.split("~")[2]
media_id, some2 = user_id.split("_")
user_id = some2.split(".")[0]
return username, media_id, user_id, timestamp
except (IndexError, ValueError):
return None, None, None, None
def extract_file_info2(filename):
try:
username = filename.split("~")[0]
elements = filename.split("~")[1].split("_")
media_id, user_id = elements[0], elements[1].split(".")[0]
return username, media_id, user_id
except (IndexError, ValueError):
return None, None, None
def upload_file(filepath, username, media_id = None, media_type='image', post_type = 'story', user_id = None, date = None):
filename = os.path.basename(filepath)
file_extension = filename.split('.')[-1]
dirtype = 'stories' if post_type == 'story' else 'posts'
server_path = f'users/{dirtype}/{username}/{media_id if media_id else uuid.uuid4().hex}.{file_extension}'
file_url = f"https://storysave.b-cdn.net/{server_path}"
fileHash = calculate_file_hash(filepath)
if media_type == 'image':
with Image.open(filepath) as img:
width, height = img.size
else:
width, height = get_video_dimensions(filepath)
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, user_id, hash, date) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, media_id, post_type, user_id, fileHash, date)
newCursor.execute(query, values)
newDB.commit()
existing_files.append(media_id)
if newCursor.rowcount == 0:
print('Insert affected 0 rows - what just happened?')
obj_storage.PutFile(filepath, server_path)
os.remove(filepath)
print(f'[{newCursor.rowcount}]{os.path.basename(filepath)} {file_url}')
def get_video_dimensions(video_path):
cap = cv2.VideoCapture(video_path)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
cap.release()
return width, height
def get_media_type(filename):
if filename.lower().endswith(".jpg") or filename.lower().endswith(".webp") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png") or filename.lower().endswith(".gif"):
return 'image'
if filename.lower().endswith(".mp4") or filename.lower().endswith(".mov"):
return 'video'
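# walk the StorySave folder tree: each subfolder is treated as a username, and every file in it
# is parsed, de-duplicated against media_ids already in the database, then uploaded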
def dump_instagram(folder_path):
for root, dirs, files in os.walk(folder_path):
for folder in dirs:
username = folder
folder_path = os.path.join(root, folder)
for filename in os.listdir(folder_path):
if "~" not in filename:
continue
username, media_id, user_id, timestamp = extract_file_info(filename)
if None in [username, media_id, user_id, timestamp]:
username, media_id, user_id = extract_file_info2(filename)
if None in [username, media_id, user_id]:
print(f"Failed to extract info from {filename}")
continue
media_id = int(media_id) if media_id else None
if media_id in existing_files:
print(f'Duplicate, {filename}')
os.remove(os.path.join(folder_path, filename))
continue
filepath = os.path.join(folder_path, filename)
mediatype = get_media_type(filename)
upload_file(username=username, media_type=mediatype, filepath=filepath, media_id=media_id, user_id = user_id,)
if __name__ == '__main__':
print('Starting processing...')
newDB, newCursor = config.gen_connection()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
newCursor.execute("SELECT media_id FROM media")
existing_files = [image[0] for image in newCursor.fetchall()]
dump_instagram('StorySave/')
print("Processing completed.")

@ -1,4 +1,4 @@
from funcs import get_files
from funcs import get_files, get_media_type
from PIL import Image
import imagehash
import config
@ -25,10 +25,14 @@ def are_phashes_duplicates(phash1, phash2, threshold=5):
print(f"Error comparing pHashes: {e}")
return False
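# a match is any stored pHash within the Hamming-distance threshold; passing username=None
# compares against every account's media instead of a single user's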
def find_duplicate_phash(phash, existing_medias, threshold=5):
def get_media_by_phash(phash, username, existing_medias, threshold=5):
for media in existing_medias:
existing_phash_str = media[1]
existing_username = media[2]
if username:
if username != existing_username:
continue
# Convert stored pHash string to ImageHash object
existing_phash = imagehash.hex_to_hash(existing_phash_str)
@ -38,46 +42,39 @@ def find_duplicate_phash(phash, existing_medias, threshold=5):
return media
return None
def get_media_by_hash(hash, existing_medias):
for media in existing_medias:
existing_hash = media[1]
if hash == existing_hash:
return media
return None
def get_media_by_id(media_id, existing_medias):
for media in existing_medias:
existing_media_id = media[1]
if media_id == existing_media_id:
return media
return None
def get_data_by_filename(filename, data):
for item in data:
if filename in item['filepath']:
return item
return None
def get_image_files(directory):
return [file for file in get_files(directory) if get_media_type(file) == 'image']
def get_images_with_username(directory):
files = {}
for username in os.listdir(directory):
user_files = get_image_files(os.path.join(directory, username))
files[username] = user_files
return files
# Database connection
db, cursor = config.gen_connection()
# Fetch existing media with pHashes (assuming media are images, adjust media_type if needed)
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL AND media_id IS NULL;", ['image'])
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['image'])
existing_medias = cursor.fetchall()
directory = 'check_if_exists/' # Directory containing user images
files = [file for file in get_files(directory) if file.endswith(('.jpg', '.jpeg', '.png'))]
# Directory containing user images
directory = 'media/check_if_exists'
usernames = os.listdir(directory)
files = get_image_files(directory)
username = None
for filepath in files:
image_filename = os.path.basename(filepath)
print(f'Processing {image_filename}...')
# Generate pHash for the image
phash = generate_image_phash(filepath, hash_size=8)
if phash is None:
continue
continue # Skip this image if there's an issue
# Check if the image is a duplicate of any in the database
duplicate_media = find_duplicate_phash(phash, existing_medias)
duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
if duplicate_media:
print(f'Duplicate found: https://altpins.com/pin/{duplicate_media[0]}')
print(f'Duplicate image path: {filepath}')

@ -1,11 +1,11 @@
from funcs import get_files
from funcs import get_files, get_media_type
from PIL import Image
import imagehash
import config
import cv2
import os
def get_video_phash(filepath, hash_size=8):
def get_video_phash(filepath, hash_size=8): # Set hash_size to 8
cap = cv2.VideoCapture(filepath)
ret, frame = cap.read()
cap.release()
@ -37,43 +37,54 @@ def are_phashes_duplicates(phash1, phash2, threshold=5):
return distance <= threshold
def get_media_by_phash(phash, existing_medias, threshold=5):
def get_media_by_phash(phash, username, existing_medias, threshold=5):
for media in existing_medias:
existing_phash_str = media[1]
existing_username = media[2]
if username:
if existing_username != username:
continue
# Convert stored phash string to ImageHash object
existing_phash = imagehash.hex_to_hash(existing_phash_str)
if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
return media
return None
def get_video_files(directory):
return [file for file in get_files(directory) if get_media_type(file) == 'video']
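# build a {username: [video files]} map, treating each top-level folder as an account name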
def get_videos_with_username(directory):
videos = {}
for username in os.listdir(directory):
user_videos = get_video_files(os.path.join(directory, username))
videos[username] = user_videos
return videos
# Database connection
db, cursor = config.gen_connection()
# Directory containing user videos
directory = 'check_if_exists/' # Directory containing user images
directory = 'check_if_exists'
# Fetch existing videos with pHashes
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL;", ['video'])
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['video'])
existing_medias = cursor.fetchall()
# make a list of all video files
files = [file for file in get_files(directory) if file.endswith(('.mp4', '.avi', '.mov'))]
for filepath in files:
video_filename = os.path.basename(filepath)
videos = get_video_files(directory)
username = None
for filepath in videos:
phash = get_video_phash(filepath, hash_size=8) # Use hash_size=8
if phash is None:
continue
duplicate_media = get_media_by_phash(phash, existing_medias, threshold=5)
duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
if duplicate_media:
print(f'Duplicate url found: https://altpins.com/pin/{duplicate_media[0]}')
print(f'Duplicate video path: {filepath}')
newpath = os.path.join('duplicates', duplicate_media[2], video_filename)
newpath = filepath.replace(directory, 'duplicates')
os.makedirs(os.path.dirname(newpath), exist_ok=True)
os.rename(filepath, newpath)
print(f'Moved {filepath} to duplicates/')

@ -1,96 +0,0 @@
from funcs import get_files
from PIL import Image
import imagehash
import config
import os
def generate_image_phash(filepath, hash_size=8):
try:
# Open the image using PIL
pil_image = Image.open(filepath)
# Compute pHash using the imagehash library
phash = imagehash.phash(pil_image, hash_size=hash_size)
return phash
except Exception as e:
print(f"Error processing image {filepath}: {e}")
return None
def are_phashes_duplicates(phash1, phash2, threshold=5):
try:
# Compute the Hamming distance between the pHashes
distance = phash1 - phash2
return distance <= threshold
except TypeError as e:
print(f"Error comparing pHashes: {e}")
return False
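# Two images count as duplicates when the Hamming distance between their 64-bit pHashes
# (hash_size=8) is at most `threshold`; e.g. the hypothetical hashes '8f373714acfcf4d0' and
# '8f373714acfcf4d1' differ by a single bit and would match.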
def get_media_by_phash(phash, username, existing_medias, threshold=5):
for media in existing_medias:
existing_phash_str = media[1]
existing_username = media[2]
# if username != existing_username:
# continue
# Convert stored pHash string to ImageHash object
existing_phash = imagehash.hex_to_hash(existing_phash_str)
# Check if the current pHash is a duplicate
if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
return media
return None
def get_media_by_hash(hash, existing_medias):
for media in existing_medias:
existing_hash = media[1]
if hash == existing_hash:
return media
return None
def get_media_by_id(media_id, existing_medias):
for media in existing_medias:
existing_media_id = media[1]
if media_id == existing_media_id:
return media
return None
def get_data_by_filename(filename, data):
for item in data:
if filename in item['filepath']:
return item
return None
directory = 'media/check_if_exists' # Directory containing user images
# Database connection
db, cursor = config.gen_connection()
# Fetch existing media with pHashes (assuming media are images, adjust media_type if needed)
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['image'])
existing_medias = cursor.fetchall()
usernames = os.listdir(directory)
for username in usernames:
files = get_files(os.path.join(directory, username))
for filepath in files:
image_filename = os.path.basename(filepath)
print(f'Processing {image_filename}...')
# Generate pHash for the image
phash = generate_image_phash(filepath, hash_size=8)
if phash is None:
continue # Skip this image if there's an issue
phash_str = str(phash)
# Check if the image is a duplicate of any in the database
duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
if duplicate_media:
print(f'Duplicate found: https://altpins.com/pin/{duplicate_media[0]}')
print(f'Duplicate image path: {filepath}')
newpath = os.path.join('duplicates', duplicate_media[2], image_filename)
os.makedirs(os.path.dirname(newpath), exist_ok=True)
os.rename(filepath, newpath)
print(f'Moved {image_filename} to duplicates/')

@ -1,68 +0,0 @@
from funcs import generate_phash # Assuming this function computes the pHash and returns a string
import imagehash
import os
def get_files(directory):
# Recursively get all files in the directory
file_list = []
for root, dirs, files in os.walk(directory):
for filename in files:
file_list.append(os.path.join(root, filename))
return file_list
# Function to compute pHashes for all images in a directory
def compute_phashes(image_paths):
phash_dict = {}
for image_path in image_paths:
try:
# Compute pHash and get it as a string
phash_str = generate_phash(image_path)
# Convert the hash string to an ImageHash object
phash = imagehash.hex_to_hash(phash_str)
phash_dict[image_path] = phash
except Exception as e:
print(f"Error processing {image_path}: {e}")
return phash_dict
# Get all image files from 'ready_to_upload' and 'sorted' directories
ready_images = get_files('ready_to_upload')
ready_images = [image for image in ready_images if not image.lower().endswith('.mp4')]
sorted_images = get_files('sorted')
sorted_images = [image for image in sorted_images if not image.lower().endswith('.mp4')]
# Compute pHashes for images in 'ready_to_upload'
print("Computing pHashes for 'ready_to_upload' images...")
ready_image_phashes = compute_phashes(ready_images)
# Compute pHashes for images in 'sorted'
print("Computing pHashes for 'sorted' images...")
sorted_image_phashes = compute_phashes(sorted_images)
# Prepare the 'already_processed' directory
os.makedirs('already_processed', exist_ok=True)
# Set a Hamming distance threshold for considering images as duplicates
threshold = 5 # Adjust this value as needed
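# every 'sorted' image is compared against every 'ready_to_upload' image, so the cost is
# O(len(sorted) * len(ready)) hash comparisons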
# Find and move duplicates
for sorted_image, sorted_phash in sorted_image_phashes.items():
duplicate_found = False
for ready_image, ready_phash in ready_image_phashes.items():
# Compute Hamming distance between the two pHashes
try:
distance = sorted_phash - ready_phash
except TypeError as e:
print(f"Error comparing hashes for {sorted_image} and {ready_image}: {e}")
continue
if distance <= threshold:
# Duplicate found
newpath = sorted_image.replace('sorted', 'already_processed')
os.makedirs(os.path.dirname(newpath), exist_ok=True)
print(f"Moving {sorted_image} (duplicate of {ready_image}) to 'already_processed'")
os.rename(sorted_image, newpath)
duplicate_found = True
break # Exit the loop since a duplicate is found
if not duplicate_found:
print(f"No duplicate found for {sorted_image}")

@ -1,59 +0,0 @@
import config
# Function to find the closest perceptual hash (phash) match
def find_almost_identical_phash(phash, usernames, max_distance=1):
"""
Find a username whose phash is nearly identical to the given phash.
:param phash: The phash to compare (e.g., from the 'unknown' image).
:param usernames: List of tuples containing (username, phash).
:param max_distance: Maximum Hamming distance to consider as "identical".
:return: The matching username and phash, or None if no match is found.
"""
for candidate in usernames:
dist = hamming_distance(phash, candidate[1])
if dist <= max_distance:
return candidate
return None
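# hamming_distance below compares the stored pHashes character-by-character as strings, so the
# max_distance used above counts differing hex characters (at most 16 for a 64-bit hash), not bits.
# Example with hypothetical hashes: hamming_distance('c3d2e1f0a9b8c7d6', 'c3d2e1f0a9b8c7d7') == 1.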
def hamming_distance(phash1, phash2):
"""
Calculate the Hamming distance between two binary strings.
"""
if len(phash1) != len(phash2):
raise ValueError("Hashes must be of the same length")
return sum(c1 != c2 for c1, c2 in zip(phash1, phash2))
# Establish database connection
db, cursor = config.gen_connection()
# Fetch all images with an 'unknown' username
cursor.execute("SELECT id, username, phash FROM media WHERE username = 'unknown'")
rows = cursor.fetchall()
# Fetch all non-unknown usernames and their associated phash
cursor.execute("SELECT username, phash FROM media WHERE username != 'unknown' AND phash IS NOT NULL AND status = 'public'")
usernames = cursor.fetchall()
# Ensure there are valid usernames to compare against
if not usernames:
print("No known usernames found in the database.")
exit()
# Match each 'unknown' image against the known pHashes
for row in rows:
id = row[0]
phash = row[2]
# Find a nearly identical phash match
closest = find_almost_identical_phash(phash, usernames, max_distance=2)
if closest:
print(f"Found match for image {id}: {closest[0]} with phash {closest[1]}")
cursor.execute(
"UPDATE media SET username = %s WHERE id = %s",
(closest[0], id),
)
db.commit()
else:
print(f"No nearly identical match found for image {id}.")

@ -1,90 +0,0 @@
from funcs import get_files # Assuming this is defined elsewhere
from PIL import Image
import imagehash
import config
import os
def generate_image_phash(filepath, hash_size=8):
try:
# Open the image using PIL
pil_image = Image.open(filepath)
# Compute pHash using the imagehash library
phash = imagehash.phash(pil_image, hash_size=hash_size)
return phash
except Exception as e:
print(f"Error processing image {filepath}: {e}")
return None
def are_phashes_duplicates(phash1, phash2, threshold=5):
try:
# Compute the Hamming distance between the pHashes
distance = phash1 - phash2
return distance <= threshold
except TypeError as e:
print(f"Error comparing pHashes: {e}")
return False
def get_media_by_phash(phash, username, existing_medias, threshold=5):
for media in existing_medias:
existing_phash_str = media[1]
# existing_username = media[2]
# if existing_username != username:
# continue # Only compare with the same user's media
# Convert stored pHash string to ImageHash object
existing_phash = imagehash.hex_to_hash(existing_phash_str)
# Check if the current pHash is a duplicate
if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
return media
return None
# Database connection
db, cursor = config.gen_connection()
directory = 'check_if_exists' # Directory containing user images
# Fetch existing media with pHashes (assuming media are images, adjust media_type if needed)
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['image'])
existing_medias = cursor.fetchall()
existing_phashes = [media[1] for media in existing_medias]
# Go through the directory folder where each subfolder is a username
users = os.listdir(directory)
for username in users:
user_images_path = os.path.join(directory, username)
if not os.path.isdir(user_images_path):
continue # Skip non-directory files
# Get all images for the current user
images = get_files(user_images_path) # Assuming this gets all image files
for filepath in images:
image_filename = os.path.basename(filepath)
print(f'Processing {image_filename}...')
# Generate pHash for the image
phash = generate_image_phash(filepath, hash_size=8)
if phash is None:
continue # Skip this image if there's an issue
phash_str = str(phash)
if phash_str not in existing_phashes:
print(f'No duplicate found for {image_filename}')
continue
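# NOTE: this exact-match pre-filter means only identical pHash strings ever reach the fuzzy
# check below, so the Hamming threshold of 5 is effectively unused in this script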
# Check if the image is a duplicate of any in the database
duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
if duplicate_media:
found_username = duplicate_media[2]
print(f'Duplicate found: https://altpins.com/pin/{duplicate_media[0]}')
print(f'Duplicate image path: {filepath}')
newpath = os.path.join('duplicates', found_username, image_filename)
os.makedirs(os.path.dirname(newpath), exist_ok=True)
os.rename(filepath, newpath)
print(f'Moved {image_filename} to duplicates/')

@ -1,87 +0,0 @@
from PIL import Image
import imagehash
import config
import cv2
import os
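# the pHash of a video is taken from its first frame only, resized to 320x240 to normalise
# resolution, so clips that share an identical opening frame will collide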
def generate_thumbnail_phash(filepath, hash_size=8): # Set hash_size to 8
cap = cv2.VideoCapture(filepath)
ret, frame = cap.read()
cap.release()
if not ret:
print(f"Error reading frame from {filepath}")
return None
# Resize frame to a standard size
standard_size = (320, 240)
resized_frame = cv2.resize(frame, standard_size, interpolation=cv2.INTER_AREA)
# Convert OpenCV image (BGR) to PIL Image (RGB)
image_rgb = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(image_rgb)
# Compute pHash
phash = imagehash.phash(pil_image, hash_size=hash_size)
return phash
def are_phashes_duplicates(phash1, phash2, threshold=5):
# Compute Hamming distance between the pHashes
try:
distance = phash1 - phash2
except TypeError as e:
print(f"Error comparing pHashes: {e}")
return False
return distance <= threshold
def get_media_by_phash(phash, username, existing_medias, threshold=5):
for media in existing_medias:
existing_phash_str = media[1]
existing_username = media[2]
if existing_username != username:
continue
# Convert stored phash string to ImageHash object
existing_phash = imagehash.hex_to_hash(existing_phash_str)
if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
return media
return None
# Database connection
db, cursor = config.gen_connection()
# Directory containing user videos
directory = 'check_if_exists'
# Fetch existing videos with pHashes
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['video'])
existing_medias = cursor.fetchall()
users = os.listdir(directory) # Assuming 'check_if_exists' contains user videos
for username in users:
user_videos_path = os.path.join(directory, username)
if not os.path.isdir(user_videos_path):
continue
videos = [video for video in os.listdir(user_videos_path) if video.endswith(('.mp4', '.avi', '.mov'))]
for video in videos:
print(f'Processing {video}...')
filepath = os.path.join(user_videos_path, video)
phash = generate_thumbnail_phash(filepath, hash_size=8) # Use hash_size=8
if phash is None:
continue
phash_str = str(phash)
duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
if duplicate_media:
print(f'Duplicate url found: https://altpins.com/pin/{duplicate_media[0]}')
print(f'Duplicate video path: {filepath}')
newpath = filepath.replace(directory, 'duplicates')
os.makedirs(os.path.dirname(newpath), exist_ok=True)
os.rename(filepath, newpath)
print(f'Moved {video} to duplicates/')

@ -1,58 +0,0 @@
from funcs import generate_phash
import os
def find_duplicates(source_dir, target_dir, extensions, max_distance):
"""Remove duplicates in target_dir that are found in source_dir based on Hamming distance."""
source_files = {}
target_files = {}
# Helper function to filter files by extension
def filter_files(files):
return [f for f in files if os.path.splitext(f)[1].lower() in extensions]
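# pass 1 hashes everything in source_dir; pass 2 hashes target_dir and moves any file whose
# pHash is within max_distance of a source hash into duplicates/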
# Build hash map of source directory
for dirpath, _, filenames in os.walk(source_dir):
for filename in filter_files(filenames):
filepath = os.path.join(dirpath, filename)
filehash = generate_phash(filepath, str=False)
if filehash:
source_files[filehash] = filepath
# Build hash map of target directory and compare
for dirpath, _, filenames in os.walk(target_dir):
for filename in filter_files(filenames):
filepath = os.path.join(dirpath, filename)
filehash = generate_phash(filepath, str=False)
if not filehash:
continue
# Check if this file is similar to any of the source files
is_duplicate = False
for source_hash in source_files.keys():
distance = filehash - source_hash # Hamming distance
if distance <= max_distance:
is_duplicate = True
break # Found a duplicate
if is_duplicate:
newpath = os.path.join('duplicates', filename)
os.makedirs(os.path.dirname(newpath), exist_ok=True)
os.rename(filepath, newpath)
print(f"Moved duplicate: {filepath} to duplicates/ (distance: {distance})")
else:
target_files[filehash] = filepath
if __name__ == '__main__':
# Paths to the directories
source_dir = 'D:/Crawlers/media/Coomer/sadierayxo'
target_dir = 'sorted/sadierayxo'
# List of accepted extensions
extensions = {'.png', '.jpg', '.jpeg', '.webp', '.gif'}
# Maximum Hamming distance to consider as duplicates
MAX_DISTANCE = 5 # Adjust this threshold as needed
find_duplicates(source_dir, target_dir, extensions, MAX_DISTANCE)
print("Duplicate removal process completed.")

@ -1,110 +0,0 @@
from BunnyCDN.Storage import Storage
from PIL import Image
import os, uuid, cv2, config
def scan_dupes(folder_path):
for root, dirs, files in os.walk(folder_path):
for folder in dirs:
folder_path = os.path.join(root, folder)
for filename in os.listdir(folder_path):
media_id = filename.replace('.mp4', '').replace('.jpg', '')
filepath = os.path.join(folder_path, filename)
if media_id:
try:
if int(media_id) in existing_files:
print(f'Duplicate')
os.remove(filepath)
except:
print(f'Error: {filepath}')
def clean_empty_folders(directory):
for foldername, subfolders, filenames in os.walk(directory, topdown=False):
for subfolder in subfolders:
folder_path = os.path.join(foldername, subfolder)
if not os.listdir(folder_path):
os.rmdir(folder_path)
print(f"Removed empty folder: {folder_path}")
def upload_file(filepath, username, media_id = None, media_type='image', post_type = 'story'):
filename = os.path.basename(filepath)
file_extension = filename.split('.')[-1]
try:
if int(media_id) in existing_files:
print(f'Duplicate')
os.remove(filepath)
return True
except (TypeError, ValueError): media_id = uuid.uuid4().hex
dirtype = 'stories' if post_type == 'story' else 'posts'
server_path = f'users/{dirtype}/{username}/{media_id}.{file_extension}'
obj_storage.PutFile(filepath, server_path)
file_url = f"https://storysave.b-cdn.net/{server_path}"
if media_type == 'image':
with Image.open(filepath) as img:
width, height = img.size
else:
width, height = get_video_dimensions(filepath)
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type) VALUES (%s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, media_id, post_type)
newCursor.execute(query, values)
newDB.commit()
os.remove(filepath)
print(f'[{newCursor.rowcount}]{os.path.basename(filepath)} {file_url}')
def get_video_dimensions(video_path):
cap = cv2.VideoCapture(video_path)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
cap.release()
return width, height
def get_media_type(filename):
if filename.lower().endswith(".jpg") or filename.lower().endswith(".webp") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png") or filename.lower().endswith(".gif"):
return 'image'
if filename.lower().endswith(".mp4") or filename.lower().endswith(".mov"):
return 'video'
def dump_instagram(folder_path):
for root, dirs, files in os.walk(folder_path):
for folder in dirs:
username = folder
folder_path = os.path.join(root, folder)
post_type = 'story' if 'stories' in folder_path.replace('\\', '/').split('/') else 'post'
for filename in os.listdir(folder_path):
media_id = filename.replace('.mp4', '').replace('.jpg', '')
filepath = os.path.join(folder_path, filename)
mediatype = get_media_type(filename)
upload_file(username=username, media_type=mediatype, filepath=filepath, media_id=media_id, post_type=post_type)
if __name__ == '__main__':
print('Starting processing...')
newDB, newCursor = config.gen_connection()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
newCursor.execute("SELECT media_id FROM media")
existing_files = [image[0] for image in newCursor.fetchall()]
dump_instagram('media/posts')
dump_instagram('media/stories')
scan_dupes('media/posts')
scan_dupes('media/stories')
clean_empty_folders('media/posts')
clean_empty_folders('media/stories')
print("Processing completed.")

@ -1,110 +0,0 @@
from BunnyCDN.Storage import Storage
import os, uuid, config, funcs, cv2
from datetime import datetime
from PIL import Image
def dump_facebook(folder_path):
for filename in os.listdir(folder_path):
if os.path.isdir(os.path.join(folder_path, filename)):
continue
username = filename.split("'")[0]
filepath = os.path.join(folder_path, filename)
mediatype = funcs.get_media_type(filename)
post_type = funcs.determine_post_type(filepath, mediatype)
upload_file(username=username, media_type=mediatype, filepath=filepath, post_type=post_type)
for folder in os.listdir(folder_path):
if os.path.isdir(os.path.join(folder_path, folder)):
username = folder
for filename in os.listdir(os.path.join(folder_path, folder)):
filepath = os.path.join(folder_path, folder, filename)
mediatype = funcs.get_media_type(filename)
post_type = funcs.determine_post_type(filepath, mediatype)
upload_file(username=username, media_type=mediatype, filepath=filepath, post_type=post_type)
def upload_file(filepath, username, media_type='image', post_type='story', timestamp=None, user_id=None):
filename = os.path.basename(filepath)
file_extension = os.path.splitext(filename)[1].lower()
file_hash = funcs.calculate_file_hash(filepath)
if file_hash in existing_files:
print('Duplicate file detected. Removing...')
os.remove(filepath)
return False
duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0
if "FB_IMG" in filename: media_id = filename.split("_")[2].split(".")[0]
else: media_id = uuid.uuid4().hex
dirtype = funcs.determine_post_type(filepath, media_type)
server_path = f'media/{dirtype}/{username}/{media_id}{file_extension}' # forward slashes so the CDN path stays valid on Windows
obj_storage.PutFile(filepath, server_path)
file_url = f"https://storysave.b-cdn.net/{server_path}"
if media_type == 'image':
with Image.open(filepath) as img:
width, height = img.size
else:
width, height = funcs.get_video_dimensions(filepath)
thumbnail_url = None
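# for videos, grab the first frame as a thumbnail: write it to temp/<media_id>.jpg (the temp/
# folder must already exist), upload it under thumbnails/, and keep the CDN URL for the row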
if media_type == 'video':
thumbPath = f'temp/{media_id}.jpg'
cap = cv2.VideoCapture(filepath)
ret, frame = cap.read()
cv2.imwrite(thumbPath, frame)
cap.release()
obj_storage.PutFile(thumbPath, f'thumbnails/{media_id}.jpg')
thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{media_id}.jpg"
post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()
if post_type == 'stories':
post_type = 'story'
else:
post_type = 'post'
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, user_id, platform, hash, filename, duration, thumbnail) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, post_type, post_date, user_id, 'facebook', file_hash, filename, duration, thumbnail_url)
try:
newCursor.execute(query, values)
newDB.commit()
print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
except Exception as e:
print(f"Database error: {e}")
return False
try:
if newCursor.rowcount > 0:
os.remove(filepath)
except Exception as e:
print(f"Failed to remove local file {filepath}: {e}")
return True
if __name__ == '__main__':
print('Starting processing...')
newDB, newCursor = config.gen_connection()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
newCursor.execute("SELECT hash FROM media WHERE platform='facebook' AND hash IS NOT NULL")
existing_files = [image[0] for image in newCursor.fetchall()]
dump_facebook('facebook/')
print("Processing completed.")

@ -1,82 +0,0 @@
from BunnyCDN.Storage import Storage
from datetime import datetime
import os, config, funcs
from PIL import Image
def dump_instagram(folder_path):
for filename in os.listdir(folder_path):
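# filenames follow "<username>_<timestamp>_<user_id>.<ext>"; the username itself may contain
# underscores, so everything except the last two underscore-separated parts is the username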
parts = filename.split('_')
try:
username = '_'.join(parts[:-2]) # Join all except last two
timestamp = int(parts[-2]) # Second last is timestamp
user_id = int(parts[-1].split('.')[0]) # Last part before extension is user_id
except Exception as e:
print(f"Invalid filename: {filename}. Error: {e}")
continue
filepath = os.path.join(folder_path, filename)
mediatype = funcs.get_media_type(filename)
post_type = funcs.determine_post_type(filepath, mediatype)
UploadMedia(username=username, media_type=mediatype, filepath=filepath, post_type=post_type, timestamp=timestamp, user_id=user_id)
def UploadMedia(filepath, username, media_type='image', post_type='story', timestamp=None, user_id=None):
if 'tero' in username:
pass
filename = os.path.basename(filepath)
file_extension = os.path.splitext(filename)[1].lower()
file_hash = funcs.calculate_file_hash(filepath)
duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0
post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()
dirtype = funcs.determine_post_type(filepath, media_type)
server_path = f'media/{dirtype}/{username}/{file_hash}{file_extension}'
file_url = f"https://storysave.b-cdn.net/{server_path}"
if file_hash in existing_files:
print('Duplicate file detected. Removing...')
os.remove(filepath)
return True
obj_storage.PutFile(filepath, server_path)
if media_type == 'image':
with Image.open(filepath) as img:
width, height = img.size
else:
width, height = funcs.get_video_dimensions(filepath)
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, user_id, hash, filename, duration) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, post_type, post_date, user_id, file_hash, filename, duration)
newCursor.execute(query, values)
newDB.commit()
print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
os.remove(filepath)
return True
if __name__ == '__main__':
print('Starting processing...')
newDB, newCursor = config.gen_connection()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
newCursor.execute("SELECT hash FROM media WHERE platform='instagram' AND hash IS NOT NULL")
existing_files = [image[0] for image in newCursor.fetchall()]
dump_instagram('storysaver/missingdata/')
print("Processing completed.")

@ -1,67 +0,0 @@
from BunnyCDN.Storage import Storage
import os, uuid, config, funcs
from datetime import datetime
from PIL import Image
def dump_tiktok(folder_path):
for folder in os.listdir(folder_path):
if os.path.isdir(os.path.join(folder_path, folder)):
username = folder
for filename in os.listdir(os.path.join(folder_path, folder)):
filepath = os.path.join(folder_path, folder, filename)
upload_file(username=username, filepath=filepath)
def upload_file(filepath, username):
filename = os.path.basename(filepath)
media_id = filename.split('.')[0]
file_extension = os.path.splitext(filename)[1].lower()
media_type = funcs.get_media_type(filename)
file_hash = funcs.calculate_file_hash(filepath)
# check for duplicates before uploading, so a file we are about to discard is never pushed to storage
if file_hash in existing_files:
print('Duplicate file detected. Removing...')
os.remove(filepath)
return False
duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0
width, height = funcs.get_video_dimensions(filepath) if media_type == 'video' else Image.open(filepath).size
dirtype = funcs.determine_post_type(filepath, media_type)
server_path = f'media/{dirtype}/{username}/{media_id}{file_extension}' # forward slashes so the CDN path stays valid on Windows
obj_storage.PutFile(filepath, server_path)
file_url = f"https://storysave.b-cdn.net/{server_path}"
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, platform, hash, filename, duration, media_id) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, 'tiktok', file_hash, filename, duration, media_id)
newCursor.execute(query, values)
newDB.commit()
print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
if newCursor.rowcount > 0:
os.remove(filepath)
return True
if __name__ == '__main__':
print('Starting processing...')
newDB, newCursor = config.gen_connection()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
newCursor.execute("SELECT hash FROM media WHERE platform='tiktok' AND hash IS NOT NULL")
existing_files = [image[0] for image in newCursor.fetchall()]
dump_tiktok('tiktok/')
print("Processing completed.")

@ -1,32 +0,0 @@
import os, funcs
from funcs import generate_phash
def get_username(image, ready_images):
for ready_image in ready_images:
if os.path.basename(image) in ready_image:
ready_image = ready_image.replace('\\', '/')
return ready_image.split('/')[1]
return None
ready_images = funcs.get_files('ready_to_upload')
ready_images = [image for image in ready_images if not image.endswith('.mp4')]
sorted_images = funcs.get_files('sorted')
sorted_images = [image for image in sorted_images if not image.endswith('.mp4')]
os.makedirs('already_processed', exist_ok=True)
for image in sorted_images:
image = image.replace('\\', '/')
username = image.split('/')[1]
filename = os.path.basename(image)
for ready_image in ready_images:
if filename in ready_image:
username = get_username(image, ready_images)
newpath = ready_image.replace('ready_to_upload', 'already_processed')
os.makedirs(os.path.dirname(newpath), exist_ok=True)
print(f'Moving {image} which is a match for {ready_image} to already_processed')
os.rename(image, newpath)
print(f'Moved {ready_image} to already_processed')
break

@ -1,11 +1,9 @@
from BunnyCDN.Storage import Storage
import config, os, cv2
from concurrent.futures import ThreadPoolExecutor
# this script will take a screenshot of the first frame of each video and upload it as a thumbnail to BunnyCDN
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
obj_storage = config.get_storage()
db, cursor = config.gen_connection()
cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'video' AND thumbnail IS NULL and status = 'public';")
@ -30,7 +28,7 @@ def DownloadFile(serverPath, cacheDir):
def ImportMedias():
with ThreadPoolExecutor(max_workers=10) as executor:
for video in results:
serverPath = video[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
serverPath = video[2].replace("https://cdn.altpins.com/", '').replace('//', '/').replace('\\', '/')
executor.submit(DownloadFile, serverPath, cacheDir)
@ -41,7 +39,7 @@ for result in results:
mediaURL = result[2]
extension = mediaURL.split('.')[-1]
serverPath = result[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
serverPath = result[2].replace("https://cdn.altpins.com/", '').replace('//', '/').replace('\\', '/')
localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))
@ -52,7 +50,7 @@ for result in results:
cv2.imwrite('thumbnail.jpg', frame)
cap.release()
thumbnailURL = f"https://storysave.b-cdn.net/thumbnails/{itemID}.jpg"
thumbnailURL = f"https://cdn.altpins.com/thumbnails/{itemID}.jpg"
obj_storage.PutFile('thumbnail.jpg', f'thumbnails/{itemID}.jpg')

@ -1,56 +0,0 @@
from BunnyCDN.Storage import Storage
import os, config, requests
from moviepy.editor import VideoFileClip
def get_media_type(filename):
image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".webp"}
video_extensions = {".mp4", ".mov"}
extension = os.path.splitext(filename.lower())[1]
if extension in image_extensions:
return 'image'
elif extension in video_extensions:
return 'video'
else:
return 'unknown'
def determine_post_type(media_type):
# Assuming the post type is directly based on media type.
return media_type
def get_video_dimensions(filepath):
with VideoFileClip(filepath) as clip:
width, height = clip.size
return width, height
def download_file(url):
local_filename = url.split('/')[-1]
# Note: Stream=True to avoid loading the whole file into memory
with requests.get(url, stream=True) as r:
r.raise_for_status()
with open(local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
return local_filename
if __name__ == '__main__':
newDB, newCursor = config.gen_connection()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
posts = open('fucked', 'r')
for item in posts:
username, url = item.strip().split('~')
media_id = url.split('/')[-1].split('.')[0]
media_type = get_media_type(url)
query = "INSERT IGNORE INTO media (username, media_type, platform, media_url) VALUES (%s, %s, %s, %s)"
values = (username, media_type, 'facebook', url)
try:
newCursor.execute(query, values)
newDB.commit()
print(f'[{newCursor.rowcount}] records updated.{url}')
except Exception as e:
print(f"Database error: {e}")
posts.close()

@ -1,40 +0,0 @@
import config, os, json
from PIL import Image
import imagehash
def find_file(filename, directory):
filename = filename.lower().split('.')[0]
for root, dirs, files in os.walk(directory):
for file in files:
if filename in file.lower():
return os.path.join(root, file)
return None
def generate_phash(image_path):
image = Image.open(image_path)
return str(imagehash.phash(image))
count = 0
cacheDir = 'sorted'
dataPath = 'pins.json'
os.makedirs(cacheDir, exist_ok=True)
medias = json.load(open(dataPath))
for item in medias:
count += 1
filepath = item['filepath']
if os.path.exists(filepath):
continue
newfilepath = find_file(os.path.basename(filepath), cacheDir)
if newfilepath:
print(f"Found file {newfilepath} for {filepath}")
item['filepath'] = newfilepath
with open(dataPath, 'w') as f:
json.dump(medias, f)

@ -1,28 +0,0 @@
import os, json
from funcs import generate_phash
count = 0
cacheDir = '_sort'
dataPath = 'pins.json'
os.makedirs(cacheDir, exist_ok=True)
medias = json.load(open(dataPath))
for item in medias:
count += 1
if item['type'] == 'image':
filepath = item['filepath']
if 'phash' in item:
print(f"Skipping {count}/{len(medias)}: already processed.")
continue
if not os.path.exists(filepath):
print(f"File {filepath} does not exist, skipping.")
continue
phash = generate_phash(filepath)
item['phash'] = phash
print(f"Processed {count}/{len(medias)}: with pHash {phash}")
with open(dataPath, 'w') as f:
json.dump(medias, f)

@ -1,19 +0,0 @@
import config, storysave_api
db, cursor = config.gen_connection()
usernames = []
with open('usernames.txt', 'r') as f:
for line in f:
usernames.append(line.strip())
for username in usernames:
print(f"Username: {username}")
user_id = storysave_api.get_user_id(username)
# Update the user_id in the database
cursor.execute("UPDATE media SET user_id = %s WHERE username = %s AND user_id IS NULL;", [user_id, username])
db.commit()
print(f"[{cursor.rowcount}] Updated user_id for {username}")

@ -1,94 +0,0 @@
from BunnyCDN.Storage import Storage
from moviepy.editor import VideoFileClip
import config
import hashlib
import requests
import os
def file_hash_from_url(url, hash_algo='sha256'):
h = hashlib.new(hash_algo)
response = requests.get(url, stream=True)
if response.status_code == 200:
for chunk in response.iter_content(8192):
h.update(chunk)
return h.hexdigest()
else:
raise Exception(f"Failed to download file: Status code {response.status_code}")
def get_video_duration(file_path):
"""
Returns the duration of the video file in seconds.
:param file_path: Path to the video file
:return: Duration in seconds
"""
try:
with VideoFileClip(file_path) as video:
return video.duration
except:
return 0
def file_hash(filename, hash_algo='sha256'):
"""
Compute the hash of a file.
:param filename: Path to the file.
:param hash_algo: Hashing algorithm to use (e.g., 'sha256', 'md5').
:return: Hexadecimal hash string.
"""
# Create a hash object
h = hashlib.new(hash_algo)
# Open the file in binary mode and read in chunks
with open(filename, 'rb') as file:
while chunk := file.read(8192):
h.update(chunk)
# Return the hexadecimal digest of the hash
return h.hexdigest()
# the hashes of the images are different because of the optimizer
#obj_storage = Storage('577cb82d-8176-4ccf-935ce0a574bf-fe4c-4012', 'altpins')
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
db, cursor = config.gen_connection()
cursor.execute("SELECT id, media_id, media_url FROM media WHERE duration = 0 AND media_type = 'video' AND status != 'deleted';")
results = cursor.fetchall()
count = 0
print(f"Found {len(results)} files to process.")
cacheDir = 'cache'
for result in results:
count += 1
videoID = result[0]
mediaID = result[1]
mediaURL = result[2]
extension = mediaURL.split('.')[-1]
serverPath = result[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))
if os.path.exists(localFilePath):
print(f"File already exists: {localFilePath}")
else:
obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)
duration = get_video_duration(localFilePath)
if duration == 0:
print(f"Failed to get duration for {localFilePath}")
continue
if duration < 1:
duration = 1
cursor.execute("UPDATE media SET duration = %s WHERE id = %s;", (duration, result[0]))
db.commit()
print(f"[{count}/{len(results)}] {result[1]}: {duration}, {cursor.rowcount}")

@ -1,47 +0,0 @@
from BunnyCDN.Storage import Storage
import config
import hashlib
import os
def file_hash(filename, hash_algo='sha256'):
"""
Compute the hash of a file.
:param filename: Path to the file.
:param hash_algo: Hashing algorithm to use (e.g., 'sha256', 'md5').
:return: Hexadecimal hash string.
"""
h = hashlib.new(hash_algo)
with open(filename, 'rb') as file:
while chunk := file.read(8192):
h.update(chunk)
return h.hexdigest()
#obj_storage = Storage('577cb82d-8176-4ccf-935ce0a574bf-fe4c-4012', 'altpins')
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
db, cursor = config.gen_connection()
cursor.execute("SELECT id, media_id, media_url FROM media WHERE hash IS NULL;")
results = cursor.fetchall()
count = 0
print(f"Found {len(results)} files to process.")
for result in results:
count += 1
serverPath = result[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
localFilePath = os.path.join(os.getcwd(), 'temp', os.path.basename(serverPath))
if not os.path.exists(localFilePath):
obj_storage.DownloadFile(storage_path=serverPath, download_path=os.path.join(os.getcwd(), 'temp'))
filehash = file_hash(localFilePath)
cursor.execute("UPDATE media SET hash = %s WHERE id = %s;", (filehash, result[0]))
db.commit()
print(f"[{count}/{len(results)}] {result[1]}: {filehash}, {cursor.rowcount}")

@ -1,47 +0,0 @@
from BunnyCDN.Storage import Storage
import config, os, funcs
from PIL import Image
# the hashes of the images are different because of the optimizer
#obj_storage = Storage('577cb82d-8176-4ccf-935ce0a574bf-fe4c-4012', 'altpins')
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
db, cursor = config.gen_connection()
cursor.execute("SELECT id, media_id, media_url FROM media WHERE width = 0;")
results = cursor.fetchall()
count = 0
print(f"Found {len(results)} files to process.")
cacheDir = 'cache'
for result in results:
count += 1
videoID = result[0]
mediaID = result[1]
mediaURL = result[2]
extension = mediaURL.split('.')[-1]
serverPath = result[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))
if os.path.exists(localFilePath):
print(f"File already exists: {localFilePath}")
else:
obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)
mediaType = funcs.get_media_type(localFilePath)
if mediaType == 'image':
with Image.open(localFilePath) as img:
width, height = img.size
elif mediaType == 'video':
width, height = funcs.get_video_dimensions(localFilePath)
cursor.execute("UPDATE media SET width = %s, height=%s WHERE id = %s;", (width, height, videoID))
db.commit()
print(f"[{count}/{len(results)}] width: {width}, height: {height} {cursor.rowcount}")

@ -1,32 +0,0 @@
import config
import os
temp_directory = "cache"
os.makedirs(temp_directory, exist_ok=True)
obj_storage = config.get_storage()
db, cursor = config.gen_connection()
cursor.execute("SELECT id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()
count = 0
print(f"Found {len(results)} files to process.")
for result in results:
count += 1
id, media_url = result
serverPath = media_url.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
localFilePath = os.path.join(os.getcwd(), temp_directory, os.path.basename(serverPath))
if not os.path.exists(localFilePath):
continue
file_size = os.path.getsize(localFilePath)
cursor.execute("UPDATE media SET file_size = %s WHERE id = %s;", (file_size, id))
db.commit()
print(f"[{count}/{len(results)}] {media_url}: {file_size}, {cursor.rowcount}")

@ -1,36 +0,0 @@
import config
from funcs import generate_phash
count = 0
storage = config.get_storage()
db, cursor = config.gen_connection()
generate_for = 'media_url'
media_type = 'image'
cursor.execute(f"SELECT id, {generate_for} FROM media WHERE media_type = %s AND phash IS NULL;", [media_type])
medias = cursor.fetchall()
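# backfill each missing pHash: download the original from the CDN into temp/, hash it locally,
# then write the result back to the row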
for item in medias:
count += 1
itemID = item[0]
media_url = item[1]
server_path = media_url.replace('https://storysave.b-cdn.net/', '').replace('\\', '/')
filepath = storage.DownloadFile(server_path, 'temp')
if not filepath:
print(f"Error downloading {server_path}")
continue
phash = generate_phash(filepath)
if not phash:
print(f"Error generating pHash for {filepath}")
continue
cursor.execute("UPDATE media SET phash = %s WHERE id = %s", [phash, itemID])
db.commit()
print(f"[{cursor.rowcount}] Processed {count}/{len(medias)}: with pHash {phash}")

@ -1,39 +0,0 @@
import config, os
from funcs import generate_phash
db, cursor = config.gen_connection()
cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'image' AND phash = 0;")
results = cursor.fetchall()
count = 0
cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)
print(f"Found {len(results)} files to process.")
for result in results:
count += 1
itemID = result[0]
mediaID = result[1]
if not mediaID:
print(f"Media ID is null, skipping.")
continue
mediaURL = result[2]
serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))
if not os.path.exists(localFilePath):
print(f"File {localFilePath} does not exist, skipping.")
continue
phash = generate_phash(localFilePath)
if not phash:
print(f"Error generating pHash for {localFilePath}, skipping.")
continue
cursor.execute("UPDATE media SET phash = %s WHERE id = %s", (phash, itemID))
db.commit()
print(f"Processed {count}/{len(results)}: {mediaID} with pHash {phash}")

@ -1,74 +0,0 @@
import config, os, threading, queue
from funcs import generate_phash
# Initialize database connection
db, cursor = config.gen_connection()
# Query the media table for unprocessed images
cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'image' AND phash = '0';")
results = cursor.fetchall()
# Setup cache directory
cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)
print(f"Found {len(results)} files to process.")
# Thread-safe queue for processed media
processed_media_queue = queue.Queue()
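# the main loop below only generates pHashes and enqueues (id, phash) pairs; a single
# background thread drains the queue and performs the database UPDATEs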
def process_media():
"""Thread function to update database with processed pHash values."""
while True:
try:
item = processed_media_queue.get(timeout=10) # Timeout prevents infinite blocking
if item is None: # Sentinel value to exit the loop
break
itemID, phash = item
cursor.execute("UPDATE media SET phash = %s WHERE id = %s", (phash, itemID))
db.commit()
print(f"Updated database for ID {itemID} with pHash {phash}.")
except queue.Empty:
continue
# Start the database update thread
update_thread = threading.Thread(target=process_media, daemon=True)
update_thread.start()
# Main processing loop for generating pHash
count = 0
for result in results:
count += 1
itemID = result[0]
mediaID = result[1]
if not mediaID:
print(f"Media ID is null, skipping.")
continue
mediaURL = result[2]
serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))
if not os.path.exists(localFilePath):
print(f"File {localFilePath} does not exist, skipping.")
continue
phash = generate_phash(localFilePath)
if not phash:
print(f"Error generating pHash for {localFilePath}, skipping.")
continue
# Add the processed media to the queue
processed_media_queue.put((itemID, phash))
print(f"Processed {count}/{len(results)}: {mediaID} with pHash {phash}")
# Signal the update thread to stop
processed_media_queue.put(None)
# Wait for the update thread to finish
update_thread.join()
print("Processing completed.")

@ -1,43 +0,0 @@
import os
import json
import config
# Establish database connection
db, cursor = config.gen_connection()
# Fetch rows with file_size = 0
cursor.execute("SELECT id, media_id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()
cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)
print(f"Found {len(results)} files to process.")
update_data = []
for result in results:
itemID = result[0]
media_id = result[1]
if not media_id:
print(f"Media ID is null for ID {itemID}, skipping.")
continue
mediaURL = result[2]
serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))
if not os.path.exists(localFilePath):
print(f"File {localFilePath} does not exist for ID {itemID}, skipping.")
continue
file_size = os.path.getsize(localFilePath)
update_data.append({"id": itemID, "file_size": file_size})
# Save the results to a JSON file
output_file = "update_data.json"
with open(output_file, 'w') as f:
json.dump(update_data, f, indent=4)
print(f"Saved {len(update_data)} updates to {output_file}.")
cursor.close()
db.close()

File diff suppressed because it is too large

@ -1,29 +0,0 @@
import json
import config
# Establish database connection
db, cursor = config.gen_connection()
# Load update data from the JSON file
input_file = "update_data.json"
with open(input_file, 'r') as f:
update_data = json.load(f)
print(f"Loaded {len(update_data)} records to update.")
# Process each record one by one
for count, item in enumerate(update_data, start=1):
item_id = item["id"]
file_size = item["file_size"]
try:
cursor.execute("UPDATE media SET file_size = %s WHERE id = %s", (file_size, item_id))
db.commit()
print(f"Processed {count}/{len(update_data)}: ID {item_id} updated with file size {file_size}.")
except Exception as e:
print(f"Error updating ID {item_id}: {e}")
db.rollback()
print("All updates completed.")
cursor.close()
db.close()

@ -1,31 +0,0 @@
from BunnyCDN.Storage import Storage
import config, os
db, cursor = config.gen_connection()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
cursor.execute("SELECT id, media_id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()
print(f"Found {len(results)} files to process.")
cacheDir = 'cache'
for result in results:
itemID = result[0]
mediaURL = result[2]
serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))
if not os.path.exists(localFilePath):
continue
file_size = os.path.getsize(localFilePath)
cursor.execute("UPDATE media SET file_size = %s WHERE id = %s;", (file_size, itemID))
db.commit()
print(f"Processed ID {itemID}: updated with file size {file_size}.")
cursor.close()
db.close()

@ -1,112 +0,0 @@
from BunnyCDN.Storage import Storage
from PIL import Image
import os, uuid, cv2, config
def scan_dupes(folder_path):
for root, dirs, files in os.walk(folder_path):
for folder in dirs:
folder_path = os.path.join(root, folder)
for filename in os.listdir(folder_path):
media_id = filename.replace('.mp4', '').replace('.jpg', '')
filepath = os.path.join(folder_path, filename)
if media_id:
try:
if int(media_id) in existing_files:
print(f'Duplicate')
os.remove(filepath)
except:
pass
def clean_empty_folders(directory):
for foldername, subfolders, filenames in os.walk(directory, topdown=False):
for subfolder in subfolders:
folder_path = os.path.join(foldername, subfolder)
if not os.listdir(folder_path):
os.rmdir(folder_path)
print(f"Removed empty folder: {folder_path}")
def upload_file(filepath, username, media_id = None, media_type='image', post_type = 'story'):
filename = os.path.basename(filepath)
file_extension = filename.split('.')[-1]
try:
if int(media_id) in existing_files:
print(f'Duplicate')
os.remove(filepath)
return True
except (TypeError, ValueError): media_id = uuid.uuid4().hex
dirtype = 'stories' if post_type == 'story' else 'posts'
server_path = f'users/{dirtype}/{username}/{media_id}.{file_extension}'
obj_storage.PutFile(filepath, server_path)
file_url = f"https://storysave.b-cdn.net/{server_path}"
if media_type == 'image':
with Image.open(filepath) as img:
width, height = img.size
else:
width, height = get_video_dimensions(filepath)
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type) VALUES (%s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, media_id, post_type)
newCursor.execute(query, values)
newDB.commit()
os.remove(filepath)
print(f'[{newCursor.rowcount}]{os.path.basename(filepath)} {file_url}')
def get_video_dimensions(video_path):
cap = cv2.VideoCapture(video_path)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
cap.release()
return width, height
def get_media_type(filename):
if filename.lower().endswith(".jpg") or filename.lower().endswith(".webp") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png") or filename.lower().endswith(".gif"):
return 'image'
if filename.lower().endswith(".mp4") or filename.lower().endswith(".mov"):
return 'video'
def dump_instagram(folder_path):
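# Each subfolder is treated as a username; the top-level folder name decides whether files are stored as stories or posts.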
for root, dirs, files in os.walk(folder_path):
for folder in dirs:
username = folder
folder_path = os.path.join(root, folder)
post_type = 'story' if folder_path.split('\\')[0] == 'stories' else 'post'
for filename in os.listdir(folder_path):
media_id = filename.replace('.mp4', '').replace('.jpg', '')
filepath = os.path.join(folder_path, filename)
mediatype = get_media_type(filename)
upload_file(username=username, media_type=mediatype, filepath=filepath, media_id=media_id, post_type=post_type)
if __name__ == '__main__':
print('Starting processing...')
newDB, newCursor = config.gen_connection()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
newCursor.execute("SELECT media_id FROM media")
existing_files = [image[0] for image in newCursor.fetchall()]
scan_dupes('media/posts')
scan_dupes('media/stories')
scan_dupes('StorySave/')
dump_instagram('media/posts')
dump_instagram('media/stories')
dump_instagram('StorySave/')
clean_empty_folders('media/posts')
clean_empty_folders('media/stories')
clean_empty_folders('StorySave/')
print("Processing completed.")

@ -1,33 +0,0 @@
import bunny, json
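# Compare the Bunny Stream library against videos.json and write any library videos not referenced there to missing_videos.json.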
medias = json.load(open('videos.json', 'r'))
videoIDS = [media['url'].split('/')[-1] for media in medias]
videos = bunny.list_videos()
with open('allVideos.json', 'w') as f:
json.dump(videos, f, indent=4)
missingVideos = []
for video in videos:
if video['guid'] in videoIDS:
continue
missingVideos.append(video)
datas = []
for video in missingVideos:
data = {
'guid': video['guid'],
'title': video['title'],
'length': video['length'],
'width': video['width'],
'height': video['height'],
'availableResolutions': video['availableResolutions'],
'storageSize': video['storageSize'],
'hasMP4Fallback': video['hasMP4Fallback'],
'category': video['category'],
}
datas.append(data)
with open('missing_videos.json', 'w') as f:
json.dump(datas, f, indent=4)

@ -1,27 +0,0 @@
from BunnyCDN.Storage import Storage
import os, json
altpins_obj_storage = Storage('577cb82d-8176-4ccf-935ce0a574bf-fe4c-4012', 'altpins')
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
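# Download every pin listed in db_pins.json from the altpins storage zone into a local cache, skipping files already downloaded.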
medias = json.load(open('db_pins.json', 'r'))
count = 0
print(f"Found {len(medias)} files to process.")
cacheDir = 'old_altpins_cache'
for media in medias:
count += 1
username = media['title']
mediaID = media['photo_id']
mediaURL = media['url']
extension = mediaURL.split('.')[-1]
serverPath = mediaURL.replace("https://altpins.b-cdn.net/", '').replace('//', '/').replace('\\', '/').replace('https://altpins.b-cdn.net/', '')
localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))
if os.path.exists(localFilePath):
continue
altpins_obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)
print(f"Downloaded {count}/{len(medias)}: {localFilePath}")

@ -1,16 +0,0 @@
import json, bunny, os
from concurrent.futures import ThreadPoolExecutor
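# Download the missing Stream videos in parallel, skipping any already saved locally as zip archives.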
medias = json.load(open('missing_videos.json', 'r'))
#videoIDS = [media['url'].split('/')[-1] for media in medias]
videoIDS = [media['guid'] for media in medias]
with ThreadPoolExecutor(max_workers=10) as executor:
for id in videoIDS:
filePath = f"MISSING_STREAM_VIDEOS/{id}.zip"
if os.path.exists(filePath):
print(f'Video already exists as {filePath}. Skipping...')
continue
executor.submit(bunny.download_video, id)

@ -1,29 +0,0 @@
import os, json, config
# Load the data
pins = json.load(open('db_pins.json', 'r'))
files = os.listdir('STORAGE_IMPORTED/')
db, cursor = config.gen_connection()
cursor.execute('SELECT hash FROM media WHERE hash IS NOT NULL;')
existing_hashes = [row[0] for row in cursor.fetchall()]
# Filtering into a new list avoids skipping items, which happens when removing from a list while iterating over it.
remaining = []
for pin in pins:
if pin['hash'] in existing_hashes:
print(f"Found {pin['hash']} already in the database.")
else:
remaining.append(pin)
pins = remaining
alreadyImported = []
for pin in list(pins):
filename = os.path.basename(pin['filepath'])
if filename in files:
print(f"Found {filename} in the imported folder.")
alreadyImported.append(pin)
pins.remove(pin)
# Save to the file
json.dump(pins, open('db_pins.json', 'w'))
json.dump(alreadyImported, open('db_pins_imported.json', 'w'))

@ -1,14 +0,0 @@
import os, json, bunny
medias = json.load(open('allVideos.json', 'r'))
mp4Medias = [media for media in medias if media['hasMP4Fallback'] == True]
missing = json.load(open('missing_videos.json', 'r'))
count = 0
cacheDir = 'old_mp4fallback_cache'
print(f"Found {len(medias)} files to process.")
for media in mp4Medias:
count += 1
filePath = os.path.join(cacheDir, media['guid'] + '.mp4')

@ -1,36 +0,0 @@
import os, json, bunny, config
db, cursor = config.gen_connection()
cursor.execute('SELECT media_id FROM media WHERE media_id IS NOT NULL;')
mediaIDS = cursor.fetchall()
pins = json.load(open('pins.json', 'r'))
videos = json.load(open('db_videos.json', 'r'))
pins = json.load(open('db_pins.json', 'r'))
ids = [video['id'] for video in videos]
# Filter instead of removing items while iterating, which would skip entries.
pins = [pin for pin in pins if pin['id'] not in ids]
# save to the file
json.dump(pins, open('db_pins.json', 'w'))
medias = json.load(open('allVideos.json', 'r'))
mp4Medias = [media for media in medias if media['hasMP4Fallback'] == True]
missing = json.load(open('missing_videos.json', 'r'))
count = 0
cacheDir = 'old_mp4fallback_cache'
print(f"Found {len(medias)} files to process.")
for media in mp4Medias:
count += 1
filePath = os.path.join(cacheDir, media['guid'] + '.mp4')

@ -1,53 +0,0 @@
import os, json, funcs
STORAGE_IMPORTED = 'STORAGE_IMPORTED'
pins = json.load(open('db_pins.json', 'r'))
for pin in pins:
filename = pin['url'].split('/')[-1]
filepath = os.path.join(STORAGE_IMPORTED, filename)
pin['filename'] = filename
if not pin['hash']:
pin['hash'] = funcs.calculate_file_hash(filepath)
json.dump(pins, open('db_pins.json', 'w'), indent=4)
files = os.listdir(STORAGE_IMPORTED)
for file in files:
filepath = os.path.join(STORAGE_IMPORTED, file)
fileHash = funcs.calculate_file_hash(filepath)
if fileHash not in file:
# Keep the extension so the move step below can locate '{hash}.{extension}'.
newName = fileHash + os.path.splitext(file)[1]
print(f'Renaming {file} to {newName}')
os.rename(filepath, os.path.join(STORAGE_IMPORTED, newName))
pins_by_username = {}
for pin in pins:
username = pin['title']
if username not in pins_by_username:
pins_by_username[username] = []
pins_by_username[username].append(pin)
for username, username_pins in pins_by_username.items():
username_folder = os.path.join(STORAGE_IMPORTED, username)
os.makedirs(username_folder, exist_ok=True)
for pin in username_pins:
photo_id = pin['photo_id']
photo_url = pin['url']
fileHash = pin['hash']
if not fileHash:
continue
extension = photo_url.split('.')[-1]
filename = f'{fileHash}.{extension}'
filePath = os.path.join(STORAGE_IMPORTED, filename)
outputPath = os.path.join(STORAGE_IMPORTED, username, filename)
if os.path.exists(outputPath):
print(f'File {outputPath} already exists. Skipping...')
continue
print(f'Moving {photo_url} to {outputPath}')
os.rename(filePath, outputPath)

@ -1,57 +0,0 @@
import os
import hashlib
# Directories
fucked_dir = 'tiktoks/fucked/aleksandra'
source_dir = 'tiktoks/waiting_for_process/aleksandraverse'
def hash_file(filepath):
"""Generate MD5 hash of a file."""
hash_md5 = hashlib.md5()
with open(filepath, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
def get_file_hashes(directory):
"""Generate a dictionary of file hashes for all files in a directory."""
file_hashes = {}
for root, _, files in os.walk(directory):
for file in files:
file_path = os.path.join(root, file)
file_hashes[file_path] = hash_file(file_path)
return file_hashes
def files_are_identical(file1, file2):
"""Compare two files byte-by-byte."""
with open(file1, "rb") as f1, open(file2, "rb") as f2:
while True:
chunk1 = f1.read(4096)
chunk2 = f2.read(4096)
if chunk1 != chunk2:
return False
if not chunk1: # End of file
return True
def remove_duplicates(fucked_dir, source_files):
"""Remove files in 'fucked' that are identical to those in 'source_files'."""
for root, _, files in os.walk(fucked_dir):
for file in files:
file_path = os.path.join(root, file)
for source_file in source_files:
if files_are_identical(file_path, source_file):
print(f"Duplicate found. Removing: {file_path}")
os.remove(file_path)
break
def main():
print("Scanning source directory for hashes...")
source_hashes = get_file_hashes(source_dir)
print("Scanning 'fucked' directory for duplicates...")
remove_duplicates(fucked_dir, source_hashes)
print("Cleanup complete.")
if __name__ == "__main__":
main()

@ -1,49 +0,0 @@
import json, os
from videohash import VideoHash
from moviepy.editor import VideoFileClip
def is_valid_video(file_path):
try:
with VideoFileClip(file_path) as video:
return True
except Exception as e:
print(f"Invalid video {file_path}: {str(e)}")
return False
def load_hashes(file_path):
try:
with open(file_path, 'r') as file:
return json.load(file)
except FileNotFoundError:
return {}
def save_hashes(hashes, file_path):
with open(file_path, 'w') as file:
json.dump(hashes, file, indent=4)
hashes = load_hashes('video_hashes.json')
video_directory = 'STORAGE'
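# Walk STORAGE/<username>/<video>, compute a perceptual hash for each new valid video, and persist the results to video_hashes.json.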
for username in os.listdir(video_directory):
user_dir = os.path.join(video_directory, username)
if not os.path.isdir(user_dir):
continue
for video_file in os.listdir(user_dir):
video_path = os.path.join(user_dir, video_file)
if not video_file.endswith(('.mp4', '.mkv', '.avi')) or not is_valid_video(video_path):
continue
if username in hashes and any(v[0] == video_file for v in hashes[username]):
continue
try:
video_hash = VideoHash(path=video_path)
if username in hashes:
hashes[username].append((video_file, video_hash.hash))
else:
hashes[username] = [(video_file, video_hash.hash)]
except Exception as e:
print(f"Error processing {video_file}: {str(e)}")
save_hashes(hashes, 'video_hashes.json')

@ -1,17 +0,0 @@
import os, config, funcs
db, cursor = config.gen_connection()
cursor.execute("SELECT phash FROM media WHERE phash IS NOT NULL")
phashes = set([x[0] for x in cursor.fetchall()])
files = funcs.get_files("check_if_exists")
for file in files:
image_phash = funcs.generate_phash(file)
if image_phash in phashes:
print(f"File {file} exists in the database")
os.remove(file)
funcs.cleanEmptyFolders("check_if_exists")

@ -1,159 +0,0 @@
from snapchat import get_data, get_stories, get_highlight_stories
from datetime import datetime
import requests
import config
import json
import os
"""
media_url_filename = url.split('/')[-1].split('?')[0]
etag = response.headers.get('ETag', '').replace('"', '')
filename = f"{username}~{timestamp}-{media_url_filename}~{etag}{extension}"
filepath = os.path.join(directory, 'highlights', filename)
"""
directory = "snapchat"
data_directory = "data"
def get_existing_snap_ids(directory):
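# Stored filenames follow username~timestamp~snap_id.ext, so the snap id is the third '~'-separated field.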
existing_snap_ids = set()
for root, _, files in os.walk(directory):
for file in files:
if '~' not in file:
continue
filename, _ = os.path.splitext(file)
snap_id = filename.split('~')[2]
existing_snap_ids.add(snap_id)
return existing_snap_ids
def find_duplicate_snap(existing_snaps, snap_id, username):
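# existing_snaps rows are (id, filename, username); a snap is a duplicate if its id already appears in a stored filename for the same username.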
for snap in existing_snaps:
if username == snap[2]:
if snap_id in snap[1]:
return snap
return False
def archive_data(data, username):
data_filename = f"{username}~{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"
data_filepath = os.path.join(data_directory, data_filename)
with open(data_filepath, 'w') as f:
f.write(json.dumps(data))
print(f"Archived data for {username} at {data_filepath}")
def get_file_extension(url):
response = requests.head(url)
if response.status_code != 200:
print(f"Failed to access media {url}")
return None
content_type = response.headers.get('Content-Type', '')
if 'image' in content_type:
return '.jpg'
elif 'video' in content_type:
return '.mp4'
else:
print(f"Unknown content type for media {url}")
return None
def extract_file_type(url):
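# Snapchat media URLs encode the content type as a numeric code in the last path segment; map the known codes to extensions.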
file_types = {
'400': '.jpg',
'1322': '.mp4',
'1325': '.mp4',
'1034': '.mp4',
'1023': '.jpg'
}
base_url = url.split("?")[0] # Remove query string
snap_data = base_url.split('/')[-1]
# Extract the file type number
data_parts = snap_data.split('.')
if len(data_parts) > 1:
file_type_number = data_parts[1]
if file_type_number in file_types:
return file_types[file_type_number]
else:
print(f"Unexpected URL format: {base_url}")
return None
def download_media(url, filepath):
if os.path.exists(filepath):
print(f"File {filepath} already exists. Skipping download.")
return filepath
response = requests.get(url)
if response.status_code != 200:
print(f"Failed to download media {url}")
return None
with open(filepath, 'wb') as f:
f.write(response.content)
return filepath
def main():
if not os.path.exists(directory):
os.makedirs(directory)
db, cursor = config.gen_connection()
cursor.execute("SELECT username FROM following WHERE platform = 'snapchat'")
usernames = [row[0] for row in cursor.fetchall()]
cursor.execute("SELECT id, filename, username FROM media WHERE filename IS NOT NULL AND platform = 'snapchat'")
existing_medias = cursor.fetchall()
existing_snap_ids = get_existing_snap_ids(directory)
for username in usernames:
print(f"Getting stories for {username}...")
data = get_data(username)
if not data:
continue
archive_data(data, username)
print("Getting stories...")
stories = get_stories(data)
print("Getting highlights...")
stories.extend(get_highlight_stories(data))
for story in stories:
snap_id = story['snap_id']
url = story['url']
timestamp = story['timestamp']
duplicate_snap = find_duplicate_snap(existing_medias, snap_id, username)
if duplicate_snap:
print(f"Media {snap_id} already exists. Skipping download.")
continue
# Check if media already exists
if snap_id in existing_snap_ids:
print(f"Media {snap_id} already exists. Skipping download.")
continue
# Determine file extension using HEAD request.
# TODO: find a better way to determine file extension without downloading the file.
extension = extract_file_type(url)
if not extension:
continue
filename = f"{username}~{timestamp}~{snap_id}{extension}"
filepath = os.path.join(directory, filename)
# Check if file already exists
if os.path.exists(filepath):
print(f"File {filename} already exists. Skipping download.")
continue
# Download the media
filepath = download_media(url, filepath)
print(f"Downloaded {filename} at {timestamp}")
if __name__ == "__main__":
main()

@ -1,154 +0,0 @@
from datetime import datetime
import config
import funcs
import cv2
import os
directory = 'media/instagram/'
def UploadMedia(media):
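# Upload one local file to the CDN, generating a thumbnail and pHash for videos, then record it in the media table and delete the local copy.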
media_id = media['media_id']
username = media['username']
post_date = media['timestamp']
user_id = media['user_id']
filepath = media['filepath']
highlight_id = media['highlight_id']
post_type = media['post_type']
thumbnail_url = None
phash = None
if media_id and int(media_id) in existing_files:
print('Duplicate file detected. Removing...')
os.remove(filepath)
return True
filename = os.path.basename(filepath)
file_extension = os.path.splitext(filename)[1].lower()
media_type = funcs.get_media_type(filename)
file_hash = funcs.calculate_file_hash(filepath)
width, height = funcs.get_media_dimensions(filepath)
duration = funcs.get_video_duration(filepath)
if media_type == 'video':
try:
thumbPath = f'temp/{media_id}.jpg'
cap = cv2.VideoCapture(filepath)
ret, frame = cap.read()
cv2.imwrite(thumbPath, frame)
cap.release()
obj_storage.PutFile(thumbPath, f'thumbnails/{media_id}.jpg') # slower
thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{media_id}.jpg"
phash = funcs.generate_phash(thumbPath)
os.remove(thumbPath)
except Exception as e:
print(f'Error generating thumbnail for {filepath}: {e}. Skipping...')
return False
elif media_type == 'image':
phash = funcs.generate_phash(filepath)
if media_id:
newFilename = f'{media_id}{file_extension}'
else:
newFilename = f'{file_hash}{file_extension}'
server_path = f'media/{post_type}/{username}/{newFilename}'
file_url = f"https://storysave.b-cdn.net/{server_path}"
obj_storage.PutFile(filepath, server_path) # slow as fuck
if highlight_id:
newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)", (highlight_id, user_id, media_id))
newDB.commit()
print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, 'instagram')
newCursor.execute(query, values) # slower
newDB.commit()
print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
os.remove(filepath)
return True
def get_user_id(username):
username = username.lower()
if username in existing_users:
return existing_users[username]
return None
def get_media():
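# Scan media/instagram/<post_type>/<username>/<file>; timestamps are parsed from 'com.instagram.android__' filenames when present, otherwise the current time is used.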
medias = []
post_types = {
'posts': 'post',
'stories': 'story',
'profile': 'profile',
}
for post_type in os.listdir(directory):
users_dir = os.path.join(directory, post_type)
if not os.path.isdir(users_dir):
continue
users = os.listdir(users_dir)
for username in users:
user_path = os.path.join(directory, post_type, username)
if not os.path.isdir(user_path):
continue
for filename in os.listdir(user_path):
if filename.startswith('.'):
continue
data = {}
filepath = os.path.join(user_path, filename)
if 'com.instagram.android__' in filename:
timestamp_str = filename.split('__')[-1].split('.')[0]
data['timestamp'] = datetime.strptime(timestamp_str, '%Y%m%d%H%M%S%f')
else:
data['timestamp'] = datetime.now()
data['post_type'] = post_types[post_type]
data['username'] = username
data['filepath'] = filepath
data['media_id'] = None
data['user_id'] = get_user_id(data['username'])
data['highlight_id'] = None
medias.append(data)
return medias
def dump_instagram():
medias = get_media()
for media in medias:
UploadMedia(media)
existing_files.append(media['media_id'])
if __name__ == '__main__':
print('Starting processing...')
if not os.listdir(directory):
print('No files to process. Exiting...')
exit()
newDB, newCursor = config.gen_connection()
obj_storage = config.get_storage()
newCursor.execute("SELECT media_id FROM media WHERE media_id IS NOT NULL")
existing_files = [image[0] for image in newCursor.fetchall()]
newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
existing_users = {user[0].lower(): user[1] for user in newCursor.fetchall()}
dump_instagram()
print("Processing completed.")

@ -1,34 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Image Gallery</title>
<style>
.gallery {
display: flex;
flex-wrap: wrap;
}
.gallery img {
margin: 10px;
max-width: 200px;
height: auto;
}
.gallery div {
text-align: center;
margin: 10px;
}
</style>
</head>
<body>
<h1>Image Gallery</h1>
<div class="gallery">
{% for image in images %}
<div>
<h3>{{ image['username'] }}</h3>
<img src="{{ image['media_url'] }}" alt="Image for {{ image['username'] }}">
</div>
{% endfor %}
</div>
</body>
</html>

@ -1,84 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Media Gallery</title>
<style>
body {
display: flex;
justify-content: center;
}
.container {
max-width: 1600px;
width: 100%;
padding: 20px;
}
.media-container {
column-count: 4;
column-gap: 10px;
}
.media-item {
break-inside: avoid;
margin-bottom: 10px;
}
img, video {
width: 100%;
height: auto;
display: block;
}
</style>
</head>
<body>
<div class="container">
<h1>Media Gallery</h1>
<div class="media-container" id="media-container"></div>
</div>
<script>
let page = 0;
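// Fetch the next page of file paths from /load-more and append an <img> or <video> element for each one.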
async function loadMore() {
const response = await fetch(`/load-more?page=${page}`);
const mediaFiles = await response.json();
const container = document.getElementById('media-container');
mediaFiles.forEach(file => {
const mediaItem = document.createElement('div');
mediaItem.className = 'media-item';
if (file.endsWith('.png') || file.endsWith('.jpg') || file.endsWith('.jpeg') || file.endsWith('.gif')) {
const img = document.createElement('img');
img.src = `/media/${file}`;
img.alt = file;
mediaItem.appendChild(img);
} else if (file.endsWith('.mp4') || file.endsWith('.mkv') || file.endsWith('.mov')) {
const video = document.createElement('video');
video.controls = false;
video.autoplay = true;
video.muted = true;
video.loop = true;
const source = document.createElement('source');
source.src = `/media/${file}`;
source.type = 'video/mp4';
video.appendChild(source);
mediaItem.appendChild(video);
}
container.appendChild(mediaItem);
});
page += 1;
}
window.addEventListener('scroll', () => {
if (window.innerHeight + window.scrollY >= document.body.offsetHeight) {
loadMore();
}
});
// Initial load
loadMore();
</script>
</body>
</html>

@ -1,32 +0,0 @@
from flask import Flask, render_template, send_from_directory, jsonify, request
import os
app = Flask(__name__)
media_dir = 'storysaver'
MEDIA_PER_PAGE = 20
def get_media_files(start, count):
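# Walk media_dir for image/video files and return one page of relative paths; the full walk runs on every request.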
media_files = []
for root, dirs, files in os.walk(media_dir):
for filename in files:
if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.mp4', '.mkv', '.mov')):
file_path = os.path.relpath(os.path.join(root, filename), media_dir)
media_files.append(file_path)
return media_files[start:start + count]
@app.route('/')
def index():
return render_template('index.html')
@app.route('/media/<path:filename>')
def media(filename):
return send_from_directory(media_dir, filename)
@app.route('/load-more')
def load_more():
page = int(request.args.get('page', 0))
media_files = get_media_files(page * MEDIA_PER_PAGE, MEDIA_PER_PAGE)
return jsonify(media_files)
if __name__ == '__main__':
app.run(host='0.0.0.0', port=5000, debug=True)

@ -1,133 +0,0 @@
from BunnyCDN.Storage import Storage
from PIL import Image
import os, uuid, cv2, config
import hashlib
from moviepy.editor import VideoFileClip
def scan_dupes(folder_path):
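# Remove local files whose SHA-256 hash already exists in the media table.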
newCursor.execute("SELECT hash FROM media")
existing_files = [image[0] for image in newCursor.fetchall()]
for root, dirs, files in os.walk(folder_path):
for folder in dirs:
folder_path = os.path.join(root, folder)
for filename in os.listdir(folder_path):
media_id = filename.replace('.mp4', '').replace('.jpg', '')
filepath = os.path.join(folder_path, filename)
if media_id:
fileHash = calculate_file_hash(filepath)
if fileHash in existing_files:
print(f'Duplicate: {filepath}')
os.remove(filepath)
def clean_empty_folders(directory):
for foldername, subfolders, filenames in os.walk(directory, topdown=False):
for subfolder in subfolders:
folder_path = os.path.join(foldername, subfolder)
if not os.listdir(folder_path):
os.rmdir(folder_path)
print(f"Removed empty folder: {folder_path}")
def upload_file(filepath, username, media_type='image', post_type = 'story'):
filename = os.path.basename(filepath)
file_extension = filename.split('.')[-1]
dirtype = 'stories' if post_type == 'story' else 'posts'
#dirtype = 'profile'
fileHash = calculate_file_hash(filepath)
# media_id is not defined in this function (and existing_files is not defined in this script),
# so the original duplicate check could never run; just generate a fresh id.
media_id = uuid.uuid4().hex
server_path = f'users/{dirtype}/{username}/{media_id}.{file_extension}'
obj_storage.PutFile(filepath, server_path)
file_url = f"https://storysave.b-cdn.net/{server_path}"
duration = 0
if media_type == 'image':
try:
with Image.open(filepath) as img:
width, height = img.size
except:
os.remove(filepath)
return
else:
width, height = get_video_dimensions(filepath)
duration = get_video_duration(filepath)
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, hash, filename, media_id, duration) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, post_type, fileHash, filename, media_id, duration)
newCursor.execute(query, values)
newDB.commit()
os.remove(filepath)
print(f'[{newCursor.rowcount}]{os.path.basename(filepath)} {file_url}')
def get_video_dimensions(video_path):
cap = cv2.VideoCapture(video_path)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
cap.release()
return width, height
def get_video_duration(file_path):
"""
Returns the duration of the video file in seconds.
:param file_path: Path to the video file
:return: Duration in seconds
"""
with VideoFileClip(file_path) as video:
return video.duration
def get_media_type(filename):
if filename.lower().endswith(".jpg") or filename.lower().endswith(".webp") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png") or filename.lower().endswith(".gif"):
return 'image'
if filename.lower().endswith(".mp4") or filename.lower().endswith(".mov"):
return 'video'
def dump_instagram(folder_path):
for root, dirs, files in os.walk(folder_path):
for folder in dirs:
username = folder
folder_path = os.path.join(root, folder)
post_type = 'post' if 'post' in folder_path.lower() else 'story'
for filename in os.listdir(folder_path):
filepath = os.path.join(folder_path, filename)
mediatype = get_media_type(filename)
upload_file(username=username, media_type=mediatype, filepath=filepath, post_type=post_type)
def calculate_file_hash(file_path, hash_func='sha256'):
h = hashlib.new(hash_func)
with open(file_path, 'rb') as file:
for chunk in iter(lambda: file.read(8192), b''):
h.update(chunk)
return h.hexdigest()
if __name__ == '__main__':
print('Starting processing...')
newDB, newCursor = config.gen_connection()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
storiesPath = 'StorySave/'
dump_instagram(storiesPath)
print("Processing completed.")

@ -0,0 +1,116 @@
import os
from funcs import calculate_file_hash, get_media_dimensions, get_media_type, generate_phash
import config
# --- Configuration & Constants ---
BASE_URL = "https://cdn.altpins.com/"
TEMP_DIR = os.path.join(os.getcwd(), 'temp')
CACHE_DIR = os.path.join(os.getcwd(), 'cache')
os.makedirs(TEMP_DIR, exist_ok=True)
os.makedirs(CACHE_DIR, exist_ok=True)
def normalize_server_path(media_url, replace_all=True):
"""
Remove the BASE_URL from media_url and normalize slashes.
If replace_all is True, replace double slashes and backslashes.
"""
path = media_url.replace(BASE_URL, '')
if replace_all:
path = path.replace('//', '/').replace('\\', '/')
else:
path = path.replace('\\', '/')
return path
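# Illustrative example: normalize_server_path('https://cdn.altpins.com/media\\2021//a.jpg') -> 'media/2021/a.jpg'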
def update_hashes(cursor, db, obj_storage):
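# Backfill the hash column: download each file missing a hash into TEMP_DIR (unless cached) and store its checksum.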
cursor.execute("SELECT id, media_id, media_url FROM media WHERE hash IS NULL;")
results = cursor.fetchall()
total = len(results)
print(f"Found {total} files to process for hash updating.")
for idx, (record_id, media_id, media_url) in enumerate(results, start=1):
server_path = normalize_server_path(media_url)
local_file = os.path.join(TEMP_DIR, os.path.basename(server_path))
if not os.path.exists(local_file):
obj_storage.DownloadFile(storage_path=server_path, download_path=TEMP_DIR)
filehash = calculate_file_hash(local_file)
cursor.execute("UPDATE media SET hash = %s WHERE id = %s;", (filehash, record_id))
db.commit()
print(f"[{idx}/{total}] {media_id}: {filehash}, Rows affected: {cursor.rowcount}")
def update_dimensions(cursor, db, obj_storage):
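# Backfill width/height for rows recorded with zero dimensions.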
cursor.execute("SELECT id, media_id, media_url FROM media WHERE width = 0 OR height = 0;")
results = cursor.fetchall()
total = len(results)
print(f"Found {total} files to process for dimensions updating.")
for idx, (record_id, media_id, media_url) in enumerate(results, start=1):
server_path = normalize_server_path(media_url)
local_file = os.path.join(CACHE_DIR, os.path.basename(server_path))
if not os.path.exists(local_file):
obj_storage.DownloadFile(storage_path=server_path, download_path=CACHE_DIR)
# Optionally, you could get the media type if needed:
media_type = get_media_type(local_file)
width, height = get_media_dimensions(local_file)
cursor.execute("UPDATE media SET width = %s, height = %s WHERE id = %s;", (width, height, record_id))
db.commit()
print(f"[{idx}/{total}] {media_id}: width: {width}, height: {height}, Rows affected: {cursor.rowcount}")
def update_file_size(cursor, db, obj_storage):
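# Backfill file_size for non-deleted rows recorded with a zero size.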
cursor.execute("SELECT id, media_url FROM media WHERE file_size = 0 AND status != 'deleted';")
results = cursor.fetchall()
total = len(results)
print(f"Found {total} files to process for file size updating.")
for idx, (record_id, media_url) in enumerate(results, start=1):
server_path = normalize_server_path(media_url)
local_file = os.path.join(TEMP_DIR, os.path.basename(server_path))
if not os.path.exists(local_file):
obj_storage.DownloadFile(storage_path=server_path, download_path=TEMP_DIR)
file_size = os.path.getsize(local_file)
cursor.execute("UPDATE media SET file_size = %s WHERE id = %s;", (file_size, record_id))
db.commit()
print(f"[{idx}/{total}] {media_url}: {file_size} bytes, Rows affected: {cursor.rowcount}")
def update_phash(cursor, db, obj_storage):
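# Backfill perceptual hashes for image rows that do not have one yet.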
generate_for = 'media_url'
media_type = 'image'
cursor.execute(f"SELECT id, {generate_for} FROM media WHERE media_type = %s AND phash IS NULL AND status != 'deleted';", [media_type])
medias = cursor.fetchall()
total = len(medias)
print(f"Found {total} files to process for pHash updating.")
for idx, (record_id, media_url) in enumerate(medias, start=1):
server_path = normalize_server_path(media_url, replace_all=False)
local_file = os.path.join(TEMP_DIR, os.path.basename(server_path))
if not os.path.exists(local_file):
obj_storage.DownloadFile(storage_path=server_path, download_path=TEMP_DIR)
phash = generate_phash(local_file)
if not phash:
print(f"Error generating pHash for {local_file}")
continue
cursor.execute("UPDATE media SET phash = %s WHERE id = %s", [phash, record_id])
db.commit()
print(f"[{idx}/{total}] Processed record {record_id} with pHash: {phash}")
def main():
obj_storage = config.get_storage()
db, cursor = config.gen_connection()
update_hashes(cursor, db, obj_storage)
update_dimensions(cursor, db, obj_storage)
update_file_size(cursor, db, obj_storage)
update_phash(cursor, db, obj_storage)
if __name__ == '__main__':
main()