Compare commits

No commits in common. '42afcdc5390d98d4894b3b902307421b672b7b77' and '48d23301933f324e365002a783f5ea62c8f67094' have entirely different histories.

.DS_Store (binary file, not shown)

.gitignore (4 lines changed)
@@ -31,7 +31,3 @@ uploadlater
/clips
snapchat.json
/add_to_liked
/.profiles
/.vscode
/highlight_data
.DS_Store

@@ -0,0 +1,35 @@
import os
import json
import gzip

data_dir = 'data'
data_compressed_dir = 'data_compressed'

os.makedirs(data_compressed_dir, exist_ok=True)

def compress_file(filepath, output_file):
    with open(filepath, 'r') as f:
        data = json.load(f)
    compress_data(data, output_file)
    return output_file

def compress_data(data, output_file):
    with gzip.open(output_file, 'wb') as f:
        f.write(json.dumps(data).encode('utf-8'))
    return output_file

data_files = os.listdir(data_dir)

for file in data_files:
    if not file.endswith('.json'):
        continue

    filepath = f'{data_dir}/{file}'
    output_file = f'{data_compressed_dir}/{file}.gz'

    output_file = compress_file(filepath, output_file)
    if output_file:
        print(f'Compressed {file} to {output_file}')
        os.remove(filepath)
    else:
        print(f'Failed to compress {file}')

print('Data compression completed')

@@ -0,0 +1,137 @@
from BunnyCDN.Storage import Storage
from PIL import Image
import os, uuid, cv2, config
import hashlib

def clean_empty_folders(directory):
    for foldername, subfolders, filenames in os.walk(directory, topdown=False):
        for subfolder in subfolders:
            folder_path = os.path.join(foldername, subfolder)
            if not os.listdir(folder_path):
                os.rmdir(folder_path)
                print(f"Removed empty folder: {folder_path}")

def calculate_file_hash(file_path, hash_func='sha256'):
    h = hashlib.new(hash_func)
    with open(file_path, 'rb') as file:
        chunk = 0
        while chunk != b'':
            chunk = file.read(8192)
            h.update(chunk)
    return h.hexdigest()

def extract_file_info(filename):
    try:
        username = filename.split("~")[0]
        timestamp = filename.split("~")[1]
        user_id = filename.split("~")[2]
        media_id, some2 = user_id.split("_")
        user_id = some2.split(".")[0]
        return username, media_id, user_id, timestamp
    except:
        return None, None, None, None

def extract_file_info2(filename):
    try:
        username = filename.split("~")[0]
        elements = filename.split("~")[1].split("_")
        media_id, user_id = elements[0], elements[1].split(".")[0]
        return username, media_id, user_id
    except:
        return None, None, None

def upload_file(filepath, username, media_id=None, media_type='image', post_type='story', user_id=None, date=None):
    filename = os.path.basename(filepath)
    file_extension = filename.split('.')[-1]

    dirtype = 'stories' if post_type == 'story' else 'posts'
    server_path = f'users/{dirtype}/{username}/{media_id if media_id else uuid.uuid4().hex}.{file_extension}'
    file_url = f"https://storysave.b-cdn.net/{server_path}"

    fileHash = calculate_file_hash(filepath)

    if media_type == 'image':
        with Image.open(filepath) as img:
            width, height = img.size
    else:
        width, height = get_video_dimensions(filepath)

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, user_id, hash, date) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, media_id, post_type, user_id, fileHash, date)

    newCursor.execute(query, values)
    newDB.commit()

    existing_files.append(media_id)

    if newCursor.rowcount == 0:
        print('What the fuck just happend?')

    obj_storage.PutFile(filepath, server_path)
    os.remove(filepath)

    print(f'[{newCursor.rowcount}]{os.path.basename(filepath)} {file_url}')

def get_video_dimensions(video_path):
    cap = cv2.VideoCapture(video_path)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap.release()
    return width, height

def get_media_type(filename):
    if filename.lower().endswith(".jpg") or filename.lower().endswith(".webp") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png") or filename.lower().endswith(".gif"):
        return 'image'
    if filename.lower().endswith(".mp4") or filename.lower().endswith(".mov"):
        return 'video'

def dump_instagram(folder_path):
    for root, dirs, files in os.walk(folder_path):
        for folder in dirs:
            username = folder
            folder_path = os.path.join(root, folder)
            for filename in os.listdir(folder_path):
                if "~" not in filename:
                    continue

                username, media_id, user_id, timestamp = extract_file_info(filename)
                if None in [username, media_id, user_id, timestamp]:
                    username, media_id, user_id = extract_file_info2(filename)
                    if None in [username, media_id, user_id]:
                        print(f"Failed to extract info from {filename}")
                        continue

                media_id = int(media_id) if media_id else None

                if media_id in existing_files:
                    print(f'Duplicate, {filename}')
                    os.remove(os.path.join(folder_path, filename))
                    continue

                filepath = os.path.join(folder_path, filename)
                mediatype = get_media_type(filename)

                upload_file(username=username, media_type=mediatype, filepath=filepath, media_id=media_id, user_id=user_id)

if __name__ == '__main__':
    print('Starting processing...')

    newDB, newCursor = config.gen_connection()
    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    newCursor.execute("SELECT media_id FROM media")
    existing_files = [image[0] for image in newCursor.fetchall()]

    dump_instagram('StorySave/')

    print("Processing completed.")

@@ -0,0 +1,96 @@
from funcs import get_files
from PIL import Image
import imagehash
import config
import os

def generate_image_phash(filepath, hash_size=8):
    try:
        # Open the image using PIL
        pil_image = Image.open(filepath)

        # Compute pHash using the imagehash library
        phash = imagehash.phash(pil_image, hash_size=hash_size)
        return phash
    except Exception as e:
        print(f"Error processing image {filepath}: {e}")
        return None

def are_phashes_duplicates(phash1, phash2, threshold=5):
    try:
        # Compute the Hamming distance between the pHashes
        distance = phash1 - phash2
        return distance <= threshold
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False

def get_media_by_phash(phash, username, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]
        existing_username = media[2]

        # if username != existing_username:
        #     continue

        # Convert stored pHash string to ImageHash object
        existing_phash = imagehash.hex_to_hash(existing_phash_str)

        # Check if the current pHash is a duplicate
        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

def get_media_by_hash(hash, existing_medias):
    for media in existing_medias:
        existing_hash = media[1]
        if hash == existing_hash:
            return media
    return None

def get_media_by_id(media_id, existing_medias):
    for media in existing_medias:
        existing_media_id = media[1]
        if media_id == existing_media_id:
            return media
    return None

def get_data_by_filename(filename, data):
    for item in data:
        if filename in item['filepath']:
            return item
    return None

directory = 'media/check_if_exists' # Directory containing user images

# Database connection
db, cursor = config.gen_connection()

# Fetch existing media with pHashes (assuming media are images, adjust media_type if needed)
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['image'])
existing_medias = cursor.fetchall()

usernames = os.listdir(directory)
for username in usernames:
    files = get_files(os.path.join(directory, username))
    for filepath in files:
        image_filename = os.path.basename(filepath)
        print(f'Processing {image_filename}...')

        # Generate pHash for the image
        phash = generate_image_phash(filepath, hash_size=8)
        if phash is None:
            continue # Skip this image if there's an issue

        phash_str = str(phash)

        # Check if the image is a duplicate of any in the database
        duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
        if duplicate_media:
            print(f'Duplicate found: https://altpins.com/pin/{duplicate_media[0]}')
            print(f'Duplicate image path: {filepath}')

            newpath = os.path.join('duplicates', duplicate_media[2], image_filename)
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            os.rename(filepath, newpath)
            print(f'Moved {image_filename} to duplicates/')

@@ -1,4 +1,4 @@
from funcs import get_files, get_media_type
from funcs import get_files
from PIL import Image
import imagehash
import config
@@ -25,14 +25,10 @@ def are_phashes_duplicates(phash1, phash2, threshold=5):
        print(f"Error comparing pHashes: {e}")
        return False

def get_media_by_phash(phash, username, existing_medias, threshold=5):
def find_duplicate_phash(phash, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]
        existing_username = media[2]

        if username:
            if username != existing_username:
                continue

        # Convert stored pHash string to ImageHash object
        existing_phash = imagehash.hex_to_hash(existing_phash_str)
@@ -42,39 +38,46 @@ def get_media_by_phash(phash, username, existing_medias, threshold=5):
            return media
    return None

def get_image_files(directory):
    return [file for file in get_files(directory) if get_media_type(file) == 'image']

def get_media_by_hash(hash, existing_medias):
    for media in existing_medias:
        existing_hash = media[1]
        if hash == existing_hash:
            return media
    return None

def get_media_by_id(media_id, existing_medias):
    for media in existing_medias:
        existing_media_id = media[1]
        if media_id == existing_media_id:
            return media
    return None

def get_data_by_filename(filename, data):
    for item in data:
        if filename in item['filepath']:
            return item
    return None

def get_images_with_username(directory):
    files = {}
    for username in os.listdir(directory):
        user_files = get_image_files(os.path.join(directory, username))
        files[username] = user_files

# Database connection
db, cursor = config.gen_connection()

# Fetch existing media with pHashes (assuming media are images, adjust media_type if needed)
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['image'])
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL AND media_id IS NULL;", ['image'])
existing_medias = cursor.fetchall()

# Directory containing user images
directory = 'media/check_if_exists'
usernames = os.listdir(directory)
files = get_image_files(directory)
username = None
directory = 'check_if_exists/' # Directory containing user images
files = [file for file in get_files(directory) if file.endswith(('.jpg', '.jpeg', '.png'))]

for filepath in files:
    image_filename = os.path.basename(filepath)
    print(f'Processing {image_filename}...')

    # Generate pHash for the image
    phash = generate_image_phash(filepath, hash_size=8)
    if phash is None:
        continue # Skip this image if there's an issue
        continue

    # Check if the image is a duplicate of any in the database
    duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
    duplicate_media = find_duplicate_phash(phash, existing_medias)
    if duplicate_media:
        print(f'Duplicate found: https://altpins.com/pin/{duplicate_media[0]}')
        print(f'Duplicate image path: {filepath}')

@@ -0,0 +1,68 @@
from funcs import generate_phash # Assuming this function computes the pHash and returns a string
import imagehash
import os

def get_files(directory):
    # Recursively get all files in the directory
    file_list = []
    for root, dirs, files in os.walk(directory):
        for filename in files:
            file_list.append(os.path.join(root, filename))
    return file_list

# Function to compute pHashes for all images in a directory
def compute_phashes(image_paths):
    phash_dict = {}
    for image_path in image_paths:
        try:
            # Compute pHash and get it as a string
            phash_str = generate_phash(image_path)
            # Convert the hash string to an ImageHash object
            phash = imagehash.hex_to_hash(phash_str)
            phash_dict[image_path] = phash
        except Exception as e:
            print(f"Error processing {image_path}: {e}")
    return phash_dict

# Get all image files from 'ready_to_upload' and 'sorted' directories
ready_images = get_files('ready_to_upload')
ready_images = [image for image in ready_images if not image.lower().endswith('.mp4')]

sorted_images = get_files('sorted')
sorted_images = [image for image in sorted_images if not image.lower().endswith('.mp4')]

# Compute pHashes for images in 'ready_to_upload'
print("Computing pHashes for 'ready_to_upload' images...")
ready_image_phashes = compute_phashes(ready_images)

# Compute pHashes for images in 'sorted'
print("Computing pHashes for 'sorted' images...")
sorted_image_phashes = compute_phashes(sorted_images)

# Prepare the 'already_processed' directory
os.makedirs('already_processed', exist_ok=True)

# Set a Hamming distance threshold for considering images as duplicates
threshold = 5 # Adjust this value as needed

# Find and move duplicates
for sorted_image, sorted_phash in sorted_image_phashes.items():
    duplicate_found = False
    for ready_image, ready_phash in ready_image_phashes.items():
        # Compute Hamming distance between the two pHashes
        try:
            distance = sorted_phash - ready_phash
        except TypeError as e:
            print(f"Error comparing hashes for {sorted_image} and {ready_image}: {e}")
            continue

        if distance <= threshold:
            # Duplicate found
            newpath = sorted_image.replace('sorted', 'already_processed')
            os.makedirs(os.path.dirname(newpath), exist_ok=True)

            print(f"Moving {sorted_image} (duplicate of {ready_image}) to 'already_processed'")
            os.rename(sorted_image, newpath)

            duplicate_found = True
            break # Exit the loop since a duplicate is found

    if not duplicate_found:
        print(f"No duplicate found for {sorted_image}")

@@ -0,0 +1,59 @@
import config

# Function to find the closest perceptual hash (phash) match
def find_almost_identical_phash(phash, usernames, max_distance=1):
    """
    Find a username whose phash is nearly identical to the given phash.

    :param phash: The phash to compare (e.g., from the 'unknown' image).
    :param usernames: List of tuples containing (username, phash).
    :param max_distance: Maximum Hamming distance to consider as "identical".
    :return: The matching username and phash, or None if no match is found.
    """
    for username in usernames:
        dist = hamming_distance(phash, username[1])
        if dist <= max_distance:
            return username
    return None

def hamming_distance(phash1, phash2):
    """
    Calculate the Hamming distance between two binary strings.
    """
    if len(phash1) != len(phash2):
        raise ValueError("Hashes must be of the same length")
    return sum(c1 != c2 for c1, c2 in zip(phash1, phash2))

# Establish database connection
db, cursor = config.gen_connection()

# Fetch all images with an 'unknown' username
cursor.execute("SELECT id, username, phash FROM media WHERE username = 'unknown'")
rows = cursor.fetchall()

# Fetch all non-unknown usernames and their associated phash
cursor.execute("SELECT username, phash FROM media WHERE username != 'unknown' AND phash IS NOT NULL AND status = 'public'")
usernames = cursor.fetchall()

# Ensure there are valid usernames to compare against
if not usernames:
    print("No known usernames found in the database.")
    exit()

# Adjusted section in your script
for row in rows:
    id = row[0]
    phash = row[2]

    # Find a nearly identical phash match
    closest = find_almost_identical_phash(phash, usernames, max_distance=2)
    if closest:
        print(f"Found match for image {id}: {closest[0]} with phash {closest[1]}")
        cursor.execute(
            "UPDATE media SET username = %s WHERE id = %s",
            (closest[0], id),
        )
        db.commit()
    else:
        print(f"No nearly identical match found for image {id}.")

@@ -0,0 +1,90 @@
from funcs import get_files # Assuming this is defined elsewhere
from PIL import Image
import imagehash
import config
import os

def generate_image_phash(filepath, hash_size=8):
    try:
        # Open the image using PIL
        pil_image = Image.open(filepath)

        # Compute pHash using the imagehash library
        phash = imagehash.phash(pil_image, hash_size=hash_size)
        return phash
    except Exception as e:
        print(f"Error processing image {filepath}: {e}")
        return None

def are_phashes_duplicates(phash1, phash2, threshold=5):
    try:
        # Compute the Hamming distance between the pHashes
        distance = phash1 - phash2
        return distance <= threshold
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False

def get_media_by_phash(phash, username, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]
        # existing_username = media[2]

        # if existing_username != username:
        #     continue # Only compare with the same user's media

        # Convert stored pHash string to ImageHash object
        existing_phash = imagehash.hex_to_hash(existing_phash_str)

        # Check if the current pHash is a duplicate
        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

# Database connection
db, cursor = config.gen_connection()

directory = 'check_if_exists' # Directory containing user images

# Fetch existing media with pHashes (assuming media are images, adjust media_type if needed)
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['image'])
existing_medias = cursor.fetchall()
existing_phashes = [media[1] for media in existing_medias]

# Go through the directory folder where each subfolder is a username
users = os.listdir(directory)
for username in users:
    user_images_path = os.path.join(directory, username)
    if not os.path.isdir(user_images_path):
        continue # Skip non-directory files

    # Get all images for the current user
    images = get_files(user_images_path) # Assuming this gets all image files
    for filepath in images:
        image_filename = os.path.basename(filepath)
        print(f'Processing {image_filename}...')

        # Generate pHash for the image
        phash = generate_image_phash(filepath, hash_size=8)
        if phash is None:
            continue # Skip this image if there's an issue

        phash_str = str(phash)
        if phash_str not in existing_phashes:
            print(f'No duplicate found for {image_filename}')
            continue

        # Check if the image is a duplicate of any in the database
        duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
        if duplicate_media:
            found_username = duplicate_media[2]
            print(f'Duplicate found: https://altpins.com/pin/{duplicate_media[0]}')
            print(f'Duplicate image path: {filepath}')

            newpath = os.path.join('duplicates', found_username, image_filename)
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            os.rename(filepath, newpath)
            print(f'Moved {image_filename} to duplicates/')

@@ -0,0 +1,87 @@
from PIL import Image
import imagehash
import config
import cv2
import os

def generate_thumbnail_phash(filepath, hash_size=8): # Set hash_size to 8
    cap = cv2.VideoCapture(filepath)
    ret, frame = cap.read()
    cap.release()
    if not ret:
        print(f"Error reading frame from {filepath}")
        return None

    # Resize frame to a standard size
    standard_size = (320, 240)
    resized_frame = cv2.resize(frame, standard_size, interpolation=cv2.INTER_AREA)

    # Convert OpenCV image (BGR) to PIL Image (RGB)
    image_rgb = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(image_rgb)

    # Compute pHash
    phash = imagehash.phash(pil_image, hash_size=hash_size)
    return phash

def are_phashes_duplicates(phash1, phash2, threshold=5):
    # Compute Hamming distance between the pHashes
    try:
        distance = phash1 - phash2
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False
    return distance <= threshold

def get_media_by_phash(phash, username, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]
        existing_username = media[2]

        if existing_username != username:
            continue

        # Convert stored phash string to ImageHash object
        existing_phash = imagehash.hex_to_hash(existing_phash_str)
        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

# Database connection
db, cursor = config.gen_connection()

# Directory containing user videos
directory = 'check_if_exists'

# Fetch existing videos with pHashes
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['video'])
existing_medias = cursor.fetchall()

users = os.listdir(directory) # Assuming 'check_if_exists' contains user videos
for username in users:
    user_videos_path = os.path.join(directory, username)
    if not os.path.isdir(user_videos_path):
        continue

    videos = [video for video in os.listdir(user_videos_path) if video.endswith(('.mp4', '.avi', '.mov'))]
    for video in videos:
        print(f'Processing {video}...')
        filepath = os.path.join(user_videos_path, video)

        phash = generate_thumbnail_phash(filepath, hash_size=8) # Use hash_size=8
        if phash is None:
            continue

        phash_str = str(phash)

        duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
        if duplicate_media:
            print(f'Duplicate url found: https://altpins.com/pin/{duplicate_media[0]}')
            print(f'Duplicate video path: {filepath}')

            newpath = filepath.replace(directory, 'duplicates')
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            os.rename(filepath, newpath)
            print(f'Moved {video} to duplicates/')

@@ -1,11 +1,11 @@
from funcs import get_files, get_media_type
from funcs import get_files
from PIL import Image
import imagehash
import config
import cv2
import os

def get_video_phash(filepath, hash_size=8): # Set hash_size to 8
def get_video_phash(filepath, hash_size=8):
    cap = cv2.VideoCapture(filepath)
    ret, frame = cap.read()
    cap.release()
@@ -37,54 +37,43 @@ def are_phashes_duplicates(phash1, phash2, threshold=5):
    return distance <= threshold

def get_media_by_phash(phash, username, existing_medias, threshold=5):
def get_media_by_phash(phash, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]
        existing_username = media[2]

        if username:
            if existing_username != username:
                continue

        # Convert stored phash string to ImageHash object
        existing_phash = imagehash.hex_to_hash(existing_phash_str)
        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

def get_video_files(directory):
    return [file for file in get_files(directory) if get_media_type(file) == 'video']

def get_videos_with_username(directory):
    videos = {}
    for username in os.listdir(directory):
        user_videos = get_video_files(os.path.join(directory, username))
        videos[username] = user_videos
    return videos

# Database connection
db, cursor = config.gen_connection()

# Directory containing user videos
directory = 'check_if_exists'
directory = 'check_if_exists/' # Directory containing user images

# Fetch existing videos with pHashes
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['video'])
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL;", ['video'])
existing_medias = cursor.fetchall()

videos = get_video_files(directory)
username = None

for filepath in videos:
# make a list of all video files
files = [file for file in get_files(directory) if file.endswith(('.mp4', '.avi', '.mov'))]

for filepath in files:
    video_filename = os.path.basename(filepath)

    phash = get_video_phash(filepath, hash_size=8) # Use hash_size=8
    if phash is None:
        continue

    duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
    duplicate_media = get_media_by_phash(phash, existing_medias, threshold=5)
    if duplicate_media:
        print(f'Duplicate url found: https://altpins.com/pin/{duplicate_media[0]}')
        print(f'Duplicate video path: {filepath}')

        newpath = filepath.replace(directory, 'duplicates')
        newpath = os.path.join('duplicates', duplicate_media[2], video_filename)
        os.makedirs(os.path.dirname(newpath), exist_ok=True)
        os.rename(filepath, newpath)
        print(f'Moved {filepath} to duplicates/')

@@ -0,0 +1,58 @@
from funcs import generate_phash
import os

def find_duplicates(source_dir, target_dir, extensions, max_distance):
    """Remove duplicates in target_dir that are found in source_dir based on Hamming distance."""
    source_files = {}
    target_files = {}

    # Helper function to filter files by extension
    def filter_files(files):
        return [f for f in files if os.path.splitext(f)[1].lower() in extensions]

    # Build hash map of source directory
    for dirpath, _, filenames in os.walk(source_dir):
        for filename in filter_files(filenames):
            filepath = os.path.join(dirpath, filename)
            filehash = generate_phash(filepath, str=False)
            if filehash:
                source_files[filehash] = filepath

    # Build hash map of target directory and compare
    for dirpath, _, filenames in os.walk(target_dir):
        for filename in filter_files(filenames):
            filepath = os.path.join(dirpath, filename)
            filehash = generate_phash(filepath, str=False)
            if not filehash:
                continue

            # Check if this file is similar to any of the source files
            is_duplicate = False
            for source_hash in source_files.keys():
                distance = filehash - source_hash # Hamming distance
                if distance <= max_distance:
                    is_duplicate = True
                    break # Found a duplicate

            if is_duplicate:
                newpath = os.path.join('duplicates', filename)
                os.makedirs(os.path.dirname(newpath), exist_ok=True)
                os.rename(filepath, newpath)
                print(f"Moved duplicate: {filepath} to duplicates/ (distance: {distance})")
            else:
                target_files[filehash] = filepath

if __name__ == '__main__':
    # Paths to the directories
    source_dir = 'D:/Crawlers/media/Coomer/sadierayxo'
    target_dir = 'sorted/sadierayxo'

    # List of accepted extensions
    extensions = {'.png', '.jpg', '.jpeg', '.webp', '.gif'}

    # Maximum Hamming distance to consider as duplicates
    MAX_DISTANCE = 5 # Adjust this threshold as needed

    find_duplicates(source_dir, target_dir, extensions, MAX_DISTANCE)
    print("Duplicate removal process completed.")

@@ -0,0 +1,110 @@
from BunnyCDN.Storage import Storage
from PIL import Image
import os, uuid, cv2, config

def scan_dupes(folder_path):
    for root, dirs, files in os.walk(folder_path):
        for folder in dirs:
            folder_path = os.path.join(root, folder)
            for filename in os.listdir(folder_path):
                media_id = filename.replace('.mp4', '').replace('.jpg', '')
                filepath = os.path.join(folder_path, filename)
                if media_id:
                    try:
                        if int(media_id) in existing_files:
                            print(f'Duplicate')
                            os.remove(filepath)
                    except:
                        print(f'Error: {filepath}')

def clean_empty_folders(directory):
    for foldername, subfolders, filenames in os.walk(directory, topdown=False):
        for subfolder in subfolders:
            folder_path = os.path.join(foldername, subfolder)
            if not os.listdir(folder_path):
                os.rmdir(folder_path)
                print(f"Removed empty folder: {folder_path}")

def upload_file(filepath, username, media_id=None, media_type='image', post_type='story'):
    filename = os.path.basename(filepath)
    file_extension = filename.split('.')[-1]

    try:
        if int(media_id) in existing_files:
            print(f'Duplicate')
            os.remove(filepath)
            return True
    except: media_id = uuid.uuid4().hex

    dirtype = 'stories' if post_type == 'story' else 'posts'
    server_path = f'users/{dirtype}/{username}/{media_id}.{file_extension}'

    obj_storage.PutFile(filepath, server_path)
    file_url = f"https://storysave.b-cdn.net/{server_path}"

    if media_type == 'image':
        with Image.open(filepath) as img:
            width, height = img.size
    else:
        width, height = get_video_dimensions(filepath)

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type) VALUES (%s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, media_id, post_type)

    newCursor.execute(query, values)
    newDB.commit()

    os.remove(filepath)
    print(f'[{newCursor.rowcount}]{os.path.basename(filepath)} {file_url}')

def get_video_dimensions(video_path):
    cap = cv2.VideoCapture(video_path)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap.release()
    return width, height

def get_media_type(filename):
    if filename.lower().endswith(".jpg") or filename.lower().endswith(".webp") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png") or filename.lower().endswith(".gif"):
        return 'image'
    if filename.lower().endswith(".mp4") or filename.lower().endswith(".mov"):
        return 'video'

def dump_instagram(folder_path):
    for root, dirs, files in os.walk(folder_path):
        for folder in dirs:
            username = folder
            folder_path = os.path.join(root, folder)
            post_type = 'story' if folder_path.split('\\')[0] == 'stories' else 'post'
            for filename in os.listdir(folder_path):
                media_id = filename.replace('.mp4', '').replace('.jpg', '')
                filepath = os.path.join(folder_path, filename)
                mediatype = get_media_type(filename)
                upload_file(username=username, media_type=mediatype, filepath=filepath, media_id=media_id, post_type=post_type)

if __name__ == '__main__':
    print('Starting processing...')

    newDB, newCursor = config.gen_connection()
    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    newCursor.execute("SELECT media_id FROM media")
    existing_files = [image[0] for image in newCursor.fetchall()]

    dump_instagram('media/posts')
    dump_instagram('media/stories')

    scan_dupes('media/posts')
    scan_dupes('media/stories')

    clean_empty_folders('media/posts')
    clean_empty_folders('media/stories')

    print("Processing completed.")

@@ -0,0 +1,110 @@
from BunnyCDN.Storage import Storage
import os, uuid, config, funcs, cv2
from datetime import datetime
from PIL import Image

def dump_facebook(folder_path):
    for filename in os.listdir(folder_path):
        if os.path.isdir(os.path.join(folder_path, filename)):
            continue

        username = filename.split("'")[0]
        filepath = os.path.join(folder_path, filename)
        mediatype = funcs.get_media_type(filename)
        post_type = funcs.determine_post_type(filepath, mediatype)
        upload_file(username=username, media_type=mediatype, filepath=filepath, post_type=post_type)

    for folder in os.listdir(folder_path):
        if os.path.isdir(os.path.join(folder_path, folder)):
            username = folder
            for filename in os.listdir(os.path.join(folder_path, folder)):
                filepath = os.path.join(folder_path, folder, filename)
                mediatype = funcs.get_media_type(filename)
                post_type = funcs.determine_post_type(filepath, mediatype)
                upload_file(username=username, media_type=mediatype, filepath=filepath, post_type=post_type)

def upload_file(filepath, username, media_type='image', post_type='story', timestamp=None, user_id=None):
    filename = os.path.basename(filepath)
    file_extension = os.path.splitext(filename)[1].lower()

    file_hash = funcs.calculate_file_hash(filepath)
    if file_hash in existing_files:
        print('Duplicate file detected. Removing...')
        os.remove(filepath)
        return False

    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0

    if "FB_IMG" in filename: media_id = filename.split("_")[2].split(".")[0]
    else: media_id = uuid.uuid4().hex

    dirtype = funcs.determine_post_type(filepath, media_type)
    server_path = os.path.join('media', dirtype, username, f'{media_id}{file_extension}')

    obj_storage.PutFile(filepath, server_path)
    file_url = f"https://storysave.b-cdn.net/{server_path}"

    if media_type == 'image':
        with Image.open(filepath) as img:
            width, height = img.size
    else:
        width, height = funcs.get_video_dimensions(filepath)

    thumbnail_url = None
    if media_type == 'video':
        thumbPath = f'temp/{media_id}.jpg'

        cap = cv2.VideoCapture(filepath)
        ret, frame = cap.read()
        cv2.imwrite(thumbPath, frame)
        cap.release()

        obj_storage.PutFile(thumbPath, f'thumbnails/{media_id}.jpg')
        thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{media_id}.jpg"

    post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()

    if post_type == 'stories':
        post_type = 'story'
    else:
        post_type = 'post'

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, user_id, platform, hash, filename, duration, thumbnail) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, post_type, post_date, user_id, 'facebook', file_hash, filename, duration, thumbnail_url)

    try:
        newCursor.execute(query, values)
        newDB.commit()
        print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
    except Exception as e:
        print(f"Database error: {e}")
        return False

    try:
        if newCursor.rowcount > 0:
            os.remove(filepath)
    except Exception as e:
        print(f"Failed to remove local file {filepath}: {e}")

    return True

if __name__ == '__main__':
    print('Starting processing...')

    newDB, newCursor = config.gen_connection()
    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    newCursor.execute("SELECT hash FROM media WHERE platform='facebook' AND hash IS NOT NULL")
    existing_files = [image[0] for image in newCursor.fetchall()]

    dump_facebook('facebook/')

    print("Processing completed.")

@@ -0,0 +1,82 @@
from BunnyCDN.Storage import Storage
from datetime import datetime
import os, config, funcs
from PIL import Image

def dump_instagram(folder_path):
    for filename in os.listdir(folder_path):
        parts = filename.split('_')
        try:
            username = '_'.join(parts[:-2]) # Join all except last two
            timestamp = int(parts[-2]) # Second last is timestamp
            user_id = int(parts[-1].split('.')[0]) # Last part before extension is user_id
        except Exception as e:
            print(f"Invalid filename: {filename}. Error: {e}")
            continue

        filepath = os.path.join(folder_path, filename)
        mediatype = funcs.get_media_type(filename)
        post_type = funcs.determine_post_type(filepath, mediatype)

        UploadMedia(username=username, media_type=mediatype, filepath=filepath, post_type=post_type, timestamp=timestamp, user_id=user_id)

def UploadMedia(filepath, username, media_type='image', post_type='story', timestamp=None, user_id=None):
    if 'tero' in username:
        pass

    filename = os.path.basename(filepath)
    file_extension = os.path.splitext(filename)[1].lower()

    file_hash = funcs.calculate_file_hash(filepath)
    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0
    post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()

    dirtype = funcs.determine_post_type(filepath, media_type)
    server_path = f'media/{dirtype}/{username}/{file_hash}{file_extension}'
    file_url = f"https://storysave.b-cdn.net/{server_path}"

    if file_hash in existing_files:
        print('Duplicate file detected. Removing...')
        os.remove(filepath)
        return True

    obj_storage.PutFile(filepath, server_path)

    if media_type == 'image':
        with Image.open(filepath) as img:
            width, height = img.size
    else:
        width, height = funcs.get_video_dimensions(filepath)

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, user_id, hash, filename, duration) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, post_type, post_date, user_id, file_hash, filename, duration)

    newCursor.execute(query, values)
    newDB.commit()

    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')

    os.remove(filepath)
    return True

if __name__ == '__main__':
    print('Starting processing...')

    newDB, newCursor = config.gen_connection()
    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    newCursor.execute("SELECT hash FROM media WHERE platform='instagram' AND hash IS NOT NULL")
    existing_files = [image[0] for image in newCursor.fetchall()]

    dump_instagram('storysaver/missingdata/')

    print("Processing completed.")

@@ -0,0 +1,67 @@
from BunnyCDN.Storage import Storage
import os, uuid, config, funcs
from datetime import datetime
from PIL import Image

def dump_facebook(folder_path):
    for folder in os.listdir(folder_path):
        if os.path.isdir(os.path.join(folder_path, folder)):
            username = folder
            for filename in os.listdir(os.path.join(folder_path, folder)):
                filepath = os.path.join(folder_path, folder, filename)
                upload_file(username=username, filepath=filepath)

def upload_file(filepath, username):
    filename = os.path.basename(filepath)
    media_id = filename.split('.')[0]
    file_extension = os.path.splitext(filename)[1].lower()
    media_type = funcs.get_media_type(filename)

    file_hash = funcs.calculate_file_hash(filepath)
    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0
    width, height = funcs.get_video_dimensions(filepath) if media_type == 'video' else Image.open(filepath).size

    dirtype = funcs.determine_post_type(filepath, media_type)
    server_path = os.path.join('media', dirtype, username, f'{media_id}{file_extension}')

    obj_storage.PutFile(filepath, server_path)
    file_url = f"https://storysave.b-cdn.net/{server_path}"

    if file_hash in existing_files:
        print('Duplicate file detected. Removing...')
        os.remove(filepath)
        return False

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, platform, hash, filename, duration, media_id) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, 'tiktok', file_hash, filename, duration, media_id)

    newCursor.execute(query, values)
    newDB.commit()

    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')

    if newCursor.rowcount > 0:
        os.remove(filepath)

    return True

if __name__ == '__main__':
    print('Starting processing...')

    newDB, newCursor = config.gen_connection()
    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    newCursor.execute("SELECT hash FROM media WHERE platform='tiktok' AND hash IS NOT NULL")
    existing_files = [image[0] for image in newCursor.fetchall()]

    dump_facebook('tiktok/')

    print("Processing completed.")

@@ -0,0 +1,32 @@
import os, funcs
from funcs import generate_phash

def get_username(image, ready_images):
    for ready_image in ready_images:
        if os.path.basename(image) in ready_image:
            ready_image = ready_image.replace('\\', '/')
            return ready_image.split('/')[1]
    return None

ready_images = funcs.get_files('ready_to_upload')
ready_images = [image for image in ready_images if not image.endswith('.mp4')]

sorted_images = funcs.get_files('sorted')
sorted_images = [image for image in sorted_images if not image.endswith('.mp4')]

os.makedirs('already_processed', exist_ok=True)

for image in sorted_images:
    image = image.replace('\\', '/')
    username = image.split('/')[1]
    filename = os.path.basename(image)

    for ready_image in ready_images:
        if filename in ready_image:
            username = get_username(image, ready_images)
            newpath = ready_image.replace('ready_to_upload', 'already_processed')
            os.makedirs(os.path.dirname(newpath), exist_ok=True)

            print(f'Moving {image} which is a match for {ready_image} to already_processed')
            os.rename(image, newpath)
            print(f'Moved {ready_image} to already_processed')
            break

@@ -0,0 +1,56 @@
from BunnyCDN.Storage import Storage
import os, config, requests
from moviepy.editor import VideoFileClip

def get_media_type(filename):
    image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".webp"}
    video_extensions = {".mp4", ".mov"}

    extension = os.path.splitext(filename.lower())[1]

    if extension in image_extensions:
        return 'image'
    elif extension in video_extensions:
        return 'video'
    else:
        return 'unknown'

def determine_post_type(media_type):
    # Assuming the post type is directly based on media type.
    return media_type

def get_video_dimensions(filepath):
    with VideoFileClip(filepath) as clip:
        width, height = clip.size
    return width, height

def download_file(url):
    local_filename = url.split('/')[-1]
    # Note: Stream=True to avoid loading the whole file into memory
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename

if __name__ == '__main__':
    newDB, newCursor = config.gen_connection()
    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    posts = open('fucked', 'r')
    for item in posts:
        username, url = item.strip().split('~')
        media_id = url.split('/')[-1].split('.')[0]
        media_type = get_media_type(url)

        query = "INSERT IGNORE INTO media (username, media_type, platform, media_url) VALUES (%s, %s, %s, %s)"
        values = (username, media_type, 'facebook', url)

        try:
            newCursor.execute(query, values)
            newDB.commit()
            print(f'[{newCursor.rowcount}] records updated.{url}')
        except Exception as e:
            print(f"Database error: {e}")

    posts.close()

@@ -0,0 +1,40 @@
import config, os, json
from PIL import Image
import imagehash

def find_file(filename, directory):
    filename = filename.lower().split('.')[0]
    for root, dirs, files in os.walk(directory):
        for file in files:
            if filename in file:
                return os.path.join(root, file)
    return None

def generate_phash(image_path):
    image = Image.open(image_path)
    return str(imagehash.phash(image))

count = 0
cacheDir = 'sorted'
dataPath = 'pins.json'

os.makedirs(cacheDir, exist_ok=True)

medias = json.load(open(dataPath))

for item in medias:
    count += 1
    filepath = item['filepath']

    if os.path.exists(filepath):
        continue

    newfilepath = find_file(os.path.basename(filepath), cacheDir)
    if newfilepath:
        print(f"Found file {newfilepath} for {filepath}")
        item['filepath'] = newfilepath

with open(dataPath, 'w') as f:
    json.dump(medias, f)

@@ -0,0 +1,28 @@
import os, json
from funcs import generate_phash

count = 0
cacheDir = '_sort'
dataPath = 'pins.json'

os.makedirs(cacheDir, exist_ok=True)

medias = json.load(open(dataPath))

for item in medias:
    count += 1
    if item['type'] == 'image':
        filepath = item['filepath']

        if 'phash' in item:
            print(f"Skipping {count}/{len(medias)}: already processed.")
            continue

        if not os.path.exists(filepath):
            print(f"File {filepath} does not exist, skipping.")
            continue

        phash = generate_phash(filepath)
        item['phash'] = phash
        print(f"Processed {count}/{len(medias)}: with pHash {phash}")

with open(dataPath, 'w') as f:
    json.dump(medias, f)

@@ -0,0 +1,19 @@
import config, storysave_api

db, cursor = config.gen_connection()

usernames = []
with open('usernames.txt', 'r') as f:
    for line in f:
        usernames.append(line.strip())

for username in usernames:
    print(f"Username: {username}")

    user_id = storysave_api.get_user_id(username)

    # Update the user_id in the database
    cursor.execute("UPDATE media SET user_id = %s WHERE username = %s AND user_id IS NULL;", [user_id, username])
    db.commit()
    print(f"[{cursor.rowcount}] Updated user_id for {username}")

@@ -0,0 +1,94 @@
from BunnyCDN.Storage import Storage
from moviepy.editor import VideoFileClip
import config
import hashlib
import requests
import os

def file_hash_from_url(url, hash_algo='sha256'):
    h = hashlib.new(hash_algo)
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        for chunk in response.iter_content(8192):
            h.update(chunk)
        return h.hexdigest()
    else:
        raise Exception(f"Failed to download file: Status code {response.status_code}")

def get_video_duration(file_path):
    """
    Returns the duration of the video file in seconds.

    :param file_path: Path to the video file
    :return: Duration in seconds
    """
    try:
        with VideoFileClip(file_path) as video:
            return video.duration
    except:
        return 0

def file_hash(filename, hash_algo='sha256'):
    """
    Compute the hash of a file.

    :param filename: Path to the file.
    :param hash_algo: Hashing algorithm to use (e.g., 'sha256', 'md5').
    :return: Hexadecimal hash string.
    """
    # Create a hash object
    h = hashlib.new(hash_algo)

    # Open the file in binary mode and read in chunks
    with open(filename, 'rb') as file:
        while chunk := file.read(8192):
            h.update(chunk)

    # Return the hexadecimal digest of the hash
    return h.hexdigest()

# the hash of the images are different due to optimizer
#obj_storage = Storage('577cb82d-8176-4ccf-935ce0a574bf-fe4c-4012', 'altpins')
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

db, cursor = config.gen_connection()

cursor.execute("SELECT id, media_id, media_url FROM media WHERE duration = 0 AND media_type = 'video' AND status != 'deleted';")
results = cursor.fetchall()

count = 0
print(f"Found {len(results)} files to process.")

cacheDir = 'cache'

for result in results:
    count += 1
    videoID = result[0]
    mediaID = result[1]
    mediaURL = result[2]
    extension = mediaURL.split('.')[-1]

    serverPath = result[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if os.path.exists(localFilePath):
        print(f"File already exists: {localFilePath}")
    else:
        obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)

    duration = get_video_duration(localFilePath)
    if duration == 0:
        print(f"Failed to get duration for {localFilePath}")
        continue

    if duration < 1:
        duration = 1

    cursor.execute("UPDATE media SET duration = %s WHERE id = %s;", (duration, result[0]))
    db.commit()

    print(f"[{count}/{len(results)}] {result[1]}: {duration}, {cursor.rowcount}")

@@ -0,0 +1,47 @@
from BunnyCDN.Storage import Storage
import config
import hashlib
import os

def file_hash(filename, hash_algo='sha256'):
    """
    Compute the hash of a file.

    :param filename: Path to the file.
    :param hash_algo: Hashing algorithm to use (e.g., 'sha256', 'md5').
    :return: Hexadecimal hash string.
    """
    h = hashlib.new(hash_algo)
    with open(filename, 'rb') as file:
        while chunk := file.read(8192):
            h.update(chunk)
    return h.hexdigest()

#obj_storage = Storage('577cb82d-8176-4ccf-935ce0a574bf-fe4c-4012', 'altpins')
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

db, cursor = config.gen_connection()

cursor.execute("SELECT id, media_id, media_url FROM media WHERE hash IS NULL;")
results = cursor.fetchall()

count = 0
print(f"Found {len(results)} files to process.")

for result in results:
    count += 1

    serverPath = result[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(os.getcwd(), 'temp', os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        obj_storage.DownloadFile(storage_path=serverPath, download_path=os.path.join(os.getcwd(), 'temp'))

    filehash = file_hash(localFilePath)

    cursor.execute("UPDATE media SET hash = %s WHERE id = %s;", (filehash, result[0]))
    db.commit()

    print(f"[{count}/{len(results)}] {result[1]}: {filehash}, {cursor.rowcount}")

@@ -0,0 +1,47 @@
from BunnyCDN.Storage import Storage
import config, os, funcs
from PIL import Image

# the hash of the images are different due to optimizer
#obj_storage = Storage('577cb82d-8176-4ccf-935ce0a574bf-fe4c-4012', 'altpins')
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

db, cursor = config.gen_connection()

cursor.execute("SELECT id, media_id, media_url FROM media WHERE width = 0;")
results = cursor.fetchall()

count = 0
print(f"Found {len(results)} files to process.")

cacheDir = 'cache'

for result in results:
    count += 1
    videoID = result[0]
    mediaID = result[1]
    mediaURL = result[2]
    extension = mediaURL.split('.')[-1]

    serverPath = result[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if os.path.exists(localFilePath):
        print(f"File already exists: {localFilePath}")
    else:
        obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)

    mediaType = funcs.get_media_type(localFilePath)
    if mediaType == 'image':
        with Image.open(localFilePath) as img:
            width, height = img.size
    elif mediaType == 'video':
        width, height = funcs.get_video_dimensions(localFilePath)

    cursor.execute("UPDATE media SET width = %s, height=%s WHERE id = %s;", (width, height, videoID))
    db.commit()

    print(f"[{count}/{len(results)}] width: {width}, height: {height} {cursor.rowcount}")

@@ -0,0 +1,32 @@
import config
import os

temp_directory = "cache"
os.makedirs(temp_directory, exist_ok=True)

obj_storage = config.get_storage()
db, cursor = config.gen_connection()

cursor.execute("SELECT id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()

count = 0
print(f"Found {len(results)} files to process.")

for result in results:
    count += 1
    id, media_url = result

    serverPath = media_url.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(os.getcwd(), temp_directory, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        continue

    file_size = os.path.getsize(localFilePath)

    cursor.execute("UPDATE media SET file_size = %s WHERE id = %s;", (file_size, id))
    db.commit()

    print(f"[{count}/{len(results)}] {media_url}: {file_size}, {cursor.rowcount}")

@@ -1,9 +1,11 @@
from BunnyCDN.Storage import Storage
import config, os, cv2
from concurrent.futures import ThreadPoolExecutor

# this script will take a screenshot of the first frame of each video and upload it as a thumbnail to BunnyCDN
obj_storage = config.get_storage()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

db, cursor = config.gen_connection()

cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'video' AND thumbnail IS NULL and status = 'public';")
@@ -28,7 +30,7 @@ def DownloadFile(serverPath, cacheDir):
def ImportMedias():
    with ThreadPoolExecutor(max_workers=10) as executor:
        for video in results:
            serverPath = video[2].replace("https://cdn.altpins.com/", '').replace('//', '/').replace('\\', '/')
            serverPath = video[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
            executor.submit(DownloadFile, serverPath, cacheDir)
@@ -39,7 +41,7 @@ for result in results:
    mediaURL = result[2]
    extension = mediaURL.split('.')[-1]

    serverPath = result[2].replace("https://cdn.altpins.com/", '').replace('//', '/').replace('\\', '/')
    serverPath = result[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))
@@ -50,7 +52,7 @@ for result in results:
    cv2.imwrite('thumbnail.jpg', frame)
    cap.release()

    thumbnailURL = f"https://cdn.altpins.com/thumbnails/{itemID}.jpg"
    thumbnailURL = f"https://storysave.b-cdn.net/thumbnails/{itemID}.jpg"
    obj_storage.PutFile('thumbnail.jpg', f'thumbnails/{itemID}.jpg')

@@ -0,0 +1,36 @@
import config
from funcs import generate_phash

count = 0

storage = config.get_storage()
db, cursor = config.gen_connection()

generate_for = 'media_url'
media_type = 'image'

cursor.execute(f"SELECT id, {generate_for} FROM media WHERE media_type = %s AND phash IS NULL;", [media_type])
medias = cursor.fetchall()

for item in medias:
    count += 1
    itemID = item[0]
    media_url = item[1]

    server_path = media_url.replace('https://storysave.b-cdn.net/', '').replace('\\', '/')
    filepath = storage.DownloadFile(server_path, 'temp')
    if not filepath:
        print(f"Error downloading {server_path}")
        continue

    phash = generate_phash(filepath)
    if not phash:
        print(f"Error generating pHash for {filepath}")
        continue

    cursor.execute("UPDATE media SET phash = %s WHERE id = %s", [phash, itemID])
    db.commit()

    print(f"[{cursor.rowcount}] Processed {count}/{len(medias)}: with pHash {phash}")

@@ -0,0 +1,39 @@
import config, os
from funcs import generate_phash

db, cursor = config.gen_connection()

cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'image' AND phash = 0;")
results = cursor.fetchall()

count = 0
cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)

print(f"Found {len(results)} files to process.")

for result in results:
    count += 1
    itemID = result[0]
    mediaID = result[1]

    if not mediaID:
        print(f"Media ID is null, skipping.")
        continue

    mediaURL = result[2]
    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        print(f"File {localFilePath} does not exist, skipping.")
        continue

    phash = generate_phash(localFilePath)
    if not phash:
        print(f"Error generating pHash for {localFilePath}, skipping.")
        continue

    cursor.execute("UPDATE media SET phash = %s WHERE id = %s", (phash, itemID))
    db.commit()

    print(f"Processed {count}/{len(results)}: {mediaID} with pHash {phash}")

@@ -0,0 +1,74 @@
import config, os, threading, queue
from funcs import generate_phash

# Initialize database connection
db, cursor = config.gen_connection()

# Query the media table for unprocessed images
cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'image' AND phash = '0';")
results = cursor.fetchall()

# Setup cache directory
cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)

print(f"Found {len(results)} files to process.")

# Thread-safe queue for processed media
processed_media_queue = queue.Queue()

def process_media():
    """Thread function to update database with processed pHash values."""
    while True:
        try:
            item = processed_media_queue.get(timeout=10) # Timeout prevents infinite blocking
            if item is None: # Sentinel value to exit the loop
                break

            itemID, phash = item
            cursor.execute("UPDATE media SET phash = %s WHERE id = %s", (phash, itemID))
            db.commit()
            print(f"Updated database for ID {itemID} with pHash {phash}.")
        except queue.Empty:
            continue

# Start the database update thread
update_thread = threading.Thread(target=process_media, daemon=True)
update_thread.start()

# Main processing loop for generating pHash
count = 0
for result in results:
    count += 1
    itemID = result[0]
    mediaID = result[1]

    if not mediaID:
        print(f"Media ID is null, skipping.")
        continue

    mediaURL = result[2]
    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        print(f"File {localFilePath} does not exist, skipping.")
        continue

    phash = generate_phash(localFilePath)
    if not phash:
        print(f"Error generating pHash for {localFilePath}, skipping.")
        continue

    # Add the processed media to the queue
    processed_media_queue.put((itemID, phash))
    print(f"Processed {count}/{len(results)}: {mediaID} with pHash {phash}")

# Signal the update thread to stop
processed_media_queue.put(None)

# Wait for the update thread to finish
update_thread.join()

print("Processing completed.")

@@ -0,0 +1,43 @@
import os
import json
import config

# Establish database connection
db, cursor = config.gen_connection()

# Fetch rows with file_size = 0
cursor.execute("SELECT id, media_id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()

cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)

print(f"Found {len(results)} files to process.")

update_data = []

for result in results:
    itemID = result[0]
    media_id = result[1]

    if not media_id:
        print(f"Media ID is null for ID {itemID}, skipping.")
        continue

    mediaURL = result[2]
    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        print(f"File {localFilePath} does not exist for ID {itemID}, skipping.")
        continue

    file_size = os.path.getsize(localFilePath)
    update_data.append({"id": itemID, "file_size": file_size})

# Save the results to a JSON file
output_file = "update_data.json"
with open(output_file, 'w') as f:
    json.dump(update_data, f, indent=4)

print(f"Saved {len(update_data)} updates to {output_file}.")

cursor.close()
db.close()

File diff suppressed because it is too large.

@@ -0,0 +1,29 @@
import json
import config

# Establish database connection
db, cursor = config.gen_connection()

# Load update data from the JSON file
input_file = "update_data.json"
with open(input_file, 'r') as f:
    update_data = json.load(f)

print(f"Loaded {len(update_data)} records to update.")

# Process each record one by one
for count, item in enumerate(update_data, start=1):
    item_id = item["id"]
    file_size = item["file_size"]

    try:
        cursor.execute("UPDATE media SET file_size = %s WHERE id = %s", (file_size, item_id))
        db.commit()
        print(f"Processed {count}/{len(update_data)}: ID {item_id} updated with file size {file_size}.")
    except Exception as e:
        print(f"Error updating ID {item_id}: {e}")
        db.rollback()

print("All updates completed.")

cursor.close()
db.close()

@@ -0,0 +1,31 @@
from BunnyCDN.Storage import Storage
import config, os

db, cursor = config.gen_connection()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

cursor.execute("SELECT id, media_id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()

print(f"Found {len(results)} files to process.")

cacheDir = 'cache'

for result in results:
    itemID = result[0]
    mediaURL = result[2]

    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        continue

    file_size = os.path.getsize(localFilePath)

    cursor.execute("UPDATE media SET file_size = %s WHERE id = %s;", (file_size, itemID))
    db.commit()

    print(f"Processed ID {itemID}: updated with file size {file_size}.")

cursor.close()
db.close()

@ -0,0 +1,112 @@
from BunnyCDN.Storage import Storage
from PIL import Image
import os, uuid, cv2, config
def scan_dupes(folder_path):
for root, dirs, files in os.walk(folder_path):
for folder in dirs:
folder_path = os.path.join(root, folder)
for filename in os.listdir(folder_path):
media_id = filename.replace('.mp4', '').replace('.jpg', '')
filepath = os.path.join(folder_path, filename)
if media_id:
try:
if int(media_id) in existing_files:
print(f'Duplicate')
os.remove(filepath)
except:
pass
def clean_empty_folders(directory):
for foldername, subfolders, filenames in os.walk(directory, topdown=False):
for subfolder in subfolders:
folder_path = os.path.join(foldername, subfolder)
if not os.listdir(folder_path):
os.rmdir(folder_path)
print(f"Removed empty folder: {folder_path}")
def upload_file(filepath, username, media_id = None, media_type='image', post_type = 'story'):
filename = os.path.basename(filepath)
file_extension = filename.split('.')[-1]
try:
if int(media_id) in existing_files:
print(f'Duplicate')
os.remove(filepath)
return True
except: media_id = uuid.uuid4().hex
dirtype = 'stories' if post_type == 'story' else 'posts'
server_path = f'users/{dirtype}/{username}/{media_id}.{file_extension}'
obj_storage.PutFile(filepath, server_path)
file_url = f"https://storysave.b-cdn.net/{server_path}"
if media_type == 'image':
with Image.open(filepath) as img:
width, height = img.size
else:
width, height = get_video_dimensions(filepath)
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type) VALUES (%s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, media_id, post_type)
newCursor.execute(query, values)
newDB.commit()
os.remove(filepath)
print(f'[{newCursor.rowcount}]{os.path.basename(filepath)} {file_url}')
def get_video_dimensions(video_path):
cap = cv2.VideoCapture(video_path)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
cap.release()
return width, height
def get_media_type(filename):
if filename.lower().endswith(".jpg") or filename.lower().endswith(".webp") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png") or filename.lower().endswith(".gif"):
return 'image'
if filename.lower().endswith(".mp4") or filename.lower().endswith(".mov"):
return 'video'
def dump_instagram(folder_path):
for root, dirs, files in os.walk(folder_path):
for folder in dirs:
username = folder
folder_path = os.path.join(root, folder)
post_type = 'story' if folder_path.split('\\')[0] == 'stories' else 'post'
for filename in os.listdir(folder_path):
media_id = filename.replace('.mp4', '').replace('.jpg', '')
filepath = os.path.join(folder_path, filename)
mediatype = get_media_type(filename)
upload_file(username=username, media_type=mediatype, filepath=filepath, media_id=media_id, post_type=post_type)
if __name__ == '__main__':
print('Starting processing...')
newDB, newCursor = config.gen_connection()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
newCursor.execute("SELECT media_id FROM media")
existing_files = [image[0] for image in newCursor.fetchall()]
scan_dupes('media/posts')
scan_dupes('media/stories')
scan_dupes('StorySave/')
dump_instagram('media/posts')
dump_instagram('media/stories')
dump_instagram('StorySave/')
clean_empty_folders('media/posts')
clean_empty_folders('media/stories')
clean_empty_folders('StorySave/')
print("Processing completed.")

@ -0,0 +1,33 @@
import bunny, json
medias = json.load(open('videos.json', 'r'))
videoIDS = [media['url'].split('/')[-1] for media in medias]
videos = bunny.list_videos()
with open('allVideos.json', 'w') as f:
json.dump(videos, f, indent=4)
missingVideos = []
for video in videos:
if video['guid'] in videoIDS:
continue
missingVideos.append(video)
datas = []
for video in missingVideos:
data = {
'guid': video['guid'],
'title': video['title'],
'length': video['length'],
'width': video['width'],
'height': video['height'],
'availableResolutions': video['availableResolutions'],
'storageSize': video['storageSize'],
'hasMP4Fallback': video['hasMP4Fallback'],
'category': video['category'],
}
datas.append(data)
with open('missing_videos.json', 'w') as f:
json.dump(datas, f, indent=4)

@ -0,0 +1,27 @@
from BunnyCDN.Storage import Storage
import os, json
altpins_obj_storage = Storage('577cb82d-8176-4ccf-935ce0a574bf-fe4c-4012', 'altpins')
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
medias = json.load(open('db_pins.json', 'r'))
count = 0
print(f"Found {len(medias)} files to process.")
cacheDir = 'old_altpins_cache'
for media in medias:
count += 1
username = media['title']
mediaID = media['photo_id']
mediaURL = media['url']
extension = mediaURL.split('.')[-1]
serverPath = mediaURL.replace("https://altpins.b-cdn.net/", '').replace('//', '/').replace('\\', '/').replace('https://altpins.b-cdn.net/', '')
localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))
if os.path.exists(localFilePath):
continue
altpins_obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)
print(f"Downloaded {count}/{len(medias)}: {localFilePath}")

@ -0,0 +1,16 @@
import json, bunny, os
from concurrent.futures import ThreadPoolExecutor
medias = json.load(open('missing_videos.json', 'r'))
#videoIDS = [media['url'].split('/')[-1] for media in medias]
videoIDS = [media['guid'] for media in medias]
with ThreadPoolExecutor(max_workers=10) as executor:
for id in videoIDS:
filePath = f"MISSING_STREAM_VIDEOS/{id}.zip"
if os.path.exists(filePath):
print(f'Video already exists as {filePath}. Skipping...')
continue
executor.submit(bunny.download_video, id)

@ -0,0 +1,29 @@
import os, json, config
# Load the data
pins = json.load(open('db_pins.json', 'r'))
files = os.listdir('STORAGE_IMPORTED/')
db, cursor = config.gen_connection()
cursor.execute('SELECT hash FROM media WHERE hash IS NOT NULL;')
existing_hashes = [hash[0] for hash in cursor.fetchall()]
for pin in pins[:]:  # iterate over a copy; removing from the list being iterated skips elements
    if pin['hash'] in existing_hashes:
        print(f"Found {pin['hash']} in the imported folder.")
        pins.remove(pin)
alreadyImported = []
for pin in pins[:]:  # iterate over a copy so pins can be mutated safely
    filepath = pin['filepath']
    username = pin['title']
    filename = os.path.basename(filepath)
    if filename in files:
        print(f"Found {filename} in the imported folder.")
        alreadyImported.append(pin)
        pins.remove(pin)
# Save to the file
json.dump(pins, open('db_pins.json', 'w'))
json.dump(alreadyImported, open('db_pins_imported.json', 'w'))

@ -0,0 +1,14 @@
import os, json, bunny
medias = json.load(open('allVideos.json', 'r'))
mp4Medias = [media for media in medias if media['hasMP4Fallback'] == True]
missing = json.load(open('missing_videos.json', 'r'))
count = 0
cacheDir = 'old_mp4fallback_cache'
print(f"Found {len(medias)} files to process.")
for media in mp4Medias:
count += 1
filePath = os.path.join(cacheDir, media['guid'] + '.mp4')

@ -0,0 +1,36 @@
import os, json, bunny, config
db, cursor = config.gen_connection()
cursor.execute('SELECT media_id FROM media WHERE media_id IS NOT NULL;')
mediaIDS = cursor.fetchall()
pins = json.load(open('pins.json', 'r'))
videos = json.load(open('db_videos.json', 'r'))
pins = json.load(open('db_pins.json', 'r'))
ids = [video['id'] for video in videos]
# Filter into a new list; removing items while iterating over pins would skip entries.
pins = [pin for pin in pins if pin['id'] not in ids]
# save to the file
json.dump(pins, open('db_pins.json', 'w'))
medias = json.load(open('allVideos.json', 'r'))
mp4Medias = [media for media in medias if media['hasMP4Fallback'] == True]
missing = json.load(open('missing_videos.json', 'r'))
count = 0
cacheDir = 'old_mp4fallback_cache'
print(f"Found {len(medias)} files to process.")
for media in mp4Medias:
count += 1
filePath = os.path.join(cacheDir, media['guid'] + '.mp4')

@ -0,0 +1,53 @@
import os, json, funcs
STORAGE_IMPORTED = 'STORAGE_IMPORTED'
pins = json.load(open('db_pins.json', 'r'))
for pin in pins:
filename = pin['url'].split('/')[-1]
filepath = os.path.join(STORAGE_IMPORTED, filename)
pin['filename'] = filename
if not pin['hash']:
pin['hash'] = funcs.calculate_file_hash(filepath)
json.dump(pins, open('db_pins.json', 'w'), indent=4)
files = os.listdir(STORAGE_IMPORTED)
for file in files:
filepath = os.path.join(STORAGE_IMPORTED, file)
fileHash = funcs.calculate_file_hash(filepath)
if fileHash not in file:
    extension = os.path.splitext(file)[1]
    # keep the extension, since the move step below looks for '<hash>.<ext>' filenames
    print(f'Renaming {file} to {fileHash}{extension}')
    os.rename(filepath, os.path.join(STORAGE_IMPORTED, fileHash + extension))
pins_by_username = {}
for pin in pins:
username = pin['title']
if username not in pins_by_username:
pins_by_username[username] = []
pins_by_username[username].append(pin)
for username, username_pins in pins_by_username.items():
username_folder = os.path.join(STORAGE_IMPORTED, username)
os.makedirs(username_folder, exist_ok=True)
for pin in username_pins:
photo_id = pin['photo_id']
photo_url = pin['url']
fileHash = pin['hash']
if not fileHash:
continue
extension = photo_url.split('.')[-1]
filename = f'{fileHash}.{extension}'
filePath = os.path.join(STORAGE_IMPORTED, filename)
outputPath = os.path.join(STORAGE_IMPORTED, username, filename)
if os.path.exists(outputPath):
print(f'File {outputPath} already exists. Skipping...')
continue
print(f'Moving {photo_url} to {outputPath}')
os.rename(filePath, outputPath)

@ -0,0 +1,57 @@
import os
import hashlib
# Directories
fucked_dir = 'tiktoks/fucked/aleksandra'
source_dir = 'tiktoks/waiting_for_process/aleksandraverse'
def hash_file(filepath):
"""Generate MD5 hash of a file."""
hash_md5 = hashlib.md5()
with open(filepath, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
def get_file_hashes(directory):
"""Generate a dictionary of file hashes for all files in a directory."""
file_hashes = {}
for root, _, files in os.walk(directory):
for file in files:
file_path = os.path.join(root, file)
file_hashes[file_path] = hash_file(file_path)
return file_hashes
def files_are_identical(file1, file2):
"""Compare two files byte-by-byte."""
with open(file1, "rb") as f1, open(file2, "rb") as f2:
while True:
chunk1 = f1.read(4096)
chunk2 = f2.read(4096)
if chunk1 != chunk2:
return False
if not chunk1: # End of file
return True
def remove_duplicates(fucked_dir, source_files):
"""Remove files in 'fucked' that are identical to those in 'source_files'."""
for root, _, files in os.walk(fucked_dir):
for file in files:
file_path = os.path.join(root, file)
for source_file in source_files:
if files_are_identical(file_path, source_file):
print(f"Duplicate found. Removing: {file_path}")
os.remove(file_path)
break
def main():
print("Scanning source directory for hashes...")
source_hashes = get_file_hashes(source_dir)
print("Scanning 'fucked' directory for duplicates...")
remove_duplicates(fucked_dir, source_hashes)
print("Cleanup complete.")
if __name__ == "__main__":
main()
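The MD5 hashes collected by get_file_hashes are never consulted in remove_duplicates, which instead re-reads every candidate byte-by-byte against every source file. A sketch of a hash-first variant built on the same helpers, reading each file only once:

def remove_duplicates_by_hash(fucked_dir, source_dir):
    """Remove files in fucked_dir whose MD5 matches any file in source_dir."""
    source_hashes = set(get_file_hashes(source_dir).values())
    for root, _, files in os.walk(fucked_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if hash_file(file_path) in source_hashes:
                print(f"Duplicate found. Removing: {file_path}")
                os.remove(file_path)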

@ -0,0 +1,49 @@
import json, os
from videohash import VideoHash
from moviepy.editor import VideoFileClip
def is_valid_video(file_path):
try:
with VideoFileClip(file_path) as video:
return True
except Exception as e:
print(f"Invalid video {file_path}: {str(e)}")
return False
def load_hashes(file_path):
try:
with open(file_path, 'r') as file:
return json.load(file)
except FileNotFoundError:
return {}
def save_hashes(hashes, file_path):
with open(file_path, 'w') as file:
json.dump(hashes, file, indent=4)
hashes = load_hashes('video_hashes.json')
video_directory = 'STORAGE'
for username in os.listdir(video_directory):
user_dir = os.path.join(video_directory, username)
if not os.path.isdir(user_dir):
continue
for video_file in os.listdir(user_dir):
video_path = os.path.join(user_dir, video_file)
if not video_file.endswith(('.mp4', '.mkv', '.avi')) or not is_valid_video(video_path):
continue
if username in hashes and any(v[0] == video_file for v in hashes[username]):
continue
try:
video_hash = VideoHash(path=video_path)
if username in hashes:
hashes[username].append((video_file, video_hash.hash))
else:
hashes[username] = [(video_file, video_hash.hash)]
except Exception as e:
print(f"Error processing {video_file}: {str(e)}")
save_hashes(hashes, 'video_hashes.json')

@ -0,0 +1,17 @@
import os, config, funcs
db, cursor = config.gen_connection()
cursor.execute("SELECT phash FROM media WHERE phash IS NOT NULL")
phashes = set([x[0] for x in cursor.fetchall()])
files = funcs.get_files("check_if_exists")
for file in files:
image_phash = funcs.generate_phash(file)
if image_phash in phashes:
print(f"File {file} exists in the database")
os.remove(file)
funcs.cleanEmptyFolders("check_if_exists")
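Exact membership only catches byte-identical phashes; re-encoded or lightly cropped copies usually differ by a few bits. A sketch of a Hamming-distance check with imagehash, assuming the stored phashes are hex strings as produced by funcs.generate_phash (the threshold of 4 bits is an arbitrary choice):

import imagehash

def is_near_duplicate(phash_hex, known_hexes, max_distance=4):
    """True if phash_hex is within max_distance bits of any known phash."""
    candidate = imagehash.hex_to_hash(phash_hex)
    return any(candidate - imagehash.hex_to_hash(known) <= max_distance
               for known in known_hexes)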

@ -0,0 +1,159 @@
from snapchat import get_data, get_stories, get_highlight_stories
from datetime import datetime
import requests
import config
import json
import os
"""
media_url_filename = url.split('/')[-1].split('?')[0]
etag = response.headers.get('ETag', '').replace('"', '')
filename = f"{username}~{timestamp}-{media_url_filename}~{etag}{extension}"
filepath = os.path.join(directory, 'highlights', filename)
"""
directory = "snapchat"
data_directory = "data"
def get_existing_snap_ids(directory):
existing_snap_ids = set()
for root, _, files in os.walk(directory):
for file in files:
if '~' not in file:
continue
filename, _ = os.path.splitext(file)
snap_id = filename.split('~')[2]
existing_snap_ids.add(snap_id)
return existing_snap_ids
def find_duplicate_snap(existing_snaps, snap_id, username):
for snap in existing_snaps:
if username == snap[2]:
if snap_id in snap[1]:
return snap
return False
def archive_data(data, username):
data_filename = f"{username}~{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"
data_filepath = os.path.join(data_directory, data_filename)
with open(data_filepath, 'w') as f:
f.write(json.dumps(data))
print(f"Archived data for {username} at {data_filepath}")
def get_file_extension(url):
response = requests.head(url)
if response.status_code != 200:
print(f"Failed to access media {url}")
return None
content_type = response.headers.get('Content-Type', '')
if 'image' in content_type:
return '.jpg'
elif 'video' in content_type:
return '.mp4'
else:
print(f"Unknown content type for media {url}")
return None
def extract_file_type(url):
file_types = {
'400': '.jpg',
'1322': '.mp4',
'1325': '.mp4',
'1034': '.mp4',
'1023': '.jpg'
}
base_url = url.split("?")[0] # Remove query string
snap_data = base_url.split('/')[-1]
# Extract the file type number
data_parts = snap_data.split('.')
if len(data_parts) > 1:
file_type_number = data_parts[1]
if file_type_number in file_types:
return file_types[file_type_number]
else:
print(f"Unexpected URL format: {base_url}")
return None
def download_media(url, filepath):
if os.path.exists(filepath):
print(f"File {filepath} already exists. Skipping download.")
return filepath
response = requests.get(url)
if response.status_code != 200:
print(f"Failed to download media {url}")
return None
with open(filepath, 'wb') as f:
f.write(response.content)
return filepath
def main():
if not os.path.exists(directory):
os.makedirs(directory)
db, cursor = config.gen_connection()
cursor.execute("SELECT username FROM following WHERE platform = 'snapchat'")
usernames = [row[0] for row in cursor.fetchall()]
cursor.execute("SELECT id, filename, username FROM media WHERE filename IS NOT NULL AND platform = 'snapchat'")
existing_medias = cursor.fetchall()
existing_snap_ids = get_existing_snap_ids(directory)
for username in usernames:
print(f"Getting stories for {username}...")
data = get_data(username)
if not data:
continue
archive_data(data, username)
print("Getting stories...")
stories = get_stories(data)
print("Getting highlights...")
stories.extend(get_highlight_stories(data))
for story in stories:
snap_id = story['snap_id']
url = story['url']
timestamp = story['timestamp']
duplicate_snap = find_duplicate_snap(existing_medias, snap_id, username)
if duplicate_snap:
print(f"Media {snap_id} already exists. Skipping download.")
continue
# Check if media already exists
if snap_id in existing_snap_ids:
print(f"Media {snap_id} already exists. Skipping download.")
continue
# Determine file extension using HEAD request.
# TODO: find a better way to determine file extension without downloading the file.
extension = extract_file_type(url)
if not extension:
continue
filename = f"{username}~{timestamp}~{snap_id}{extension}"
filepath = os.path.join(directory, filename)
# Check if file already exists
if os.path.exists(filepath):
print(f"File {filename} already exists. Skipping download.")
continue
# Download the media
filepath = download_media(url, filepath)
print(f"Downloaded {filename} at {timestamp}")
if __name__ == "__main__":
main()
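When a snap URL carries a file-type code that is missing from the file_types map, extract_file_type returns None and the snap is skipped, even though get_file_extension above could still resolve it with a HEAD request. A small sketch combining the two, cheapest check first:

def resolve_extension(url):
    """Prefer the URL-code lookup; fall back to a HEAD request."""
    extension = extract_file_type(url)
    if extension:
        return extension
    return get_file_extension(url)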

@ -0,0 +1,154 @@
from datetime import datetime
import config
import funcs
import cv2
import os
directory = 'media/instagram/'
def UploadMedia(media):
media_id = media['media_id']
username = media['username']
post_date = media['timestamp']
user_id = media['user_id']
filepath = media['filepath']
highlight_id = media['highlight_id']
post_type = media['post_type']
thumbnail_url = None
phash = None
if media_id and int(media_id) in existing_files:
print('Duplicate file detected. Removing...')
os.remove(filepath)
return True
filename = os.path.basename(filepath)
file_extension = os.path.splitext(filename)[1].lower()
media_type = funcs.get_media_type(filename)
file_hash = funcs.calculate_file_hash(filepath)
width, height = funcs.get_media_dimensions(filepath)
duration = funcs.get_video_duration(filepath)
if media_type == 'video':
try:
thumbPath = f'temp/{media_id}.jpg'
cap = cv2.VideoCapture(filepath)
ret, frame = cap.read()
cv2.imwrite(thumbPath, frame)
cap.release()
obj_storage.PutFile(thumbPath, f'thumbnails/{media_id}.jpg') # slower
thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{media_id}.jpg"
phash = funcs.generate_phash(thumbPath)
os.remove(thumbPath)
except:
print('Error generating thumbnail. Skipping...')
return False
elif media_type == 'image':
phash = funcs.generate_phash(filepath)
if media_id:
newFilename = f'{media_id}{file_extension}'
else:
newFilename = f'{file_hash}{file_extension}'
server_path = f'media/{post_type}/{username}/{newFilename}'
file_url = f"https://storysave.b-cdn.net/{server_path}"
obj_storage.PutFile(filepath, server_path) # slow as fuck
if highlight_id:
newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)", (highlight_id, user_id, media_id))
newDB.commit()
print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, 'instagram')
newCursor.execute(query, values) # slower
newDB.commit()
print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
os.remove(filepath)
return True
def get_user_id(username):
username = username.lower()
if username in existing_users:
return existing_users[username]
return None
def get_media():
medias = []
post_types = {
'posts': 'post',
'stories': 'story',
'profile': 'profile',
}
for post_type in os.listdir(directory):
users_dir = os.path.join(directory, post_type)
if not os.path.isdir(users_dir):
continue
users = os.listdir(users_dir)
for username in users:
user_path = os.path.join(directory, post_type, username)
if not os.path.isdir(user_path):
continue
for filename in os.listdir(user_path):
if filename.startswith('.'):
continue
data = {}
filepath = os.path.join(user_path, filename)
if 'com.instagram.android__' in filename:
timestamp_str = filename.split('__')[-1].split('.')[0]
data['timestamp'] = datetime.strptime(timestamp_str, '%Y%m%d%H%M%S%f')
else:
data['timestamp'] = datetime.now()
data['post_type'] = post_types[post_type]
data['username'] = username
data['filepath'] = filepath
data['media_id'] = None
data['user_id'] = get_user_id(data['username'])
data['highlight_id'] = None
medias.append(data)
return medias
def dump_instagram():
medias = get_media()
for media in medias:
UploadMedia(media)
existing_files.append(media['media_id'])
if __name__ == '__main__':
print('Starting processing...')
if not os.listdir(directory):
print('No files to process. Exiting...')
exit()
newDB, newCursor = config.gen_connection()
obj_storage = config.get_storage()
newCursor.execute("SELECT media_id FROM media WHERE media_id IS NOT NULL")
existing_files = [image[0] for image in newCursor.fetchall()]
newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
existing_users = {user[0].lower(): user[1].lower() for user in newCursor.fetchall()}
dump_instagram()
print("Processing completed.")

@ -0,0 +1,34 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Image Gallery</title>
<style>
.gallery {
display: flex;
flex-wrap: wrap;
}
.gallery img {
margin: 10px;
max-width: 200px;
height: auto;
}
.gallery div {
text-align: center;
margin: 10px;
}
</style>
</head>
<body>
<h1>Image Gallery</h1>
<div class="gallery">
{% for image in images %}
<div>
<h3>{{ image['username'] }}</h3>
<img src="{{ image['media_url'] }}" alt="Image for {{ image['username'] }}">
</div>
{% endfor %}
</div>
</body>
</html>

@ -0,0 +1,84 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Media Gallery</title>
<style>
body {
display: flex;
justify-content: center;
}
.container {
max-width: 1600px;
width: 100%;
padding: 20px;
}
.media-container {
column-count: 4;
column-gap: 10px;
}
.media-item {
break-inside: avoid;
margin-bottom: 10px;
}
img, video {
width: 100%;
height: auto;
display: block;
}
</style>
</head>
<body>
<div class="container">
<h1>Media Gallery</h1>
<div class="media-container" id="media-container"></div>
</div>
<script>
let page = 0;
async function loadMore() {
const response = await fetch(`/load-more?page=${page}`);
const mediaFiles = await response.json();
const container = document.getElementById('media-container');
mediaFiles.forEach(file => {
const mediaItem = document.createElement('div');
mediaItem.className = 'media-item';
if (file.endsWith('.png') || file.endsWith('.jpg') || file.endsWith('.jpeg') || file.endsWith('.gif')) {
const img = document.createElement('img');
img.src = `/media/${file}`;
img.alt = file;
mediaItem.appendChild(img);
} else if (file.endsWith('.mp4') || file.endsWith('.mkv') || file.endsWith('.mov')) {
const video = document.createElement('video');
video.controls = false;
video.autoplay = true;
video.muted = true;
video.loop = true;
const source = document.createElement('source');
source.src = `/media/${file}`;
source.type = 'video/mp4';
video.appendChild(source);
mediaItem.appendChild(video);
}
container.appendChild(mediaItem);
});
page += 1;
}
window.addEventListener('scroll', () => {
if (window.innerHeight + window.scrollY >= document.body.offsetHeight) {
loadMore();
}
});
// Initial load
loadMore();
</script>
</body>
</html>

@ -0,0 +1,32 @@
from flask import Flask, render_template, send_from_directory, jsonify, request
import os
app = Flask(__name__)
media_dir = 'storysaver'
MEDIA_PER_PAGE = 20
def get_media_files(start, count):
media_files = []
for root, dirs, files in os.walk(media_dir):
for filename in files:
if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.mp4', '.mkv', '.mov')):
file_path = os.path.relpath(os.path.join(root, filename), media_dir)
media_files.append(file_path)
return media_files[start:start + count]
@app.route('/')
def index():
return render_template('index.html')
@app.route('/media/<path:filename>')
def media(filename):
return send_from_directory(media_dir, filename)
@app.route('/load-more')
def load_more():
page = int(request.args.get('page', 0))
media_files = get_media_files(page * MEDIA_PER_PAGE, MEDIA_PER_PAGE)
return jsonify(media_files)
if __name__ == '__main__':
app.run(host='0.0.0.0', port=5000, debug=True)
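get_media_files re-walks the entire media tree and materialises the full list on every /load-more request just to slice out one page. A sketch that slices a generator instead, so the walk stops as soon as the requested page is filled (same media_dir and extension filter as above):

from itertools import islice
import os

ALLOWED_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.mp4', '.mkv', '.mov')

def iter_media_files(base_dir):
    for root, dirs, files in os.walk(base_dir):
        for filename in files:
            if filename.lower().endswith(ALLOWED_EXTENSIONS):
                yield os.path.relpath(os.path.join(root, filename), base_dir)

def get_media_files(start, count):
    return list(islice(iter_media_files(media_dir), start, start + count))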

@ -0,0 +1,133 @@
from BunnyCDN.Storage import Storage
from PIL import Image
import os, uuid, cv2, config
import hashlib
from moviepy.editor import VideoFileClip
def scan_dupes(folder_path):
newCursor.execute("SELECT hash FROM media")
existing_files = [image[0] for image in newCursor.fetchall()]
for root, dirs, files in os.walk(folder_path):
for folder in dirs:
folder_path = os.path.join(root, folder)
for filename in os.listdir(folder_path):
media_id = filename.replace('.mp4', '').replace('.jpg', '')
filepath = os.path.join(folder_path, filename)
if media_id:
fileHash = calculate_file_hash(filepath)
if fileHash in existing_files:
print(f'Duplicate')
os.remove(filepath)
def clean_empty_folders(directory):
for foldername, subfolders, filenames in os.walk(directory, topdown=False):
for subfolder in subfolders:
folder_path = os.path.join(foldername, subfolder)
if not os.listdir(folder_path):
os.rmdir(folder_path)
print(f"Removed empty folder: {folder_path}")
def upload_file(filepath, username, media_type='image', post_type = 'story'):
filename = os.path.basename(filepath)
file_extension = filename.split('.')[-1]
dirtype = 'stories' if post_type == 'story' else 'posts'
#dirtype = 'profile'
fileHash = calculate_file_hash(filepath)
try:
    # media_id and existing_files are not defined in this version of the script,
    # so this check always raises and falls through to the random id below.
    if int(media_id) in existing_files:
        print('Duplicate')
        os.remove(filepath)
        return True
except: media_id = uuid.uuid4().hex
server_path = f'users/{dirtype}/{username}/{media_id}.{file_extension}'
obj_storage.PutFile(filepath, server_path)
file_url = f"https://storysave.b-cdn.net/{server_path}"
duration = 0
if media_type == 'image':
try:
with Image.open(filepath) as img:
width, height = img.size
except:
os.remove(filepath)
return
else:
width, height = get_video_dimensions(filepath)
duration = get_video_duration(filepath)
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, hash, filename, media_id, duration) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, post_type, fileHash, filename, media_id, duration)
newCursor.execute(query, values)
newDB.commit()
os.remove(filepath)
print(f'[{newCursor.rowcount}]{os.path.basename(filepath)} {file_url}')
def get_video_dimensions(video_path):
cap = cv2.VideoCapture(video_path)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
cap.release()
return width, height
def get_video_duration(file_path):
"""
Returns the duration of the video file in seconds.
:param file_path: Path to the video file
:return: Duration in seconds
"""
with VideoFileClip(file_path) as video:
return video.duration
def get_media_type(filename):
if filename.lower().endswith(".jpg") or filename.lower().endswith(".webp") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png") or filename.lower().endswith(".gif"):
return 'image'
if filename.lower().endswith(".mp4") or filename.lower().endswith(".mov"):
return 'video'
def dump_instagram(folder_path):
for root, dirs, files in os.walk(folder_path):
for folder in dirs:
username = folder
folder_path = os.path.join(root, folder)
post_type = 'post' if 'post' in folder_path.lower() else 'story'
for filename in os.listdir(folder_path):
filepath = os.path.join(folder_path, filename)
mediatype = get_media_type(filename)
upload_file(username=username, media_type=mediatype, filepath=filepath, post_type=post_type)
def calculate_file_hash(file_path, hash_func='sha256'):
h = hashlib.new(hash_func)
with open(file_path, 'rb') as file:
chunk = 0
while chunk != b'':
chunk = file.read(8192)
h.update(chunk)
return h.hexdigest()
if __name__ == '__main__':
print('Starting processing...')
newDB, newCursor = config.gen_connection()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
storiesPath = 'StorySave/'
dump_instagram(storiesPath)
print("Processing completed.")

@ -1,424 +0,0 @@
import requests
import hashlib
access_key = "ccd3f9d4-9e6f-4bd2-8f594402b5a7-3646-48fe"
video_library_id = 106867
def create_video(title):
url = f"https://video.bunnycdn.com/library/{video_library_id}/videos"
payload = f"{{\"title\":\"{title}\"}}"
headers = {
"accept": "application/json",
"content-type": "application/*+json",
"AccessKey": access_key
}
response = requests.post(url, data=payload, headers=headers)
return response
def generate_signature(library_id, api_key, expiration_time, video_id):
signature = hashlib.sha256((library_id + api_key + str(expiration_time) + video_id).encode()).hexdigest()
return signature
def upload_video_process(file_path, video_id):
url = f"https://video.bunnycdn.com/library/{video_library_id}/videos/{video_id}"
headers = {
"accept": "application/json",
"AccessKey": access_key
}
with open(file_path, "rb") as file:
file_data = file.read()
response = requests.put(url, headers=headers, data=file_data)
return response.status_code
def upload_video(file_path, title=None):
video_item = create_video(title)
if video_item.status_code != 200:
return False
video_id = video_item.json()['guid']
upload_video_process(file_path, video_id)
return {
"embed_link": f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/playlist.m3u8",
"animated_thumbnail": f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/preview.webp",
"default_thumbnail": f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/thumbnail.jpg",
}
def upload_video_recurbate(videoInfo):
title = f"{videoInfo['username']} {videoInfo['platform']}"
video_item = create_video(title)
if video_item.status_code != 200:
return False
video_id = video_item.json()['guid']
upload_video_process(videoInfo['filename'], video_id)
videoInfo["embed_link"] = f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/playlist.m3u8"
videoInfo["animated_thumbnail"] = f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/preview.webp"
videoInfo["default_thumbnail"] = f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/thumbnail.jpg"
return True
def delete_video(video_id):
video_id = video_id.replace('https://vz-58ca89f1-986.b-cdn.net/', '').replace('/playlist.m3u8', '')
url = f"https://video.bunnycdn.com/library/{video_library_id}/videos/{video_id}"
headers = {
"accept": "application/json",
"AccessKey": access_key
}
response = requests.delete(url, headers=headers)
return response.status_code
def list_videos():
url = f"https://video.bunnycdn.com/library/{video_library_id}/videos?page=1&itemsPerPage=2147483647&orderBy=date"
headers = {
"accept": "application/json",
"AccessKey": access_key
}
response = requests.get(url, headers=headers)
return response.json()['items']
def get_heatmap(video_id):
url = "https://video.bunnycdn.com/library/libraryId/videos/videoId/heatmap"
url = url.replace('libraryId', str(video_library_id)).replace('videoId', str(video_id))
headers = {
"accept": "application/json",
"AccessKey": access_key
}
response = requests.get(url, headers=headers).json()
return response
def get_video(video_id):
url = "https://video.bunnycdn.com/library/libraryId/videos/videoId"
url = url.replace('libraryId', str(video_library_id)).replace('videoId', str(video_id))
headers = {
"accept": "application/json",
"AccessKey": access_key
}
response = requests.get(url, headers=headers).json()
return response
import os
import requests
from requests.exceptions import HTTPError
from urllib import parse
class Storage:
def __init__(self, api_key, storage_zone, storage_zone_region="de"):
"""
Creates an object for using BunnyCDN Storage API
Parameters
----------
api_key : String
Your bunnycdn storage
Apikey/FTP password of
storage zone
storage_zone : String
Name of your storage zone
storage_zone_region(optional parameter) : String
The storage zone region code
as per BunnyCDN
"""
self.headers = {
# headers to be passed in HTTP requests
"AccessKey": api_key,
"Content-Type": "application/json",
"Accept": "applcation/json",
}
# applying constraint that storage_zone must be specified
assert storage_zone != "", "storage_zone is not specified/missing"
# For generating base_url for sending requests
if storage_zone_region == "de" or storage_zone_region == "":
self.base_url = "https://storage.bunnycdn.com/" + storage_zone + "/"
else:
self.base_url = (
"https://"
+ storage_zone_region
+ ".storage.bunnycdn.com/"
+ storage_zone
+ "/"
)
def DownloadFile(self, storage_path, download_path=os.getcwd()):
"""
This function will get the files and subfolders of storage zone mentioned in path
and download it to the download_path location mentioned
Parameters
----------
storage_path : String
The path of the directory
(including file name and excluding storage zone name)
from which files are to be retrieved
download_path : String
The directory on local server to which downloaded file must be saved
Note: for download_path, use escaped backslashes ('\\') rather than single ones, e.g. C:\\Users\\XYZ\\OneDrive
"""
assert (
storage_path != ""
), "storage_path must be specified" # to make sure storage_path is not null
# to build correct url
if storage_path[0] == "/":
storage_path = storage_path[1:]
if storage_path[-1] == "/":
storage_path = storage_path[:-1]
url = self.base_url + parse.quote(storage_path)
file_name = url.split("/")[-1] # For storing file name
# to return appropriate help messages if file is present or not and download file if present
try:
response = requests.get(url, headers=self.headers, stream=True)
response.raise_for_status()
except HTTPError as http:
return {
"status": "error",
"HTTP": response.status_code,
"msg": f"Http error occured {http}",
}
except Exception as err:
return {
"status": "error",
"HTTP": response.status_code,
"msg": f"error occured {err}",
}
else:
download_path = os.path.join(download_path, file_name)
# Downloading file
with open(download_path, "wb") as file:
for chunk in response.iter_content(chunk_size=1024):
if chunk:
file.write(chunk)
return {
"status": "success",
"HTTP": response.status_code,
"msg": "File downloaded Successfully",
}
def PutFile(
self,
file_name,
storage_path=None,
local_upload_file_path=os.getcwd(),
):
"""
This function uploads files to your BunnyCDN storage zone
Parameters
----------
storage_path : String
The path of directory in storage zone
(including the name of file as desired and excluding storage zone name)
to which file is to be uploaded
file_name : String
The name of the file as stored in local server
local_upload_file_path : String
The path of file as stored in local server(excluding file name)
from where file is to be uploaded
Examples
--------
file_name : 'ABC.txt'
local_upload_file_path : 'C:\\User\\Sample_Directory'
storage_path : '<Directory name in storage zone>/<file name as to be uploaded on storage zone>.txt'
#Here .txt because the file being uploaded in example is txt
"""
local_upload_file_path = os.path.join(local_upload_file_path, file_name)
# to build correct url
if storage_path is not None and storage_path != "":
if storage_path[0] == "/":
storage_path = storage_path[1:]
if storage_path[-1] == "/":
storage_path = storage_path[:-1]
url = self.base_url + parse.quote(storage_path)
else:
url = self.base_url + parse.quote(file_name)
with open(local_upload_file_path, "rb") as file:
file_data = file.read()
response = requests.put(url, data=file_data, headers=self.headers)
try:
response.raise_for_status()
except HTTPError as http:
return {
"status": "error",
"HTTP": response.status_code,
"msg": f"Upload Failed HTTP Error Occured: {http}",
}
else:
return {
"status": "success",
"HTTP": response.status_code,
"msg": "The File Upload was Successful",
}
def DeleteFile(self, storage_path=""):
"""
This function deletes a file or folder mentioned in the storage_path from the storage zone
Parameters
----------
storage_path : The directory path to your file (including file name) or folder which is to be deleted.
If this is the root of your storage zone, you can ignore this parameter.
"""
# Add code below
assert (
storage_path != ""
), "storage_path must be specified" # to make sure storage_path is not null
# to build correct url
if storage_path[0] == "/":
storage_path = storage_path[1:]
url = self.base_url + parse.quote(storage_path)
try:
response = requests.delete(url, headers=self.headers)
response.raise_for_status()
except HTTPError as http:
return {
"status": "error",
"HTTP": response.raise_for_status(),
"msg": f"HTTP Error occured: {http}",
}
except Exception as err:
return {
"status": "error",
"HTTP": response.status_code,
"msg": f"Object Delete failed ,Error occured:{err}",
}
else:
return {
"status": "success",
"HTTP": response.status_code,
"msg": "Object Successfully Deleted",
}
def GetStoragedObjectsList(self, storage_path=None):
"""
This functions returns a list of files and directories located in given storage_path.
Parameters
----------
storage_path : The directory path that you want to list.
"""
# to build correct url
if storage_path is not None:
if storage_path[0] == "/":
storage_path = storage_path[1:]
if storage_path[-1] != "/":
url = self.base_url + parse.quote(storage_path) + "/"
else:
url = self.base_url
# Sending GET request
try:
response = requests.get(url, headers=self.headers)
response.raise_for_status()
except HTTPError as http:
return {
"status": "error",
"HTTP": response.status_code,
"msg": f"http error occured {http}",
}
else:
storage_list = []
for dictionary in response.json():
temp_dict = {}
for key in dictionary:
if key == "ObjectName" and dictionary["IsDirectory"] is False:
temp_dict["File_Name"] = dictionary[key]
if key == "ObjectName" and dictionary["IsDirectory"]:
temp_dict["Folder_Name"] = dictionary[key]
storage_list.append(temp_dict)
return storage_list
def MoveFile(self, old_path, new_path):
"""
Moves a file by downloading from the old path and uploading to the new path,
then deleting from the old path. Uses existing PutFile and DeleteFile methods.
Parameters
----------
old_path : str
The current path (relative to storage zone root) of the file to move.
new_path : str
The new path (relative to storage zone root) for the file.
Returns
-------
dict
A dictionary containing 'status', 'msg', and optionally 'HTTP'.
"""
# Validate arguments
if not old_path or not new_path:
return {
"status": "error",
"msg": "Both old_path and new_path must be provided."
}
# 1. Download from old_path to a temporary local directory
# If you already have the file locally, you can skip this download step.
download_response = self.DownloadFile(old_path, download_path="temp")
if download_response.get("status") != "success":
return {
"status": "error",
"msg": f"Failed to download file for moving. Reason: {download_response.get('msg', 'unknown')}",
"HTTP": download_response.get("HTTP")
}
# Extract the filename from old_path to know what we downloaded
filename = os.path.basename(old_path)
# 2. Upload to new_path using existing PutFile
# We'll assume new_path includes the desired filename. If it does not, adjust logic.
put_response = self.PutFile(
file_name=filename,
storage_path=new_path, # e.g. "folder/newfile.jpg"
local_upload_file_path="temp" # where we downloaded it
)
if put_response.get("status") != "success":
return {
"status": "error",
"msg": f"Failed to upload file to new path. Reason: {put_response.get('msg', 'unknown')}",
"HTTP": put_response.get("HTTP")
}
# 3. Delete the original file using existing DeleteFile
delete_response = self.DeleteFile(old_path)
if delete_response.get("status") != "success":
return {
"status": "error",
"msg": f"Failed to delete old file. Reason: {delete_response.get('msg', 'unknown')}",
"HTTP": delete_response.get("HTTP")
}
# (Optional) Clean up the local temp file
local_temp_path = os.path.join("temp", filename)
if os.path.exists(local_temp_path):
os.remove(local_temp_path)
return {
"status": "success",
"msg": f"File successfully moved from '{old_path}' to '{new_path}'."
}
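For reference, a minimal usage sketch of the two higher-level helpers in this module; the title, file names and storage paths below are made up, and the storage key is left as a placeholder:

import os

# Upload a local file to Bunny Stream; upload_video returns the playback URLs or False.
result = upload_video("clips/example.mp4", title="example upload")
if result:
    print(result["embed_link"])

# Move an object inside the storage zone (MoveFile downloads into a local 'temp' folder first).
os.makedirs("temp", exist_ok=True)
storage = Storage("<storage api key>", "storysave")
status = storage.MoveFile("users/stories/old/example.jpg", "users/stories/new/example.jpg")
print(status["status"], status["msg"])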

@ -1,3 +1,6 @@
from BunnyCDN.Storage import Storage
import mysql.connector
username = "doadmin"
password = "AVNS_2qeFJuiGRpBQXkJjlA6"
host = "storysave-do-user-13308724-0.c.db.ondigitalocean.com"
@ -6,42 +9,10 @@ database = "storysave"
sslmode = "REQUIRED"
def gen_connection():
import mysql.connector
print("Connecting to database")
newDB = mysql.connector.connect(host=host, user=username, password=password, database=database, port=port)
print("Connected to database")
return newDB, newDB.cursor(dictionary=True)
return newDB, newDB.cursor()
def get_storage():
from BunnyCDN.Storage import Storage
return Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
def get_custom_storage():
from bunny import Storage
return Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
def get_redis_connection():
from redis import Redis
REDIS_HOST = "192.168.0.27"
REDIS_PORT = 30059
REDIS_PASSWORD = "7U6zXN96xNg$8BnPd&eE"
try:
client = Redis(
host=REDIS_HOST,
port=REDIS_PORT,
password=REDIS_PASSWORD,
decode_responses=True
)
response = client.ping()
if response:
print("Connected to Redis successfully!")
return client
else:
print("Failed to connect to Redis!")
return None
except Exception as e:
print(f"An error occurred: {e}")
return None
return Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

@ -1,145 +0,0 @@
import os
from funcs import calculate_file_hash, get_media_dimensions, generate_phash
import config
# --- Configuration & Constants ---
BASE_URL = "https://cdn.altpins.com/"
TEMP_DIR = os.path.join(os.getcwd(), 'temp')
CACHE_DIR = os.path.join(os.getcwd(), 'cache')
os.makedirs(TEMP_DIR, exist_ok=True)
os.makedirs(CACHE_DIR, exist_ok=True)
def normalize_server_path(media_url, replace_all=True):
"""
Remove the BASE_URL from media_url and normalize slashes.
If replace_all is True, replace double slashes and backslashes.
"""
path = media_url.replace(BASE_URL, '')
if replace_all:
path = path.replace('//', '/').replace('\\', '/')
else:
path = path.replace('\\', '/')
return path
def update_hashes(cursor, db, obj_storage):
cursor.execute("SELECT id, media_id, media_url FROM media WHERE hash IS NULL;")
results = cursor.fetchall()
total = len(results)
print(f"Found {total} files to process for hash updating.")
for idx, (record_id, media_id, media_url) in enumerate(results, start=1):
server_path = normalize_server_path(media_url)
local_file = os.path.join(TEMP_DIR, os.path.basename(server_path))
if not os.path.exists(local_file):
obj_storage.DownloadFile(storage_path=server_path, download_path=TEMP_DIR)
filehash = calculate_file_hash(local_file)
cursor.execute("UPDATE media SET hash = %s WHERE id = %s;", (filehash, record_id))
db.commit()
print(f"[{idx}/{total}] {media_id}: {filehash}, Rows affected: {cursor.rowcount}")
def update_dimensions(cursor, db, obj_storage):
cursor.execute("SELECT id, media_id, media_url FROM media WHERE width = 0 OR height = 0;")
results = cursor.fetchall()
total = len(results)
print(f"Found {total} files to process for dimensions updating.")
for idx, (record_id, media_id, media_url) in enumerate(results, start=1):
server_path = normalize_server_path(media_url)
local_file = os.path.join(CACHE_DIR, os.path.basename(server_path))
if not os.path.exists(local_file):
obj_storage.DownloadFile(storage_path=server_path, download_path=CACHE_DIR)
# Optionally, you could get the media type if needed:
width, height = get_media_dimensions(local_file)
if width == 0 or height == 0:
print(f"Error getting dimensions for {media_url}")
continue
cursor.execute("UPDATE media SET width = %s, height = %s WHERE id = %s;", (width, height, record_id))
db.commit()
print(f"[{idx}/{total}] {media_id}: width: {width}, height: {height}, Rows affected: {cursor.rowcount}")
def update_file_size(cursor, db, obj_storage):
cursor.execute("SELECT id, media_url FROM media WHERE file_size = 0 AND status != 'deleted';")
results = cursor.fetchall()
total = len(results)
print(f"Found {total} files to process for file size updating.")
for idx, (record_id, media_url) in enumerate(results, start=1):
server_path = normalize_server_path(media_url)
local_file = os.path.join(TEMP_DIR, os.path.basename(server_path))
if not os.path.exists(local_file):
obj_storage.DownloadFile(storage_path=server_path, download_path=TEMP_DIR)
file_size = os.path.getsize(local_file)
cursor.execute("UPDATE media SET file_size = %s WHERE id = %s;", (file_size, record_id))
db.commit()
print(f"[{idx}/{total}] {media_url}: {file_size} bytes, Rows affected: {cursor.rowcount}")
def update_phash(cursor, db, obj_storage):
generate_for = 'media_url'
media_type = 'image'
cursor.execute(f"SELECT id, {generate_for} FROM media WHERE media_type = %s AND phash IS NULL AND status != 'deleted';", [media_type])
medias = cursor.fetchall()
total = len(medias)
print(f"Found {total} files to process for pHash updating.")
for idx, (record_id, media_url) in enumerate(medias, start=1):
server_path = normalize_server_path(media_url, replace_all=False)
local_file = os.path.join(TEMP_DIR, os.path.basename(server_path))
if not os.path.exists(local_file):
obj_storage.DownloadFile(storage_path=server_path, download_path=TEMP_DIR)
phash = generate_phash(local_file)
if not phash:
print(f"Error generating pHash for {local_file}")
continue
cursor.execute("UPDATE media SET phash = %s WHERE id = %s", [phash, record_id])
db.commit()
print(f"[{idx}/{total}] Processed record {record_id} with pHash: {phash}")
def update_user_ids(cursor, db):
cursor.execute("SELECT DISTINCT username FROM media WHERE user_id IS NULL AND platform = 'instagram';")
usernames = [username[0] for username in cursor.fetchall()]
total = len(usernames)
print(f"Found {total} usernames to process for user_id updating.")
for idx, username in enumerate(usernames, start=1):
print(f"[{idx}/{total}] Username: {username}")
cursor.execute("SELECT DISTINCT user_id FROM media WHERE username = %s AND user_id IS NOT NULL;", [username])
possible_user_ids = [user_id for user_id, in cursor.fetchall()]
if len(possible_user_ids) == 0:
print(f"No user_id found for {username}")
continue
if len(possible_user_ids) > 1:
print(f"Multiple user_ids found for {username}: {possible_user_ids}")
continue
user_id = possible_user_ids[0]
cursor.execute("UPDATE media SET user_id = %s WHERE username = %s AND user_id IS NULL;", [user_id, username])
db.commit()
print(f"[{idx}/{total}] Updated user_id for {username}, Rows affected: {cursor.rowcount}")
def main():
obj_storage = config.get_storage()
db, cursor = config.gen_connection()
update_hashes(cursor, db, obj_storage)
update_dimensions(cursor, db, obj_storage)
update_file_size(cursor, db, obj_storage)
update_phash(cursor, db, obj_storage)
update_user_ids(cursor, db)
if __name__ == '__main__':
main()
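A quick worked example of normalize_server_path, since every maintenance pass above relies on it to map a CDN URL back to a storage-zone path (the URL is illustrative):

# replace_all=True collapses '//' and '\' as well:
#   "https://cdn.altpins.com/media/stories/user//file.jpg" -> "media/stories/user/file.jpg"
# replace_all=False only normalises backslashes:
#   "https://cdn.altpins.com/media/stories/user//file.jpg" -> "media/stories/user//file.jpg"
print(normalize_server_path("https://cdn.altpins.com/media/stories/user//file.jpg"))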

@ -2,7 +2,6 @@ from funcs import get_files
from PIL import Image
import imagehash
import cv2
import os
def is_static_video_phash_optimized(video_path, frame_sample_rate=30, hash_size=16, hamming_threshold=1):
"""
@ -33,6 +32,7 @@ def is_static_video_phash_optimized(video_path, frame_sample_rate=30, hash_size=
pil_image = Image.fromarray(frame_rgb)
previous_hash = imagehash.phash(pil_image, hash_size=hash_size)
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
is_static = True
current_frame_number = 1
@ -66,34 +66,11 @@ def is_static_video_phash_optimized(video_path, frame_sample_rate=30, hash_size=
return is_static
directory = input("Enter the directory path: ")
directory = 'videos'
files = [file for file in get_files(directory) if file.endswith('.mp4')]
files = get_files(directory)
total_space_saved = 0
for video_file in files:
if not is_static_video_phash_optimized(video_file):
continue
screenshot_path = os.path.join('.temp', os.path.basename(video_file) + '.jpg')
if not os.path.exists(screenshot_path):
cap = cv2.VideoCapture(video_file)
ret, frame = cap.read()
cap.release()
if ret:
cv2.imwrite(screenshot_path, frame)
screenshot_size = os.path.getsize(screenshot_path)
video_size = os.path.getsize(video_file)
if screenshot_size < video_size:
screenshot_size_in_mb = screenshot_size / (1024 * 1024)
video_size_in_mb = video_size / (1024 * 1024)
total_space_saved += video_size - screenshot_size
print(f"Screenshot size: {screenshot_size_in_mb:.2f} MB, Video size: {video_size_in_mb:.2f} MB")
else:
os.remove(screenshot_path)
print(f"Total space saved: {total_space_saved / (1024 * 1024):.2f} MB")
if video_file.endswith('.mp4'):
if is_static_video_phash_optimized(video_file):
print("The video is static: " + video_file)

@ -10,21 +10,6 @@ from moviepy.editor import VideoFileClip
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}
proxies={"http": "http://yehyuxsl-rotate:4tl5bvrwkz5e@p.webshare.io:80/","https": "http://yehyuxsl-rotate:4tl5bvrwkz5e@p.webshare.io:80/"}
def get_file_extension(url):
response = requests.head(url)
if response.status_code != 200:
print(f"Failed to access media {url}")
return None
content_type = response.headers.get('Content-Type', '')
if 'image' in content_type:
return '.jpg'
elif 'video' in content_type:
return '.mp4'
else:
print(f"Unknown content type for media {url}")
return None
def generate_phash(image_path):
try:
image = Image.open(image_path)
@ -33,10 +18,10 @@ def generate_phash(image_path):
print(f"Error generating phash for {image_path}: {e}")
return False
def clean_empty_folders(path):
def cleanEmptyFolders(path):
for root, dirs, fs in os.walk(path):
for d in dirs:
clean_empty_folders(os.path.join(root, d))
cleanEmptyFolders(os.path.join(root, d))
if not os.listdir(root):
os.rmdir(root)
@ -44,8 +29,6 @@ def get_files(directory):
files = []
for root, dirs, filenames in os.walk(directory):
for filename in filenames:
if filename.startswith('.'):
continue
files.append(os.path.join(root, filename))
return files
@ -104,36 +87,68 @@ def compare_images(image_path1, image_path2):
else:
return False
def remove_empty_folders(dir_path):
import shutil
def is_folder_empty(folder_path):
return len(os.listdir(folder_path)) == 0
num_folder = 0
for root, dirs, files in os.walk(dir_path, topdown=False):
for dir_name in dirs:
dir_path = os.path.join(root, dir_name)
if not os.path.isdir(dir_path):
continue
if '$' in dir_name or '$' in dir_path:
print(f"Skipping system folder: {dir_path}")
continue
if 'system volume information' in dir_name.lower() or 'system volume information' in dir_path.lower():
print(f"Skipping system folder: {dir_path}")
continue
if is_folder_empty(dir_path) or dir_name.lower() == '__pycache__':
shutil.rmtree(dir_path)
print(f"Moved empty folder: {dir_path}")
num_folder+=1
def download_file(url, filePath):
try:
if os.path.exists(filePath):
print(f"File already exists: {filePath}")
return filePath
if not url:
print(f"Invalid URL: {url}")
return False
response = requests.get(url, stream=True, headers=headers)
response.raise_for_status()
if response.status_code != 200:
print(f"Failed to download {url}. Status code: {response.status_code}")
return False
os.makedirs(os.path.dirname(filePath), exist_ok=True)
directory = os.path.dirname(filePath)
if not os.path.exists(directory):
os.makedirs(directory)
with open(filePath, "wb") as out_file:
for chunk in response.iter_content(chunk_size=8192):
out_file.write(chunk)
return filePath
print(f"Downloaded {filePath}")
return True
except Exception as e:
print(f"Failed to download {url}. Error: {e}")
return False
def determine_post_type(filepath):
width, height = get_media_dimensions(filepath)
if 0 in (width, height):
return False
aspect_ratio = width / height
if aspect_ratio > 0.5 and aspect_ratio < 0.6:
return 'stories'
else:
return 'posts'
def get_media_type(filename):
image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff", ".tif", ".svg", ".eps", ".raw", ".cr2", ".nef", ".orf", ".sr2", ".heic", ".indd", ".ai", ".psd", ".svg"}
video_extensions = {".mp4", ".mov", ".avi", ".mkv", ".wmv", ".flv", ".webm", ".vob", ".ogg", ".ts", ".flv"}
video_extensions = {".mp4", ".mov"}
filetype_dict = {"image": image_extensions, "video": video_extensions}
extension = os.path.splitext(filename.lower())[1] # Get the extension and convert to lower case
@ -148,7 +163,9 @@ def get_video_duration(file_path):
print(f"File not found: {file_path}")
return 0
if not get_media_type(file_path) == 'video':
video_types = {".mp4", ".mov", ".mkv"}
extension = os.path.splitext(file_path.lower())[1]
if extension not in video_types:
return 0
try:
@ -161,12 +178,6 @@ def get_video_duration(file_path):
print(f"Error getting duration for {file_path}: {e}")
return 0
def get_media_dimensions(media_path):
if get_media_type(media_path) == 'video':
return get_video_dimensions(media_path)
else:
return get_image_dimensions(media_path)
def get_video_dimensions(video_path):
cap = cv2.VideoCapture(video_path)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
@ -174,13 +185,13 @@ def get_video_dimensions(video_path):
cap.release()
return width, height
def get_image_dimensions(image_path):
try:
with Image.open(image_path) as img:
def get_media_dimensions(media_path):
if get_media_type(media_path) == 'video':
return get_video_dimensions(media_path)
else:
with Image.open(media_path) as img:
return img.size
except:
return 0, 0
def get_video_data(video_path):
data = {'duration': 0, 'width': 0, 'height': 0}
try:
@ -199,15 +210,4 @@ def calculate_file_hash(file_path, hash_func='sha256'):
while chunk:
h.update(chunk)
chunk = file.read(8192)
return h.hexdigest()
def files_are_identical(file1, file2):
"""Compare two files byte-by-byte."""
with open(file1, "rb") as f1, open(file2, "rb") as f2:
while True:
chunk1 = f1.read(4096)
chunk2 = f2.read(4096)
if chunk1 != chunk2:
return False
if not chunk1: # End of file
return True
return h.hexdigest()

@ -1,4 +1,5 @@
from concurrent.futures import ThreadPoolExecutor
from BunnyCDN.Storage import Storage
import config, os
def DownloadFile(serverPath, cacheDir):
@ -8,8 +9,8 @@ def DownloadFile(serverPath, cacheDir):
print(f"File already exists: {localFilePath}")
return localFilePath
print(f"Downloading {serverPath} to {localFilePath}")
obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)
print(f"Downloaded {serverPath} to {localFilePath}")
return localFilePath
def ImportMedias(results):
@ -19,14 +20,14 @@ def ImportMedias(results):
executor.submit(DownloadFile, serverPath, cacheDir)
obj_storage = config.get_storage()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
db, cursor = config.gen_connection()
cursor.execute("SELECT id, media_id, media_url FROM media WHERE file_size = 0 ORDER BY id DESC;")
cursor.execute("SELECT id, media_id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()
cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)
print(f"Found {len(results)} files to process.")

@ -0,0 +1 @@
DH3ucOuYLbJ2Va3lfJPEYQq_6mk_v3R9dnrAYSQHr-Q=

@ -1,78 +0,0 @@
import os
import config
import logging
# Set up logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[
logging.StreamHandler() # or use logging.FileHandler('script.log')
]
)
# Prepare database connection
db, cursor = config.gen_connection()
# Ensure local temp directory exists
TEMP_DIR = "temp"
os.makedirs(TEMP_DIR, exist_ok=True)
URL_PREFIX = "https://cdn.altpins.com/"
# Retrieve records from database
query = f"""
SELECT id, date, media_url, platform, username, hash
FROM media
WHERE media_url like '%none%';
"""
cursor.execute(query)
rows = cursor.fetchall()
# Initialize Bunny.net Storage (credentials redacted)
obj_storage = config.get_custom_storage()
count = 0
total = len(rows)
for row in rows:
count += 1
pin_id, date, media_url, platform, username, file_hash = row
logging.info(f"[{count}/{total}] Processing screenshot ID: {pin_id}")
serverPath = media_url.replace(URL_PREFIX, "").split("?")[0]
filename = os.path.basename(serverPath)
filename = filename.replace("none", file_hash).replace("None", file_hash)
filepath = os.path.join(TEMP_DIR, filename)
# 2. Create new path (based on date)
year = date.year
month = str(date.month).zfill(2)
day = str(date.day).zfill(2)
formatted_date = os.path.join(str(year), month, day)
# Extract the server path (remove domain and query)
newPath = os.path.join("media", "stories", username, filename)
new_media_url = f"{URL_PREFIX}{newPath}"
# 3. Move file to new path
logging.info(f"Moving screenshot from {serverPath} to {newPath}")
status = obj_storage.MoveFile(serverPath, newPath)
if status['status'] != 'success':
logging.info(f"Failed to move file {serverPath} to {newPath}. Error: {status['status']}")
continue
# 4. Update DB
logging.info(f"Updating DB record {pin_id} to new URL\n{new_media_url}\nhttps://altpins.com/pin/{pin_id}")
cursor.execute("UPDATE media SET media_url = %s WHERE id = %s", [new_media_url, pin_id])
db.commit()
logging.info(f"Successfully processed screenshot {pin_id}")
# Close the DB connection
cursor.close()
db.close()
logging.info("All done!")

@ -0,0 +1 @@
gAAAAABmRUff7c9t9gngWj_2cwvaTBrUDJ_JUyYVUfG-p3SvDV7qOSHddJ4eHADiJeRtJNtY9UxkohSB5I1MmLahAb_hxxwIVA==

@ -1,41 +0,0 @@
from storysave_api import get_hd_profile_picture
import config, funcs, os, time
known_phashes = {'e7c51a904b69d366': 'default empty profile picture',
'cb3ce46194c335dc': 'default empty profile picture',
}
known_hashes = {
'09c3cf34d4f117d99fa6285f4bfd3a0d888d7ab2cbca665b16097f6b93ca0de6' : 'default empty profile picture',
'2b9c0914d8f3f0aa6cf86705df70b7b21e9ca2f9013a346463788e7cebd0158f' : 'default empty profile picture',
}
db, cursor = config.gen_connection()
cursor.execute("SELECT DISTINCT username, user_id, favorite FROM following WHERE user_id IS NOT NULL AND platform = 'instagram' ORDER BY favorite DESC;")
usernames = cursor.fetchall()
for username, user_id, favorite in usernames:
profilepicurl = get_hd_profile_picture(user_id=user_id)
if not profilepicurl:
print(f'Failed for {username}')
continue
filename = os.path.basename(profilepicurl).split('?')[0]
user_dir = os.path.join('media', 'instagram', 'profile', username)
filepath = os.path.join(user_dir, filename)
filepath = funcs.download_file(profilepicurl, filepath)
if not filepath:
continue
phash = funcs.generate_phash(filepath)
if phash in known_phashes:
print(f"Profile picture for {username} is the default empty profile picture.")
os.remove(filepath)
continue
print(f"Downloaded profile picture for {username}.")
time.sleep(1)

@ -16,8 +16,6 @@ undetected_chromedriver
python-telegram-bot
tqdm
webdriver-manager
moviepy==1.0.3
moviepy
instagrapi
ImageHash
watchdog
redis
ImageHash

@ -1,91 +0,0 @@
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer
import shutil
import os
from funcs import get_media_dimensions
media_dir = "media"
stories_dir = os.path.join(media_dir, "stories")
posts_dir = os.path.join(media_dir, "posts")
os.makedirs(stories_dir, exist_ok=True)
os.makedirs(posts_dir, exist_ok=True)
def is_story(width, height, tolerance=0.02):
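# Heuristic: treat media with a roughly 9:16 portrait aspect ratio as a story; `tolerance` allows a small deviation for encoder rounding.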
if width == 0 or height == 0:
return False
ratio = min(width, height) / max(width, height)
return abs(ratio - (9 / 16)) <= (9 / 16 * tolerance)
def determine_post_type(filepath):
lower = filepath.lower()
if "posts" in lower:
return "posts"
try:
width, height = get_media_dimensions(filepath)
except Exception as e:
print(f"Error getting dimensions for {filepath}: {e}")
return None
return "stories" if is_story(width, height) else "posts"
class DownloadHandler(FileSystemEventHandler):
def process_file(self, file_path):
file = os.path.basename(file_path)
# Ignore incomplete or weird temp names
if "crdownload" in file or file.count("~") != 3:
return
if not os.path.exists(file_path):
return
post_type = determine_post_type(file_path)
if post_type == "posts":
dest_dir = posts_dir
elif post_type == "stories":
dest_dir = stories_dir
else:
print(f"Could not determine post type for {file}. Skipping...")
return
output_path = os.path.join(dest_dir, file)
if os.path.exists(output_path):
print(f"File already exists {output_path}. Removing...")
os.remove(file_path)
return
shutil.move(file_path, output_path)
print(f"Moved {file_path} -> {output_path}")
def on_created(self, event):
if not event.is_directory:
self.process_file(event.src_path)
def on_moved(self, event):
if not event.is_directory:
self.process_file(event.dest_path)
if __name__ == "__main__":
download_path = os.path.join(os.path.expanduser("~"), "Downloads")
event_handler = DownloadHandler()
# Initial scan for files already in Downloads
for f in os.listdir(download_path):
full_path = os.path.join(download_path, f)
if os.path.isfile(full_path):
event_handler.process_file(full_path)
observer = Observer()
observer.schedule(event_handler, download_path, recursive=False)
observer.start()
try:
observer.join()
except KeyboardInterrupt:
observer.stop()
observer.join()

@ -0,0 +1,37 @@
{
"uuids": {
"phone_id": "53c03380-c7b9-44ab-b10e-1b585e8e428b",
"uuid": "2a9c7a37-c902-4332-8a32-1fd903acd991",
"client_session_id": "2b0a28f0-86c4-4cd4-b044-c4effd953cc9",
"advertising_id": "d330f041-56f1-4f45-906d-d3740717f0b1",
"android_device_id": "android-df5a2572f9762ff7",
"request_id": "35de6403-02e2-46b4-a02c-403cea1fe9c6",
"tray_session_id": "ed1874f7-cb8d-4ed6-bea8-13c53b9c3d67"
},
"mid": "ZwOR_QABAAGgkEbeoytBO3EL-dgC",
"ig_u_rur": null,
"ig_www_claim": null,
"authorization_data": {
"ds_user_id": "1587432849",
"sessionid": "1587432849%3Ak5q9QqmHia2WWq%3A18%3AAYcDFsLKMiFCtVhCcqYl7KZrFLw5IOSgf1pNfQZYLA"
},
"cookies": {},
"last_login": 1728287241.130515,
"device_settings": {
"app_version": "269.0.0.18.75",
"android_version": 26,
"android_release": "8.0.0",
"dpi": "480dpi",
"resolution": "1080x1920",
"manufacturer": "OnePlus",
"device": "devitron",
"model": "6T Dev",
"cpu": "qcom",
"version_code": "314665256"
},
"user_agent": "Instagram 269.0.0.18.75 Android (26/8.0.0; 480dpi; 1080x1920; OnePlus; 6T Dev; devitron; qcom; en_US; 314665256)",
"country": "US",
"country_code": 1,
"locale": "en_US",
"timezone_offset": -14400
}

File diff suppressed because it is too large

@ -0,0 +1,96 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup
import requests
import json
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"}
def get_data(username):
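# Snapchat's public profile page embeds its data as JSON inside a <script id="__NEXT_DATA__"> tag; fetch the page and parse that blob.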
url = f"https://www.snapchat.com/add/{username}"
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
data_script = soup.find("script", id="__NEXT_DATA__")
if not data_script:
print(f"No data found for {username}.")
return None
data = json.loads(data_script.string)
return data
def get_all_users_data(usernames):
all_data = {}
# Define a helper function for threading
def fetch_data(username):
return username, get_data(username)
# Use ThreadPoolExecutor for concurrent fetching
with ThreadPoolExecutor() as executor:
futures = {executor.submit(fetch_data, username): username for username in usernames}
for future in as_completed(futures):
username = futures[future]
try:
username, data = future.result()
all_data[username] = data
except Exception as e:
print(f"Error fetching data for {username}: {e}")
all_data[username] = None
return all_data
def parse_stories(stories):
parsed_stories = []
for story in stories:
parsed_story = parse_story(story)
parsed_stories.append(parsed_story)
return parsed_stories
def get_stories(data):
try:
stories = data['props']['pageProps']['story']['snapList']
return parse_stories(stories)
except KeyError:
return []
def get_highlights(data):
highlights = []
page_props = data.get('props', {}).get('pageProps', {})
# Possible keys that might contain highlights
possible_highlight_keys = ['curatedHighlights', 'savedHighlights', 'highlights']
for key in possible_highlight_keys:
highlight_data = page_props.get(key, [])
if highlight_data:
highlights.extend(highlight_data)
return highlights
def parse_story(story):
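# snapId, snapUrls and timestampInSec arrive as nested objects in the __NEXT_DATA__ payload; chained .get() calls keep missing keys from raising.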
original_snap_id = story.get('snapId', {}).get('value', '')
snap_url = story.get('snapUrls', {}).get('mediaUrl', '')
timestamp = story.get('timestampInSec', {}).get('value', '')
return {
"original_snap_id": original_snap_id,
"snap_id": get_snap_id(snap_url),
"url": snap_url,
"timestamp": timestamp,
"platform": "snapchat",
"type": "story",
}
def get_snap_id(url):
return url.split('/')[-1].split('.')[0]
def get_highlight_stories(data):
stories = []
highlights = get_highlights(data)
for highlight in highlights:
snap_list = highlight.get('snapList', [])
for snap in snap_list:
story = parse_story(snap)
stories.append(story)
return stories

@ -0,0 +1,270 @@
from snapchat import get_stories, get_highlight_stories, get_all_users_data
from datetime import datetime
from uuid import uuid4
import requests
import config
import funcs
import json
import cv2
import os
directory = "snapchat"
data_directory = "data"
def find_duplicate_snap(existing_snaps, snap_id, username):
"""
Find a snap in the existing_snaps list from the database.
"""
for snap in existing_snaps:
if username == snap[2]:
if snap_id in snap[1]:
return snap
return False
def archive_data(data, username):
data_filename = f"{username}~{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"
data_filepath = os.path.join(data_directory, data_filename)
with open(data_filepath, 'w') as f:
f.write(json.dumps(data))
def get_file_extension(url):
response = requests.head(url)
if response.status_code != 200:
print(f"Failed to access media {url}")
return None
content_type = response.headers.get('Content-Type', '')
if 'image' in content_type:
return '.jpg'
elif 'video' in content_type:
return '.mp4'
else:
print(f"Unknown content type for media {url}")
return None
def extract_file_type(url):
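# The numeric code after the first dot in the snap filename appears to identify the media type; map the known codes to file extensions.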
file_types = {
'400': '.jpg',
'1322': '.mp4',
'1325': '.mp4',
'1034': '.mp4',
'1023': '.jpg'
}
base_url = url.split("?")[0] # Remove query string
snap_data = base_url.split('/')[-1]
# Extract the file type number
data_parts = snap_data.split('.')
if len(data_parts) > 1:
file_type_number = data_parts[1]
if file_type_number in file_types:
return file_types[file_type_number]
else:
print(f"Unexpected URL format: {base_url}")
return None
def download_media(url, filepath):
if os.path.exists(filepath):
# File already exists, skip download and return the filepath as if it was downloaded.
return filepath
response = requests.get(url)
if response.status_code != 200:
print(f"Failed to download media {url}")
return None
with open(filepath, 'wb') as f:
f.write(response.content)
return filepath
def get_snapchat_stories():
os.makedirs(directory, exist_ok=True)
os.makedirs(data_directory, exist_ok=True)
cursor.execute("SELECT username FROM following WHERE platform = 'snapchat' ORDER BY id DESC")
usernames = [row[0] for row in cursor.fetchall()]
cursor.execute("SELECT id, filename, username FROM media WHERE filename IS NOT NULL AND platform = 'snapchat' ORDER BY id DESC")
existing_medias = cursor.fetchall()
snapchat_users_data = get_all_users_data(usernames)
ready_stories = []
for username in usernames:
print(f"Getting stories for {username}...")
data = snapchat_users_data.get(username)
if not data:
print(f"Failed to get data for {username}. Skipping.")
continue
archive_data(data, username)
stories = get_stories(data)
stories.extend(get_highlight_stories(data))
for story in stories:
snap_id = story['snap_id']
url = story['url']
timestamp = story['timestamp']
duplicate_snap = find_duplicate_snap(existing_medias, snap_id, username)
if duplicate_snap:
# Snap already exists in the database
continue
# Determine the file extension from the media-type code embedded in the URL.
extension = extract_file_type(url)
if not extension:
print(f"Failed to determine file extension for {url}. Skipping.")
continue
filename = f"{username}~{timestamp}~{snap_id}{extension}"
filepath = os.path.join(directory, filename)
media = {
'username': username,
'timestamp': timestamp,
'filepath': filepath,
'snap_id': snap_id,
'original_snap_id': story['original_snap_id'],
'media_url': url,
}
ready_stories.append(media)
print(f"Media {snap_id} ready for download.")
# sort ready_stories by timestamp from oldest to newest
ready_stories.sort(key=lambda x: x['timestamp'])
return ready_stories
def get_snapchat_files():
stories = funcs.get_files(directory)
stories = [get_media_data(filepath) for filepath in stories]
stories = [story for story in stories if story]
return stories
def main():
ready_stories = get_snapchat_stories()
stories_from_files = get_snapchat_files()
ready_stories.extend(stories_from_files)
download_stories(ready_stories)
def download_stories(stories):
for story in stories:
# Download the media
filepath = story['filepath']
url = story['media_url']
filename = os.path.basename(filepath)
timestamp = story['timestamp']
filepath = download_media(url, filepath)
print(f"Downloaded {filename} at {timestamp}")
if not filepath:
continue
story['filepath'] = filepath
UploadMedia(story)
def UploadMedia(media):
username = media['username']
timestamp = media['timestamp']
filepath = media['filepath']
filename = os.path.basename(filepath)
snap_id = media['snap_id']
original_snap_id = media['original_snap_id']
thumbnail_url = None
phash = None
media_type = funcs.get_media_type(filename)
file_hash = funcs.calculate_file_hash(filepath)
post_date = datetime.fromtimestamp(int(timestamp))
width, height = funcs.get_media_dimensions(filepath)
duration = funcs.get_video_duration(filepath)
if media_type == 'image':
phash = funcs.generate_phash(filepath)
elif media_type == 'video':
try:
thumb_path = generate_thumbnail(filepath)
obj_storage.PutFile(thumb_path, f'thumbnails/{filename}')
thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{filename}"
phash = funcs.generate_phash(thumb_path)
os.remove(thumb_path)
except:
print('Error generating thumbnail. Skipping...')
return False
server_path = f'media/snaps/{username}/{filename}'
file_url = f"https://storysave.b-cdn.net/{server_path}"
obj_storage.PutFile(filepath, server_path)
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, hash, filename, duration, thumbnail, phash, platform, snap_id, original_snap_id) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, 'story', post_date, file_hash, filename, duration, thumbnail_url, phash, 'snapchat', snap_id, original_snap_id)
cursor.execute(query, values)
db.commit()
print(f'[{cursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
os.remove(filepath)
return True
def generate_thumbnail(filepath):
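# Grab the first frame of the video with OpenCV and write it out as a JPEG to use as the thumbnail.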
thumb_path = f'temp/{uuid4()}.jpg'
cap = cv2.VideoCapture(filepath)
ret, frame = cap.read()
cv2.imwrite(thumb_path, frame)
cap.release()
return thumb_path
def get_media_data(filepath):
filename = os.path.basename(filepath)
parts = filename.split('~')
if len(parts) < 3:
return False
username = parts[0]
timestamp = parts[1]
snap_id = parts[2]
snap_id = os.path.splitext(snap_id)[0]
data = {'username': username, 'timestamp': timestamp, 'filepath': filepath, 'snap_id': snap_id, 'original_snap_id': None, 'media_url': None}
# data = {'username': username, 'timestamp': timestamp, 'filepath': filepath, 'snap_id': None, 'original_snap_id': snap_id, 'media_url': None}
return data
def process_snap_ids(filenames):
snap_ids = []
for filename in filenames:
snap_id = filename.split('~')[2]
snap_id = os.path.splitext(snap_id)[0]
if snap_id not in snap_ids:
snap_ids.append(snap_id)
return snap_ids
if __name__ == '__main__':
print('Starting snappy...')
db, cursor = config.gen_connection()
obj_storage = config.get_storage()
main()
print("Processing completed.")

@ -2,50 +2,31 @@ from bs4 import BeautifulSoup
import requests
import json
doc_ids = [7663723823674585, 9539110062771438, 8964418863643891, 9066276850131169]
active_doc_id = doc_ids[3]
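# doc_ids are Instagram persisted GraphQL query document IDs; active_doc_id selects which query the requests below use.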
def findPost(filePath = 'test.json'):
params = {'av': '17841401225494803','__a': '1','__req': '1','__hs': '19906.HYP:instagram_web_pkg.2.1..0.1','dpr': '1','__ccg': 'UNKNOWN','__rev': '1014609539','__s': 'guk60j:651i2v:pmhu0r','__hsi': '7386834689999716220','__dyn': '7xe5WwlEnwn8K2Wmm1twpUnwgU7S6EdF8aUco38w5ux609vCwjE1xoswaq0yE6u0nS4oaEd86a3a1YwBgao1aU2swbOU2zxe2GewGw9a362W2K0zEnwhEe82mwww4cwJCwLyES1TwTwFwIwbS1LwTwKG1pg2Xwr86C1mwrd6goK3ibxKi2K7ErwYCz8rwHw','__csr': 'igAzIj5OgR5YBHdRtivbkyFv-zJIZE_ykzfahdAydeHCHAAAqyk4pqBgDzeV4-qlbBF29UlCxFpVokDwAyosyV9KWUmx6iu58WqdwSDCDAFwHxi3C00lWy2FG4k583NxW8yFE0bUyxd06lxO5C2a8yFm2u290ejg1JU2Gw2rQ061U','__comet_req': '7','fb_dtsg': 'NAcPDfX2XufdLkctek6zNxz3DWxPW4t-cJzz39QtOQ5KS-_Rq3erT4A:17843708194158284:1719013044','jazoest': '26262','lsd': 'D0zmaX16yIQu_GwDXKTbMc','__spin_r': '1014609539','__spin_b': 'trunk','__spin_t': '1719881474','__jssesw': '1','fb_api_caller_class': 'RelayModern','fb_api_req_friendly_name': 'PolarisProfilePageContentDirectQuery', 'variables': '{"id":"57771591453","render_surface":"PROFILE"}','server_timestamps': 'true','doc_id': '7663723823674585'}
data = requests.get('https://www.instagram.com/graphql/query')
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
}
posts = data['data']['xdt_api__v1__feed__user_timeline_graphql_connection']['edges']
posts = [post['node'] for post in posts]
def get_posts(username):
return max(posts, key=lambda post: max(c['width'] * c['height'] for c in post['image_versions2']['candidates']))
url = 'https://www.instagram.com/graphql/query/'
def getHDProfilePicture():
url = 'https://www.save-free.com/process'
variables = {
"data": {
"count": 12,
"include_reel_media_seen_timestamp": True,
"include_relationship_info": True,
"latest_besties_reel_media": True,
"latest_reel_media": True
},
"username": username,
"__relay_internal__pv__PolarisIsLoggedInrelayprovider": True,
"__relay_internal__pv__PolarisShareSheetV3relayprovider": False
}
zoom_data = {'instagram_url': 'natahalieeee','type': 'profile','resource': 'zoom'}
data = {'instagram_url': 'natahalieeee','type': 'profile','resource': 'save'}
params = {
'variables': json.dumps(variables),
'doc_id': active_doc_id
}
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36','Referer' : 'https://www.save-free.com/profile-downloader/',}
response = requests.get(url, headers=headers, params=params)
response = requests.post(url, data=data, headers=headers)
if response.status_code == 200:
try:
data = response.json()
posts = data['data']['xdt_api__v1__feed__user_timeline_graphql_connection']['edges']
end_cursor = data['data']['xdt_api__v1__feed__user_timeline_graphql_connection']['page_info']['end_cursor']
return posts
except (KeyError, TypeError) as e:
print(f"Error parsing JSON response: {e}")
return None
else:
print(f"Failed to fetch data. Status code: {response.status_code}")
return None
response = requests.post(url, data=zoom_data, headers=headers)
with open('image.jpg', 'wb') as f:
f.write(response.content)
def get_username_by_user_id(user_id):
url = 'https://www.instagram.com/graphql/query/'
@ -65,6 +46,10 @@ def get_username_by_user_id(user_id):
'variables': json.dumps(variables)
}
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
}
response = requests.get(url, headers=headers, params=params)
if response.status_code == 200:
@ -83,7 +68,7 @@ def extract_script_tags(username):
url = f"https://www.instagram.com/{username}/"
try:
# Fetch the HTML content of the page
response = requests.get(url, headers=headers)
response = requests.get(url)
response.raise_for_status()
# Parse the HTML content with BeautifulSoup
@ -146,166 +131,53 @@ def get_user_id(username):
def get_profile_data(username):
url = 'https://www.instagram.com/graphql/query'
user_id = get_user_id(username)
variables = {
"id": user_id,
"render_surface": "PROFILE"
}
data = {
'variables': json.dumps(variables),
'doc_id': active_doc_id
}
response = requests.post(url, headers=headers, data=data)
json_data = response.json()
return json_data
def get_hd_profile_picture(username = None, user_id = None):
api_url = 'https://www.instagram.com/graphql/query'
if not username and not user_id:
return None
if not user_id:
user_id = get_user_id(username)
if not user_id:
return None
variables = {
"id": user_id,
"render_surface": "PROFILE"
}
data = {
'variables': json.dumps(variables),
'doc_id': '9539110062771438'
}
data = {
'av': '17841401225494803',
'__d': 'www',
'__user': 0,
'__a': 1,
'__req': 4,
'__hs': '20231.HYP%3Ainstagram_web_pkg.2.1...1',
'dpr': 2,
'__ccg': 'GOOD',
'__rev': 1023131892,
'__s': 'g7nwhv%3Ad6c29x%3Aaag0uk',
'__hsi': 7507576467274562470,
'__dyn': '7xe5WwlEnwn8K2Wmm1twpUnwgU7S6EdF8aUco38w5ux609vCwjE1EE2Cw8G11wBw5Zx62G3i1ywOwa90Fw4Hw9O0Lbwae4UaEW2G0AEco5G0zEnwhE3Mw51wLyES1Twoob82ZwrUdUbGwmk0KU6O1FwlE6PhA6bwg8rAwHxW1oxe6UaU3cyUrw4rxO2C',
'__csr': 'gg84YIJgSyn2Ob7oDs-h7qhmToSsDl_8uAAaBigC8yQiaKJuumUkyybh4i9qBFaiayqBAVKczV4cBjhHUbqxeq3q9Suuum9zkEjAy9Ua8ymi45DUG7EgzoeUfKm2ym6UblG00kXK0jUE3Ug3dwh24DgAi1mo0AyaDw4WwiU1Y80bCm12g2Jwww5OCkE18Wc0mmqA4pU22wCw1Ucw06TW0csw7Gw',
'__hsdp': 'l2DMCyPBdbclSEgBiHWhqWiRV5kKKyoFtoYABrqafK699onQtK1fg96qiK5EZcIk0A5bwau0xVEhwAyQElwik0qi1cwam0m20ou06L82Ew56w4-w8O1Xw75wnoc85i',
'__hblp': '08K19xO0V89815oaEtwUCwhoOq4opxG5o8oS4Vk4U9o9o7C0zof82Nwg8uG0jV0Hweu1OwsE13o1ZU11UlwVwko2wwfy0G89E17U11EdU2cwuU5C0Yp8660Eo5idz8vxucw',
'__a': 1,
'__req': 2,
'__hs': '20047.HYP:instagram_web_pkg.2.1..0.1',
'dpr': 1,
'__ccg': 'EXCELLENT',
'__rev': 1018347086,
'__s': '8di41h:vwko3r:whjifd',
'__hsi': 7439320945163371549,
'__dyn': '7xe5WwlEnwn8K2Wmm1twpUnwgU7S6EdF8aUco38w5ux60p-0LVE4W0qa0FE2awgo1EUhwnU6a3a0EA2C0iK0D830wae4UaEW2G0AEco5G0zE5W0Y81eEdEGdwtU662O0Lo6-3u2WE15E6O1FwlE6PhA6bwg8rAwHxW1oCz8rwHwcOEym5oqw',
'__csr': 'hA5I8EAy7hnfqiIBklLZHVkmTHQmVmAh5UCchA9GQByu_yfD-nUBaVaDmSbDyUydCDgzyQAcggDK48Sm2ai8y8lxe6UTgmjwCyUC8yFXK9zooxmez9FUW684qu4awQwF9w04XAg0wi0nB03981oU082Oa0fMe3e19g512AK6Ulo5C3lw7Uy8G6Efo9k08mgiaaw25VobU2bw3KU023zw6Pw',
'__comet_req': 7,
'fb_dtsg': 'NAfvHXND-ELXKZFgyrogJIig1C4j6gRiNUaBBBomMZ1mNa-FvpKl6bw%3A17854231342124680%3A1731941013',
'jazoest': 26187,
'lsd': 'NFD0t4uLm10VsaniLLl9nv',
'__spin_r': 1023131892,
'fb_dtsg': 'NAcO7gvrsNlfWXA8giwQC4bVYRXXAGomAqcIRYUJUE2Hk8HmABf56Yg:17854575481098892:1732030177',
'jazoest': 26190,
'lsd': 'zcsn3c8we8kpMB_AVukeii',
'__spin_r': 1018347086,
'__spin_b': 'trunk',
'__spin_t': 1747993861,
'__crn': 'comet.igweb.PolarisProfilePostsTabRoute',
'__spin_t': 1732101883,
'fb_api_caller_class': 'RelayModern',
'fb_api_req_friendly_name': 'PolarisProfileNoteBubbleQuery',
'variables': '%7B%22user_id%22%3A%228309584937%22%7D',
'server_timestamps': True,
'doc_id': 8698637896906070
'fb_api_req_friendly_name': 'PolarisProfilePageContentQuery',
'variables': '{"id":"6687693830","render_surface":"PROFILE"}',
'server_timestamps': 'true',
'doc_id': 9539110062771438
}
try:
response = requests.post(api_url, data=data)
json_data = response.json()
if 'message' in json_data:
if json_data['message'] == 'Please wait a few minutes before you try again.':
print('Rate limited. Please try again later.')
return None
hd_profile_pic = json_data['data']['user']['hd_profile_pic_url_info']['url']
except:
hd_profile_pic = None
return hd_profile_pic
def get_user_id_by_username(username):
url = 'https://www.instagram.com/graphql/query'
variables = {
"data": {
"context": "blended",
"include_reel": True,
"query": username,
"rank_token": "",
"search_surface": "web_top_search"
},
"hasQuery": True
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
}
data = {
'variables': json.dumps(variables),
'doc_id': active_doc_id
}
response = requests.post(url, headers=headers, data=data)
if response.status_code == 200:
json_data = response.json()
users = json_data['data']['xdt_api__v1__fbsearch__topsearch_connection']['users']
for user in users:
user_data = user['user']
if user_data['username'] == username:
return user_data['pk']
else:
print(f"Failed to fetch data. Status code: {response.status_code}")
return None
def get_user_id_api(username):
url = f"https://www.instagram.com/api/v1/users/web_profile_info/?username={username}"
headers['referer'] = f"https://www.instagram.com/{username}/"
headers['x-ig-app-id'] = '936619743392459'
response = requests.get(url, headers=headers)
json_data = response.json()
if response.status_code == 200:
try:
data = response.json()
user_id = data['data']['user']['id']
return user_id
except (KeyError, TypeError) as e:
print(f"Error parsing JSON response: {e}")
return None
else:
print(f"Failed to fetch data. Status code: {response.status_code}")
return None
return json_data
def get_highest_quality_image(image_versions):
max_res = 0
max_res_url = None
for image in image_versions:
if image['width'] > max_res:
max_res = image['width']
max_res_url = image['url']
return max_res_url
def parse_post(post):
medias = post['node']['carousel_media']
media_items = []
for media in medias:
media_item = {}
username_check = 'tal_ohana'
image_versions = media['image_versions2']['candidates']
media_item['image_url'] = get_highest_quality_image(image_versions)
media_item['pk'] = media['pk']
media_item['media_type'] = media['media_type']
user_id = get_user_id(username_check)
media_items.append(media_item)
username = get_username_by_user_id(user_id)
return media_items
if username:
print(f"Username: {username}")
else:
print("Could not retrieve username.")

@ -1,71 +1,48 @@
from datetime import datetime, timedelta
from datetime import datetime
from uuid import uuid4
import config
import funcs
import json
import config
import cv2
import os
import re
temp_directory = ".temp"
directory = 'media'
os.makedirs(temp_directory, exist_ok=True)
media_types = {
'stories': 'story',
'posts': 'post',
'profile': 'profile'
}
for media_type, _ in media_types.items():
os.makedirs(os.path.join(directory, media_type), exist_ok=True)
existing_media_ids = set()
UPLOAD_CUSTOM = False
CACHE_FILE = os.path.join(temp_directory, 'existing_media_ids.json')
CACHE_TTL = timedelta(hours=48)
directory = 'storysaver'
def UploadMedia(media):
platform = 'Instagram'
media_id = media['media_id']
username = media['username']
timestamp = media['timestamp']
user_id = media['user_id']
filepath = media['filepath']
platform = media['platform']
media_id = media['media_id']
timestamp = media['timestamp']
highlight_id = media['highlight_id']
post_type = media['post_type']
file_size = os.path.getsize(filepath)
thumbnail_url = None
phash = None
if media_id and media_id in existing_media_ids:
if media_id and int(media_id) in existing_files:
print('Duplicate file detected. Removing...')
os.remove(filepath)
return True
file_size = os.path.getsize(filepath)
filename = os.path.basename(filepath)
file_extension = os.path.splitext(filename)[1].lower()
file_hash = funcs.calculate_file_hash(filepath)
if not user_id:
user_id = get_user_id(username)
media_type = funcs.get_media_type(filename)
if not media_type:
print(f'Error determining media type for {filename}. Skipping...')
return False
try:
post_date = datetime.fromtimestamp(int(timestamp))
except:
post_date = datetime.fromtimestamp(os.path.getctime(filepath))
width, height = funcs.get_media_dimensions(filepath)
if 0 in (width, height):
print(f'Error getting dimensions for {filename}. Skipping...')
post_type = funcs.determine_post_type(filepath)
if not post_type:
print(f'Error determining post type for {filename}. Skipping...')
return False
file_hash = funcs.calculate_file_hash(filepath)
post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()
width, height = funcs.get_media_dimensions(filepath)
duration = funcs.get_video_duration(filepath)
if media_type == 'image':
@ -73,307 +50,141 @@ def UploadMedia(media):
elif media_type == 'video':
try:
thumb_path = generate_thumbnail(filepath)
obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg')
thumbnail_url = f"https://cdn.altpins.com/thumbnails/{file_hash}.jpg"
obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg') # this might be a problem in case of duplicate hashes
thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{file_hash}.jpg"
phash = funcs.generate_phash(thumb_path)
os.remove(thumb_path)
except Exception as e:
print(f'Error generating thumbnail: {e}. Skipping...')
return False
custom_filename = media_id if media_id else file_hash
newFilename = f'{custom_filename}{file_extension}'
newFilename = f'{media_id}{file_extension}'
server_path = f'media/{post_type}/{username}/{newFilename}'
file_url = f"https://cdn.altpins.com/{server_path}"
obj_storage.PutFile(filepath, server_path)
file_url = f"https://storysave.b-cdn.net/{server_path}"
obj_storage.PutFile(filepath, server_path) # slow as fuck
if highlight_id:
newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)",
(highlight_id, user_id, media_id))
newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)", (highlight_id, user_id, media_id))
newDB.commit()
print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')
post_type = 'story' if post_type == 'stories' else 'post'
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform, file_size) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, platform, file_size)
newCursor.execute(query, values)
newCursor.execute(query, values) # slower
newDB.commit()
correct_emoji = '✅' if newCursor.rowcount > 0 else '❌'
print(f'{correct_emoji} added {filename} to database')
print(f'File: {filename}')
print(f'URL: {file_url}')
print(f'Pin URL: https://altpins.com/pin/{newCursor.lastrowid}')
print("=" * 100)
print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
os.remove(filepath)
existing_media_ids.add(media_id)
return newCursor.lastrowid
return True
def generate_thumbnail(filepath):
thumb_path = os.path.join(temp_directory, f'{uuid4()}.jpg')
def generate_thumbnail(filepath):
thumb_path = f'temp/{uuid4()}.jpg'
cap = cv2.VideoCapture(filepath)
ret, frame = cap.read()
cv2.imwrite(thumb_path, frame)
cap.release()
return thumb_path
def get_user_id(username):
username = username.lower()
if username in existing_users:
return existing_users[username]
return None
def get_media_data(filepath):
filename = os.path.basename(filepath)
parts = filename.split('~')
if len(parts) != 4:
if len(parts) < 4:
return False
username = parts[0]
timestamp = parts[1]
media_id = parts[2]
user_id = parts[3].split('_')[-1].split('.')[0]
platform = 'instagram'
highlight_id = user_id.replace('highlight', '') if 'highlight' in user_id else None
if user_id.isdigit():
user_id = int(user_id)
else:
if highlight_id:
user_id = get_user_id(username)
if media_id.isdigit():
try:
media_id = int(media_id)
else:
except:
print(f'Invalid media_id for file {filename}. Skipping...')
media_id = None
data = {'username': username, 'timestamp': timestamp, 'media_id': media_id, 'user_id': user_id,
'filepath': filepath, 'highlight_id': highlight_id, 'platform': platform}
return data
data = {'username': username, 'timestamp': timestamp, 'media_id': media_id, 'user_id': user_id, 'filepath': filepath, 'highlight_id': highlight_id}
return data
def get_media():
def get_media(folder_path):
medias = []
failed_medias = []
for media_type, post_type in media_types.items():
media_folder_path = os.path.join(directory, media_type)
if not os.path.exists(media_folder_path):
continue
all_files = funcs.get_files(media_folder_path)
for filepath in all_files:
for root, dirs, files in os.walk(folder_path):
for filename in files:
filepath = os.path.join(root, filename)
data = get_media_data(filepath)
if not data:
failed_medias.append(filepath)
continue
data['post_type'] = post_type
medias.append(data)
return medias, failed_medias
def get_custom_media(failed_medias):
medias = []
for media_type, post_type in media_types.items():
folder_path = os.path.join(directory, media_type)
user_dirs = [d for d in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, d))]
for username in user_dirs:
user_folder_path = os.path.join(folder_path, username)
for filename in os.listdir(user_folder_path):
if filename.startswith('.'):
continue
filepath = os.path.join(user_folder_path, filename)
if not filepath in failed_medias:
continue
user_id = get_user_id(username)
timestamp = int(os.path.getctime(filepath))
media_id = os.path.splitext(filename)[0]
if media_id.isdigit():
media_id = int(media_id)
if media_id < 10000000:
media_id = None
else:
media_id = None
data = {
"username": username,
"timestamp": timestamp,
"media_id": media_id,
"user_id": user_id,
"filepath": filepath,
"platform": 'instagram',
"highlight_id": None,
"post_type": post_type
}
if data:
medias.append(data)
return medias
def save_highlight_data(highlights):
filename = f'{uuid4()}.json'
filepath = os.path.join('highlight_data', filename)
with open(filepath, 'w') as f:
json.dump(highlights, f)
def dump_instagram():
medias, failed_medias = get_media()
medias = clean_dupes(medias)
failed_medias = get_custom_media(failed_medias)
medias.sort(key=lambda x: (x['username'].lower(), x['timestamp']))
new_user_ids = {}
for media in medias:
user_id = media['user_id']
username = media['username']
if not media['user_id']:
continue
if username in existing_users:
continue
existing_users[username] = user_id
new_user_ids[username] = user_id
for media in medias:
if media['user_id']:
continue
if media['username'] in new_user_ids:
media['user_id'] = new_user_ids[media['username']]
highlights = []
def dump_instagram(folder_path):
medias = get_media(folder_path)
if cleanup_dupe_stories(medias):
medias = get_media(folder_path)
for media in medias:
if not media['highlight_id']:
continue
highlights.append({
"media_id": media["media_id"],
"user_id": media["user_id"],
"highlight_id": media['highlight_id'],
"username": media['username'],
})
if highlights:
save_highlight_data(highlights)
for media in medias:
pinid = UploadMedia(media)
existing_media_ids.add(media['media_id'])
if UPLOAD_CUSTOM:
for media in failed_medias:
pinid = UploadMedia(media)
UploadMedia(media)
existing_files.append(media['media_id'])
def clean_dupes(medias):
def cleanup_dupe_stories(medias):
removed_count = 0
new_medias = []
for media in medias:
media_id = media['media_id']
filepath = media['filepath']
if not media_id:
print(f'Invalid media_id for file {filepath}. Skipping...')
continue
if media_id in existing_media_ids:
if media_id in existing_files:
removed_count += 1
print(f'Found duplicate file {filepath}. Removing...')
os.remove(filepath)
continue
if re.search(r'\(\d+\)', filepath):
if '(1)' in filepath:
removed_count += 1
print(f'Found duplicate file {filepath}. Removing...')
os.remove(filepath)
continue
new_medias.append(media)
print(f'Removed {removed_count} duplicate files.')
return new_medias
# -------------------- CACHE SYSTEM --------------------
def get_cached_data():
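# Returns (media_ids set, users dict keyed by lowercase username, last seen media row id), or (None, None, None) when the cache is missing or unreadable.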
if not os.path.exists(CACHE_FILE):
print('No cache file found. Generating new cache…')
return None, None, None
try:
with open(CACHE_FILE, 'r') as f:
cache = json.load(f)
media_ids = set(cache.get('media_ids', []))
users = {k.lower(): v for k, v in cache.get('existing_users', {}).items()}
last_id = cache.get('last_id', 0)
return media_ids, users, last_id
except Exception as e:
print(f"Cache read error: {e}")
return None, None, None
def save_cached_data(media_ids, existing_users, last_id):
with open(CACHE_FILE, 'w') as f:
json.dump({
'timestamp': datetime.now().isoformat(),
'media_ids': list(media_ids),
'existing_users': existing_users,
'last_id': last_id
}, f)
def get_user_ids(cur):
cur.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL AND platform='instagram'")
rows = cur.fetchall()
return {user['username'].lower(): user['user_id'] for user in rows}
def get_existing_media_ids(cur):
cur.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform='instagram' AND status='public'")
rows = cur.fetchall()
media_ids = {row['media_id'] for row in rows}
last_id = max((row['id'] for row in rows), default=0)
return media_ids, last_id
def get_existing_medias(cur):
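# Warm path: fetch only media rows newer than the cached last_id and merge them into the cached set; a cold cache rebuilds everything from the database.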
media_ids, users, last_id = get_cached_data()
if not media_ids or not users:
print('Cold cache → pulling full data...')
media_ids, last_id = get_existing_media_ids(cur)
users = get_user_ids(cur)
save_cached_data(media_ids, users, last_id)
return media_ids, users
cur.execute("SELECT id, media_id FROM media WHERE media_id IS NOT NULL AND platform='instagram' AND status='public' AND id > %s ORDER BY id ASC", (last_id,))
rows = cur.fetchall()
for r in rows:
media_ids.add(r['media_id'])
last_id = max(last_id, r['id'])
if rows:
save_cached_data(media_ids, users, last_id)
return media_ids, users
# -------------------- MAIN --------------------
return removed_count
if __name__ == '__main__':
print('Starting processing...')
if not funcs.get_files(directory):
if not os.listdir(directory):
print('No files to process. Exiting...')
exit()
newDB, newCursor = config.gen_connection()
obj_storage = config.get_storage()
existing_media_ids, existing_users = get_existing_medias(newCursor)
dump_instagram()
newCursor.execute("SELECT media_id FROM media WHERE media_id IS NOT NULL")
existing_files = [image[0] for image in newCursor.fetchall()]
newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
existing_users = {user[0].lower(): user[1].lower() for user in newCursor.fetchall()}
dump_instagram(directory)
print("Processing completed.")

@ -0,0 +1,147 @@
from datetime import datetime
from uuid import uuid4
import funcs
import config
import cv2
import os
media_directory = "media/ready_for_upload"
platform = "instagram"
working_directory = os.path.join(media_directory, platform)
def UploadMedia(media):
username = media['username']
user_id = media['user_id']
filepath = media['filepath']
platform = media['platform']
media_id = media['media_id']
thumbnail_url = None
phash = None
filename = os.path.basename(filepath)
file_extension = os.path.splitext(filename)[1].lower()
media_type = funcs.get_media_type(filename)
if not media_type:
print(f'Error determining media type for {filename}. Skipping...')
return False
post_type = funcs.determine_post_type(filepath)
if not post_type:
print(f'Error determining post type for {filename}. Skipping...')
return False
file_hash = funcs.calculate_file_hash(filepath)
post_date = datetime.now()
width, height = funcs.get_media_dimensions(filepath)
duration = funcs.get_video_duration(filepath)
if media_type == 'image':
phash = funcs.generate_phash(filepath)
elif media_type == 'video':
try:
thumb_path = generate_thumbnail(filepath)
obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg') # this might be a problem in case of duplicate hashes
thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{file_hash}.jpg"
phash = funcs.generate_phash(thumb_path)
os.remove(thumb_path)
except:
print('Error generating thumbnail. Skipping...')
return False
newFilename = f'{file_hash}{file_extension}'
server_path = f'media/{post_type}/{username}/{newFilename}'
file_url = f"https://storysave.b-cdn.net/{server_path}"
obj_storage.PutFile(filepath, server_path) # slow as fuck
post_type = 'story' if post_type == 'stories' else 'post'
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform, media_id) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, platform, media_id)
newCursor.execute(query, values) # slower
newDB.commit()
print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
os.remove(filepath)
return True
def generate_thumbnail(filepath):
thumb_path = f'temp/{uuid4()}.jpg'
cap = cv2.VideoCapture(filepath)
ret, frame = cap.read()
cv2.imwrite(thumb_path, frame)
cap.release()
return thumb_path
def get_user_id(username):
username = username.lower()
if username in existing_users:
return existing_users[username]
return None
def get_media(folder_path):
medias = []
user_folders = os.listdir(folder_path)
for user_folder in user_folders:
user_folder_path = os.path.join(folder_path, user_folder)
if not os.path.isdir(user_folder_path):
continue
files = os.listdir(user_folder_path)
for filename in files:
filepath = os.path.join(folder_path, user_folder, filename)
# skip file if its hidden
if filename.startswith('.'):
continue
try:
media_id = filename.split('.')[0]
media_id = int(media_id)
except:
media_id = None
media = {
'username': user_folder,
'filepath': filepath,
'user_id': get_user_id(user_folder),
'media_id': media_id,
'platform': platform
}
medias.append(media)
return medias
def dump_instagram(folder_path):
medias = get_media(folder_path)
for media in medias:
UploadMedia(media)
if __name__ == '__main__':
print('Starting processing...')
newDB, newCursor = config.gen_connection()
obj_storage = config.get_storage()
newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
existing_users = {user[0].lower(): user[1].lower() for user in newCursor.fetchall()}
dump_instagram(working_directory)
print("Processing completed.")

@ -0,0 +1,142 @@
from datetime import datetime
import config
import funcs
import cv2
import os
directory = 'storysaver'
def UploadMedia(media):
media_id = media['media_id']
username = media['username']
post_date = media['timestamp']
user_id = media['user_id']
filepath = media['filepath']
highlight_id = media['highlight_id']
post_type = media['post_type']
thumbnail_url = None
phash = None
if media_id and int(media_id) in existing_files:
print('Duplicate file detected. Removing...')
os.remove(filepath)
return True
filename = os.path.basename(filepath)
file_extension = os.path.splitext(filename)[1].lower()
media_type = funcs.get_media_type(filename)
file_hash = funcs.calculate_file_hash(filepath)
width, height = funcs.get_media_dimensions(filepath)
duration = funcs.get_video_duration(filepath)
if media_type == 'video':
try:
thumbPath = f'temp/{media_id}.jpg'
cap = cv2.VideoCapture(filepath)
ret, frame = cap.read()
cv2.imwrite(thumbPath, frame)
cap.release()
obj_storage.PutFile(thumbPath, f'thumbnails/{media_id}.jpg') # slower
thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{media_id}.jpg"
phash = funcs.generate_phash(thumbPath)
os.remove(thumbPath)
except:
print('Error generating thumbnail. Skipping...')
return False
elif media_type == 'image':
phash = funcs.generate_phash(filepath)
if media_id:
newFilename = f'{media_id}{file_extension}'
else:
newFilename = f'{file_hash}{file_extension}'
server_path = f'media/{post_type}/{username}/{newFilename}'
file_url = f"https://storysave.b-cdn.net/{server_path}"
obj_storage.PutFile(filepath, server_path) # slow as fuck
if highlight_id:
newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)", (highlight_id, user_id, media_id))
newDB.commit()
print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, 'instagram')
newCursor.execute(query, values) # slower
newDB.commit()
print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
os.remove(filepath)
return True
def get_user_id(username):
username = username.lower()
if username in existing_users:
return existing_users[username]
return None
def get_media():
medias = []
post_types = {
'posts': 'post',
'stories': 'story',
'profile': 'profile',
}
for post_type in os.listdir('media'):
users = os.listdir(f'media/{post_type}')
for user in users:
user_path = f'media/{post_type}/{user}'
for filename in os.listdir(user_path):
data = {}
filepath = os.path.join(user_path, filename)
data['post_type'] = post_types[post_type]
data['username'] = user
data['timestamp'] = filename.split('__')[-1].split('.')[0] if 'com.instagram.android__' in filename else datetime.now()
if 'com.instagram.android__' in filename:
data['timestamp'] = datetime.strptime(data['timestamp'], '%Y%m%d%H%M%S%f')
data['filepath'] = filepath
data['media_id'] = None
data['user_id'] = get_user_id(data['username'])
data['highlight_id'] = None
medias.append(data)
return medias
def dump_instagram():
medias = get_media()
for media in medias:
UploadMedia(media)
existing_files.append(media['media_id'])
if __name__ == '__main__':
print('Starting processing...')
if not os.listdir(directory):
print('No files to process. Exiting...')
exit()
newDB, newCursor = config.gen_connection()
obj_storage = config.get_storage()
newCursor.execute("SELECT media_id FROM media WHERE media_id IS NOT NULL")
existing_files = [image[0] for image in newCursor.fetchall()]
newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
existing_users = {user[0].lower(): user[1].lower() for user in newCursor.fetchall()}
dump_instagram(directory)
print("Processing completed.")

@ -0,0 +1,140 @@
from datetime import datetime
from uuid import uuid4
import funcs
import config
import cv2
import os
directory = 'processed_tiktoks'
def UploadMedia(media):
platform = 'TikTok'
username = media['username']
filepath = media['filepath']
file_size = os.path.getsize(filepath)
thumbnail_url = None
phash = None
filename = os.path.basename(filepath)
file_extension = os.path.splitext(filename)[1].lower()
media_type = funcs.get_media_type(filename)
if not media_type:
print(f'Error determining media type for {filename}. Skipping...')
return False
post_type = funcs.determine_post_type(filepath)
if not post_type:
print(f'Error determining post type for {filename}. Skipping...')
return False
file_hash = funcs.calculate_file_hash(filepath)
if file_hash in existing_hashes:
print(f'File {filename} already exists. Skipping...')
return False
post_date = datetime.now()
width, height = funcs.get_media_dimensions(filepath)
duration = funcs.get_video_duration(filepath)
if media_type == 'image':
phash = funcs.generate_phash(filepath)
elif media_type == 'video':
try:
thumb_path = generate_thumbnail(filepath)
obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg') # this might be a problem in case of duplicate hashes
thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{file_hash}.jpg"
phash = funcs.generate_phash(thumb_path)
os.remove(thumb_path)
except:
print('Error generating thumbnail. Skipping...')
return False
newFilename = f'{file_hash}{file_extension}'
server_path = f'media/tiktoks/{username}/{newFilename}'
file_url = f"https://storysave.b-cdn.net/{server_path}"
obj_storage.PutFile(filepath, server_path) # slow as fuck
post_type = 'story' if post_type == 'stories' else 'post'
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, hash, filename, duration, thumbnail, phash, platform, file_size) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
values = (username, media_type, file_url, width, height, post_type, post_date, file_hash, filename, duration, thumbnail_url, phash, platform, file_size)
newCursor.execute(query, values) # slower
newDB.commit()
print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
os.remove(filepath)
return True
def generate_thumbnail(filepath):
thumb_path = f'temp/{uuid4()}.jpg'
cap = cv2.VideoCapture(filepath)
ret, frame = cap.read()
cv2.imwrite(thumb_path, frame)
cap.release()
return thumb_path
def get_media_data(filepath):
filename = os.path.basename(filepath)
parts = filename.split('~')
if len(parts) == 3:
username, title, tiktok_id = parts
elif len(parts) == 2:
username, title = parts
tiktok_id = None
else:
return False
data = {'username': username, 'filepath': filepath, 'tiktok_id': tiktok_id, 'title': title}
return data
def get_media(folder_path):
medias = []
users = os.listdir(folder_path)
for user in users:
user_folder = os.path.join(folder_path, user)
if not os.path.isdir(user_folder):
print(f"Skipping {user}")
continue
files = os.listdir(user_folder)
for filename in files:
filepath = os.path.join(user_folder, filename)
data = get_media_data(filepath)
if data:
medias.append(data)
return medias
def dump_instagram(folder_path):
medias = get_media(folder_path)
for media in medias:
UploadMedia(media)
if __name__ == '__main__':
print('Starting processing...')
if not os.listdir(directory):
print('No files to process. Exiting...')
exit()
newDB, newCursor = config.gen_connection()
obj_storage = config.get_storage()
newCursor.execute("SELECT hash FROM media WHERE hash IS NOT NULL AND platform = 'TikTok'")
existing_hashes = [row[0] for row in newCursor.fetchall()]
dump_instagram(directory)
print("Processing completed.")

@ -0,0 +1,58 @@
from uuid import uuid4
import uuid
import os
def is_valid_uuid(uuid_to_test, version=4):
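# Round-trip the string through uuid.UUID and compare; only a canonical UUID of the requested version survives unchanged.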
try:
uuid_obj = uuid.UUID(uuid_to_test, version=version)
except ValueError:
return False
return str(uuid_obj) == uuid_to_test
source_dir = 'tiktoks/'
processed_dir = 'processed_tiktoks'
os.makedirs(processed_dir, exist_ok=True)
users = os.listdir(source_dir)
for user in users:
user_dir = os.path.join(source_dir, user)
if not os.path.isdir(user_dir):
print(f"Skipping {user}")
continue
for file in os.listdir(user_dir):
filename = os.path.splitext(file)[0]
filepath = os.path.join(user_dir, file)
file_ext = os.path.splitext(file)[1]
tiktok_id = str(uuid4())
username = user
if is_valid_uuid(filename):
title = ''
tiktok_id = filename
elif 'masstik' in file or 'masstiktok' in file:
data = file.split('_')
title = filename.split('_')[-1]
else:
title = filename
print("="*100)
title = title.encode('utf-8', 'ignore').decode('utf-8')
print(f"Username: {username}\nTitle: {title}")
new_filename = f"{username}~{title}~{tiktok_id}{file_ext}"
new_filepath = os.path.join(processed_dir, username, new_filename)
os.makedirs(os.path.dirname(new_filepath), exist_ok=True)
if not os.path.exists(new_filepath):
os.rename(filepath, new_filepath)
print(f"Renamed {file} to {new_filepath}")
else:
print("File with the same name already exists. Renaming aborted.")
print("="*100)

@ -0,0 +1,38 @@
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer
import shutil
import time
import os
class DownloadHandler(FileSystemEventHandler):
def process_file(self, file_path):
file = os.path.basename(file_path)
if 'crdownload' not in file and file.count('~') == 3:
print(f'Moving {file}...')
outputPath = os.path.join('storysaver', file)
try:
shutil.move(file_path, outputPath)
except Exception as e:
print(f'Failed to move file: {e}')
def on_created(self, event):
if not event.is_directory and 'crdownload' not in event.src_path:
self.process_file(event.src_path)
def on_moved(self, event):
if not event.is_directory and 'crdownload' not in event.dest_path:
self.process_file(event.dest_path)
if __name__ == "__main__":
downloadPath = os.path.join(os.path.expanduser('~'), 'Downloads')
event_handler = DownloadHandler()
observer = Observer()
observer.schedule(event_handler, downloadPath, recursive=False)
observer.start()
try:
while True:
time.sleep(1) # Add a 1-second sleep to reduce CPU usage
except KeyboardInterrupt:
observer.stop()
observer.join()

@ -84,20 +84,19 @@ def parse_media_data(media_item):
mediaInfo = {'taken_at': taken_at, 'post_type' : post_type, 'media_type': mediaTypes[media_item.media_type]}
if media_item.media_type not in [1, 2]:
print(f"Unsupported media type with ID {media_item.pk}")
return None
mediaInfo['media_id'] = int(media_item.pk)
if media_item.media_type == 1: # Image
mediaInfo['media_id'] = int(media_item.pk)
mediaInfo['fileURL'] = media_item.thumbnail_url
mediaInfo['filename'] = f"{media_item.pk}.jpg" # Fix this, get the actual file extension
mediaInfo['filename'] = f"{media_item.pk}.jpg"
elif media_item.media_type == 2: # Video
mediaInfo['media_id'] = int(media_item.pk)
mediaInfo['fileURL'] = media_item.video_url
try:mediaInfo['duration'] = media_item.video_duration # Fix this, get the actual file extension
try:mediaInfo['duration'] = media_item.video_duration
except:mediaInfo['duration'] = 0
mediaInfo['filename'] = f"{media_item.pk}.mp4"
else:
print(f"Unsupported media type with ID {media_item.pk}")
return None
return mediaInfo

@ -5,7 +5,6 @@ from uuid import uuid4
from PIL import Image
import config
import funcs
import json
import os
def insert_highlight_items(media_ids, highlight_id, title, user_id):
@ -32,28 +31,23 @@ def upload_to_storage(local_path, server_path):
print(f"Failed to upload {local_path} to {server_path}. Error: {e}")
def login(force=False):
def login():
client = Client()
try:
if not force:
client.load_settings("session_data.json")
else:
raise FileNotFoundError
except (FileNotFoundError, json.JSONDecodeError):
with open("p.enc", "rb") as encrypted_file:
encrypted_data = encrypted_file.read()
fernet = Fernet(open("key.enc", "r").read())
password = str(fernet.decrypt(encrypted_data), "utf-8")
username = "olivercury"
auth = input("Enter your 2FA code (leave blank if not enabled): ")
if auth:
client.login(username=username, password=password, verification_code=auth)
else:
client.login(username, password)
client.dump_settings("session_data.json")
if os.path.exists("session_data.json"):
client.load_settings("session_data.json")
return client
with open("p.enc", "rb") as encrypted_file:
encrypted_data = encrypted_file.read()
fernet = Fernet(open("key.enc", "r").read())
password = str(fernet.decrypt(encrypted_data), "utf-8")
username = "olivercury"
auth = input("Enter your 2FA code (leave blank if not enabled): ")
client.login(username=username, password=password, verification_code=auth)
client.dump_settings("session_data.json")
print("Logged in successfully.")
@ -204,11 +198,8 @@ if __name__ == "__main__":
for mediaInfo in medias:
filePath = os.path.join('media', mediaInfo['post_type'], username, mediaInfo['filename'])
filePath = funcs.download_file(mediaInfo['media_url'], filePath)
if not filePath:
continue
funcs.download_file(mediaInfo['media_url'], filePath)
mediaInfo["hash"] = funcs.calculate_file_hash(filePath)
mediaInfo["username"] = username

@ -1,143 +0,0 @@
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
# --- Configuration ---
USERNAME = "maorshabakov" # your Instagram username
PASSWORD = "PeyxCU%MD*Zq9p" # your Instagram password
TARGET_USER = "cata.leyah" # the username of the profile to scrape
DOWNLOAD_DIR = "downloads" # directory to save media
SCROLL_PAUSE_TIME = 2 # seconds to wait after each scroll
# --- Helper functions ---
def login_instagram(driver, username, password):
driver.get("https://www.instagram.com/accounts/login/")
time.sleep(3) # wait for the login page to load
# Accept cookies if prompted (may need to adjust for your region)
try:
accept_button = driver.find_element(By.XPATH, "//button[text()='Allow all cookies']")
accept_button.click()
time.sleep(2)
except Exception:
pass
# check if already logged in by checking if the current url has been redirected to the home page
if driver.current_url == "https://www.instagram.com/":
print("Already logged in.")
return
# Enter username and password
username_input = driver.find_element(By.NAME, "username")
password_input = driver.find_element(By.NAME, "password")
username_input.send_keys(username)
password_input.send_keys(password)
password_input.send_keys(Keys.RETURN)
time.sleep(5) # wait for login to complete
def scroll_to_load_posts(driver, post_count=12):
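# Keep scrolling to the bottom of the profile until either post_count unique post links have been collected or the page height stops growing.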
post_links = dict()
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(SCROLL_PAUSE_TIME)
new_height = driver.execute_script("return document.body.scrollHeight")
new_posts = get_post_links(driver)
for link in new_posts:
if link not in post_links:
post_links[link] = True
if len(post_links) >= post_count:
break
if new_height == last_height:
break
last_height = new_height
def get_post_links(driver):
# Find all post links on the profile page.
# Instagram posts are links with hrefs that contain '/p/'
post_elements = driver.find_elements(By.XPATH, "//a[contains(@href, '/p/')]")
links = [elem.get_attribute("href") for elem in post_elements]
# Remove duplicates
return list(set(links))
def download_media(url, download_folder, filename):
response = requests.get(url, stream=True)
if response.status_code == 200:
filepath = os.path.join(download_folder, filename)
with open(filepath, 'wb') as f:
for chunk in response.iter_content(1024):
f.write(chunk)
print(f"Downloaded: {filename}")
else:
print(f"Failed to download: {url}")
def extract_media_url(driver):
# Try to get video first
try:
video = driver.find_element(By.TAG_NAME, "video")
media_url = video.get_attribute("src")
if media_url:
return media_url, "mp4"
except Exception:
pass
# Fallback to image extraction
try:
# Sometimes the post image is inside a div with role="button"
image = driver.find_element(By.XPATH, "//img[contains(@src, 'scontent')]")
media_url = image.get_attribute("src")
if media_url:
return media_url, "jpg"
except Exception:
pass
return None, None
# --- Main script ---
def main():
os.makedirs(DOWNLOAD_DIR, exist_ok=True)
chrome_options = Options()
chrome_options.add_argument("--user-data-dir=.profiles/thenigga")
driver = webdriver.Chrome(options=chrome_options)
driver.maximize_window()
try:
# Log in to Instagram
login_instagram(driver, USERNAME, PASSWORD)
# Navigate to the target user's profile
driver.get(f"https://www.instagram.com/{TARGET_USER}/")
time.sleep(5) # let the page load
# Scroll down to load all posts
scroll_to_load_posts(driver)
# Gather all post links from the profile page
post_links = get_post_links(driver)
print(f"Found {len(post_links)} posts.")
# Process each post
for idx, post_link in enumerate(post_links):
driver.get(post_link)
time.sleep(3) # wait for post to load
# click download button where div class post-download-all-button
download_button = driver.find_element(By.XPATH, "//div[@class='post-download-all-button']")
driver.execute_script("arguments[0].click();", download_button)
time.sleep(1)
finally:
driver.quit()
if __name__ == "__main__":
main()