Compare commits
10 Commits (48d2330193 ... 42afcdc539)

| Author | SHA1 | Date |
|---|---|---|
| | 42afcdc539 | 3 weeks ago |
| | d440a25e1d | 3 weeks ago |
| | bfecfa05b6 | 3 weeks ago |
| | 00691be490 | 1 month ago |
| | a65cc43999 | 3 months ago |
| | 55484ebf11 | 6 months ago |
| | 373f3ab661 | 8 months ago |
| | e6ad418ecd | 9 months ago |
| | 445b0ad9f0 | 9 months ago |
| | ad39eeaed1 | 9 months ago |
@@ -1,35 +0,0 @@
import os
import json
import gzip

data_dir = 'data'
data_compressed_dir = 'data_compressed'
os.makedirs(data_compressed_dir, exist_ok=True)

def compress_file(filepath, output_file):
    with open(filepath, 'r') as f:
        data = json.load(f)
    compress_data(data, output_file)
    return output_file

def compress_data(data, output_file):
    with gzip.open(output_file, 'wb') as f:
        f.write(json.dumps(data).encode('utf-8'))
    return output_file


data_files = os.listdir(data_dir)
for file in data_files:
    if not file.endswith('.json'):
        continue

    filepath = f'{data_dir}/{file}'
    output_file = f'{data_compressed_dir}/{file}.gz'
    output_file = compress_file(filepath, output_file)
    if output_file:
        print(f'Compressed {file} to {output_file}')
        os.remove(filepath)
    else:
        print(f'Failed to compress {file}')

print('Data compression completed')
@@ -1,137 +0,0 @@
from BunnyCDN.Storage import Storage
from PIL import Image
import os, uuid, cv2, config
import hashlib

def clean_empty_folders(directory):
    for foldername, subfolders, filenames in os.walk(directory, topdown=False):
        for subfolder in subfolders:
            folder_path = os.path.join(foldername, subfolder)
            if not os.listdir(folder_path):
                os.rmdir(folder_path)
                print(f"Removed empty folder: {folder_path}")

def calculate_file_hash(file_path, hash_func='sha256'):
    h = hashlib.new(hash_func)

    with open(file_path, 'rb') as file:
        chunk = 0
        while chunk != b'':
            chunk = file.read(8192)
            h.update(chunk)

    return h.hexdigest()

def extract_file_info(filename):
    try:
        username = filename.split("~")[0]
        timestamp = filename.split("~")[1]
        user_id = filename.split("~")[2]
        media_id, some2 = user_id.split("_")
        user_id = some2.split(".")[0]

        return username, media_id, user_id, timestamp
    except:
        return None, None, None, None

def extract_file_info2(filename):
    try:
        username = filename.split("~")[0]
        elements = filename.split("~")[1].split("_")

        media_id, user_id = elements[0], elements[1].split(".")[0]

        return username, media_id, user_id
    except:
        return None, None, None

def upload_file(filepath, username, media_id = None, media_type='image', post_type = 'story', user_id = None, date = None):
    filename = os.path.basename(filepath)
    file_extension = filename.split('.')[-1]

    dirtype = 'stories' if post_type == 'story' else 'posts'
    server_path = f'users/{dirtype}/{username}/{media_id if media_id else uuid.uuid4().hex}.{file_extension}'


    file_url = f"https://storysave.b-cdn.net/{server_path}"
    fileHash = calculate_file_hash(filepath)

    if media_type == 'image':
        with Image.open(filepath) as img:
            width, height = img.size
    else:
        width, height = get_video_dimensions(filepath)

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, user_id, hash, date) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, media_id, post_type, user_id, fileHash, date)
    newCursor.execute(query, values)
    newDB.commit()

    existing_files.append(media_id)

    if newCursor.rowcount == 0:
        print('What the fuck just happend?')

    obj_storage.PutFile(filepath, server_path)

    os.remove(filepath)
    print(f'[{newCursor.rowcount}]{os.path.basename(filepath)} {file_url}')


def get_video_dimensions(video_path):
    cap = cv2.VideoCapture(video_path)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap.release()
    return width, height


def get_media_type(filename):
    if filename.lower().endswith(".jpg") or filename.lower().endswith(".webp") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png") or filename.lower().endswith(".gif"):
        return 'image'
    if filename.lower().endswith(".mp4") or filename.lower().endswith(".mov"):
        return 'video'


def dump_instagram(folder_path):
    for root, dirs, files in os.walk(folder_path):
        for folder in dirs:
            username = folder
            folder_path = os.path.join(root, folder)

            for filename in os.listdir(folder_path):
                if "~" not in filename:
                    continue

                username, media_id, user_id, timestamp = extract_file_info(filename)

                if None in [username, media_id, user_id, timestamp]:
                    username, media_id, user_id = extract_file_info2(filename)
                    if None in [username, media_id, user_id]:
                        print(f"Failed to extract info from {filename}")
                        continue

                media_id = int(media_id) if media_id else None

                if media_id in existing_files:
                    print(f'Duplicate, {filename}')
                    os.remove(os.path.join(folder_path, filename))
                    continue

                filepath = os.path.join(folder_path, filename)
                mediatype = get_media_type(filename)
                upload_file(username=username, media_type=mediatype, filepath=filepath, media_id=media_id, user_id = user_id,)

if __name__ == '__main__':
    print('Starting processing...')

    newDB, newCursor = config.gen_connection()

    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    newCursor.execute("SELECT media_id FROM media")
    existing_files = [image[0] for image in newCursor.fetchall()]

    dump_instagram('StorySave/')

    print("Processing completed.")
@@ -1,96 +0,0 @@
from funcs import get_files
from PIL import Image
import imagehash
import config
import os

def generate_image_phash(filepath, hash_size=8):
    try:
        # Open the image using PIL
        pil_image = Image.open(filepath)

        # Compute pHash using the imagehash library
        phash = imagehash.phash(pil_image, hash_size=hash_size)
        return phash
    except Exception as e:
        print(f"Error processing image {filepath}: {e}")
        return None

def are_phashes_duplicates(phash1, phash2, threshold=5):
    try:
        # Compute the Hamming distance between the pHashes
        distance = phash1 - phash2
        return distance <= threshold
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False

def get_media_by_phash(phash, username, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]
        existing_username = media[2]

        # if username != existing_username:
        #     continue

        # Convert stored pHash string to ImageHash object
        existing_phash = imagehash.hex_to_hash(existing_phash_str)

        # Check if the current pHash is a duplicate
        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

def get_media_by_hash(hash, existing_medias):
    for media in existing_medias:
        existing_hash = media[1]
        if hash == existing_hash:
            return media
    return None

def get_media_by_id(media_id, existing_medias):
    for media in existing_medias:
        existing_media_id = media[1]
        if media_id == existing_media_id:
            return media
    return None

def get_data_by_filename(filename, data):
    for item in data:
        if filename in item['filepath']:
            return item
    return None

directory = 'media/check_if_exists' # Directory containing user images

# Database connection
db, cursor = config.gen_connection()

# Fetch existing media with pHashes (assuming media are images, adjust media_type if needed)
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['image'])
existing_medias = cursor.fetchall()

usernames = os.listdir(directory)

for username in usernames:
    files = get_files(os.path.join(directory, username))
    for filepath in files:
        image_filename = os.path.basename(filepath)
        print(f'Processing {image_filename}...')

        # Generate pHash for the image
        phash = generate_image_phash(filepath, hash_size=8)
        if phash is None:
            continue # Skip this image if there's an issue

        phash_str = str(phash)

        # Check if the image is a duplicate of any in the database
        duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
        if duplicate_media:
            print(f'Duplicate found: https://altpins.com/pin/{duplicate_media[0]}')
            print(f'Duplicate image path: {filepath}')
            newpath = os.path.join('duplicates', duplicate_media[2], image_filename)
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            os.rename(filepath, newpath)
            print(f'Moved {image_filename} to duplicates/')
@@ -1,68 +0,0 @@
from funcs import generate_phash # Assuming this function computes the pHash and returns a string
import imagehash
import os

def get_files(directory):
    # Recursively get all files in the directory
    file_list = []
    for root, dirs, files in os.walk(directory):
        for filename in files:
            file_list.append(os.path.join(root, filename))
    return file_list

# Function to compute pHashes for all images in a directory
def compute_phashes(image_paths):
    phash_dict = {}
    for image_path in image_paths:
        try:
            # Compute pHash and get it as a string
            phash_str = generate_phash(image_path)
            # Convert the hash string to an ImageHash object
            phash = imagehash.hex_to_hash(phash_str)
            phash_dict[image_path] = phash
        except Exception as e:
            print(f"Error processing {image_path}: {e}")
    return phash_dict

# Get all image files from 'ready_to_upload' and 'sorted' directories
ready_images = get_files('ready_to_upload')
ready_images = [image for image in ready_images if not image.lower().endswith('.mp4')]

sorted_images = get_files('sorted')
sorted_images = [image for image in sorted_images if not image.lower().endswith('.mp4')]

# Compute pHashes for images in 'ready_to_upload'
print("Computing pHashes for 'ready_to_upload' images...")
ready_image_phashes = compute_phashes(ready_images)

# Compute pHashes for images in 'sorted'
print("Computing pHashes for 'sorted' images...")
sorted_image_phashes = compute_phashes(sorted_images)

# Prepare the 'already_processed' directory
os.makedirs('already_processed', exist_ok=True)

# Set a Hamming distance threshold for considering images as duplicates
threshold = 5 # Adjust this value as needed

# Find and move duplicates
for sorted_image, sorted_phash in sorted_image_phashes.items():
    duplicate_found = False
    for ready_image, ready_phash in ready_image_phashes.items():
        # Compute Hamming distance between the two pHashes
        try:
            distance = sorted_phash - ready_phash
        except TypeError as e:
            print(f"Error comparing hashes for {sorted_image} and {ready_image}: {e}")
            continue

        if distance <= threshold:
            # Duplicate found
            newpath = sorted_image.replace('sorted', 'already_processed')
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            print(f"Moving {sorted_image} (duplicate of {ready_image}) to 'already_processed'")
            os.rename(sorted_image, newpath)
            duplicate_found = True
            break # Exit the loop since a duplicate is found
    if not duplicate_found:
        print(f"No duplicate found for {sorted_image}")
@@ -1,59 +0,0 @@
import config

# Function to find the closest perceptual hash (phash) match
def find_almost_identical_phash(phash, usernames, max_distance=1):
    """
    Find a username whose phash is nearly identical to the given phash.
    :param phash: The phash to compare (e.g., from the 'unknown' image).
    :param usernames: List of tuples containing (username, phash).
    :param max_distance: Maximum Hamming distance to consider as "identical".
    :return: The matching username and phash, or None if no match is found.
    """
    for username in usernames:
        dist = hamming_distance(phash, username[1])
        if dist <= max_distance:
            return username
    return None

def hamming_distance(phash1, phash2):
    """
    Calculate the Hamming distance between two binary strings.
    """
    if len(phash1) != len(phash2):
        raise ValueError("Hashes must be of the same length")
    return sum(c1 != c2 for c1, c2 in zip(phash1, phash2))


# Establish database connection
db, cursor = config.gen_connection()

# Fetch all images with an 'unknown' username
cursor.execute("SELECT id, username, phash FROM media WHERE username = 'unknown'")
rows = cursor.fetchall()

# Fetch all non-unknown usernames and their associated phash
cursor.execute("SELECT username, phash FROM media WHERE username != 'unknown' AND phash IS NOT NULL AND status = 'public'")
usernames = cursor.fetchall()

# Ensure there are valid usernames to compare against
if not usernames:
    print("No known usernames found in the database.")
    exit()

# Adjusted section in your script
for row in rows:
    id = row[0]
    phash = row[2]

    # Find a nearly identical phash match
    closest = find_almost_identical_phash(phash, usernames, max_distance=2)

    if closest:
        print(f"Found match for image {id}: {closest[0]} with phash {closest[1]}")
        cursor.execute(
            "UPDATE media SET username = %s WHERE id = %s",
            (closest[0], id),
        )
        db.commit()
    else:
        print(f"No nearly identical match found for image {id}.")
@@ -1,90 +0,0 @@
from funcs import get_files # Assuming this is defined elsewhere
from PIL import Image
import imagehash
import config
import os

def generate_image_phash(filepath, hash_size=8):
    try:
        # Open the image using PIL
        pil_image = Image.open(filepath)

        # Compute pHash using the imagehash library
        phash = imagehash.phash(pil_image, hash_size=hash_size)
        return phash
    except Exception as e:
        print(f"Error processing image {filepath}: {e}")
        return None

def are_phashes_duplicates(phash1, phash2, threshold=5):
    try:
        # Compute the Hamming distance between the pHashes
        distance = phash1 - phash2
        return distance <= threshold
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False

def get_media_by_phash(phash, username, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]

        # existing_username = media[2]
        # if existing_username != username:
        #     continue # Only compare with the same user's media

        # Convert stored pHash string to ImageHash object
        existing_phash = imagehash.hex_to_hash(existing_phash_str)

        # Check if the current pHash is a duplicate
        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

# Database connection
db, cursor = config.gen_connection()

directory = 'check_if_exists' # Directory containing user images

# Fetch existing media with pHashes (assuming media are images, adjust media_type if needed)
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['image'])
existing_medias = cursor.fetchall()

existing_phashes = [media[1] for media in existing_medias]

# Go through the directory folder where each subfolder is a username
users = os.listdir(directory)

for username in users:
    user_images_path = os.path.join(directory, username)
    if not os.path.isdir(user_images_path):
        continue # Skip non-directory files

    # Get all images for the current user
    images = get_files(user_images_path) # Assuming this gets all image files

    for filepath in images:
        image_filename = os.path.basename(filepath)
        print(f'Processing {image_filename}...')

        # Generate pHash for the image
        phash = generate_image_phash(filepath, hash_size=8)
        if phash is None:
            continue # Skip this image if there's an issue

        phash_str = str(phash)

        if phash_str not in existing_phashes:
            print(f'No duplicate found for {image_filename}')
            continue

        # Check if the image is a duplicate of any in the database
        duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
        if duplicate_media:
            found_username = duplicate_media[2]
            print(f'Duplicate found: https://altpins.com/pin/{duplicate_media[0]}')
            print(f'Duplicate image path: {filepath}')
            newpath = os.path.join('duplicates', found_username, image_filename)
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            os.rename(filepath, newpath)
            print(f'Moved {image_filename} to duplicates/')
@@ -1,87 +0,0 @@
from PIL import Image
import imagehash
import config
import cv2
import os

def generate_thumbnail_phash(filepath, hash_size=8): # Set hash_size to 8
    cap = cv2.VideoCapture(filepath)
    ret, frame = cap.read()
    cap.release()

    if not ret:
        print(f"Error reading frame from {filepath}")
        return None

    # Resize frame to a standard size
    standard_size = (320, 240)
    resized_frame = cv2.resize(frame, standard_size, interpolation=cv2.INTER_AREA)

    # Convert OpenCV image (BGR) to PIL Image (RGB)
    image_rgb = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(image_rgb)

    # Compute pHash
    phash = imagehash.phash(pil_image, hash_size=hash_size)

    return phash

def are_phashes_duplicates(phash1, phash2, threshold=5):
    # Compute Hamming distance between the pHashes
    try:
        distance = phash1 - phash2
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False

    return distance <= threshold

def get_media_by_phash(phash, username, existing_medias, threshold=5):
    for media in existing_medias:
        existing_phash_str = media[1]
        existing_username = media[2]
        if existing_username != username:
            continue

        # Convert stored phash string to ImageHash object
        existing_phash = imagehash.hex_to_hash(existing_phash_str)

        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

# Database connection
db, cursor = config.gen_connection()

# Directory containing user videos
directory = 'check_if_exists'

# Fetch existing videos with pHashes
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL", ['video'])
existing_medias = cursor.fetchall()

users = os.listdir(directory) # Assuming 'check_if_exists' contains user videos
for username in users:
    user_videos_path = os.path.join(directory, username)
    if not os.path.isdir(user_videos_path):
        continue

    videos = [video for video in os.listdir(user_videos_path) if video.endswith(('.mp4', '.avi', '.mov'))]
    for video in videos:
        print(f'Processing {video}...')
        filepath = os.path.join(user_videos_path, video)

        phash = generate_thumbnail_phash(filepath, hash_size=8) # Use hash_size=8
        if phash is None:
            continue

        phash_str = str(phash)

        duplicate_media = get_media_by_phash(phash, username, existing_medias, threshold=5)
        if duplicate_media:
            print(f'Duplicate url found: https://altpins.com/pin/{duplicate_media[0]}')
            print(f'Duplicate video path: {filepath}')
            newpath = filepath.replace(directory, 'duplicates')
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            os.rename(filepath, newpath)
            print(f'Moved {video} to duplicates/')
@@ -1,58 +0,0 @@
from funcs import generate_phash
import os

def find_duplicates(source_dir, target_dir, extensions, max_distance):
    """Remove duplicates in target_dir that are found in source_dir based on Hamming distance."""
    source_files = {}
    target_files = {}

    # Helper function to filter files by extension
    def filter_files(files):
        return [f for f in files if os.path.splitext(f)[1].lower() in extensions]

    # Build hash map of source directory
    for dirpath, _, filenames in os.walk(source_dir):
        for filename in filter_files(filenames):
            filepath = os.path.join(dirpath, filename)
            filehash = generate_phash(filepath, str=False)
            if filehash:
                source_files[filehash] = filepath

    # Build hash map of target directory and compare
    for dirpath, _, filenames in os.walk(target_dir):
        for filename in filter_files(filenames):
            filepath = os.path.join(dirpath, filename)
            filehash = generate_phash(filepath, str=False)
            if not filehash:
                continue

            # Check if this file is similar to any of the source files
            is_duplicate = False
            for source_hash in source_files.keys():
                distance = filehash - source_hash # Hamming distance
                if distance <= max_distance:
                    is_duplicate = True
                    break # Found a duplicate

            if is_duplicate:
                newpath = os.path.join('duplicates', filename)
                os.makedirs(os.path.dirname(newpath), exist_ok=True)
                os.rename(filepath, newpath)
                print(f"Moved duplicate: {filepath} to duplicates/ (distance: {distance})")
            else:
                target_files[filehash] = filepath

if __name__ == '__main__':
    # Paths to the directories
    source_dir = 'D:/Crawlers/media/Coomer/sadierayxo'
    target_dir = 'sorted/sadierayxo'

    # List of accepted extensions
    extensions = {'.png', '.jpg', '.jpeg', '.webp', '.gif'}

    # Maximum Hamming distance to consider as duplicates
    MAX_DISTANCE = 5 # Adjust this threshold as needed

    find_duplicates(source_dir, target_dir, extensions, MAX_DISTANCE)

    print("Duplicate removal process completed.")
@@ -1,110 +0,0 @@
from BunnyCDN.Storage import Storage
from PIL import Image
import os, uuid, cv2, config

def scan_dupes(folder_path):
    for root, dirs, files in os.walk(folder_path):
        for folder in dirs:
            folder_path = os.path.join(root, folder)
            for filename in os.listdir(folder_path):
                media_id = filename.replace('.mp4', '').replace('.jpg', '')
                filepath = os.path.join(folder_path, filename)
                if media_id:
                    try:
                        if int(media_id) in existing_files:
                            print(f'Duplicate')
                            os.remove(filepath)
                    except:
                        print(f'Error: {filepath}')

def clean_empty_folders(directory):
    for foldername, subfolders, filenames in os.walk(directory, topdown=False):
        for subfolder in subfolders:
            folder_path = os.path.join(foldername, subfolder)
            if not os.listdir(folder_path):
                os.rmdir(folder_path)
                print(f"Removed empty folder: {folder_path}")

def upload_file(filepath, username, media_id = None, media_type='image', post_type = 'story'):
    filename = os.path.basename(filepath)
    file_extension = filename.split('.')[-1]

    try:
        if int(media_id) in existing_files:
            print(f'Duplicate')
            os.remove(filepath)
            return True
    except: media_id = uuid.uuid4().hex

    dirtype = 'stories' if post_type == 'story' else 'posts'
    server_path = f'users/{dirtype}/{username}/{media_id}.{file_extension}'

    obj_storage.PutFile(filepath, server_path)

    file_url = f"https://storysave.b-cdn.net/{server_path}"

    if media_type == 'image':
        with Image.open(filepath) as img:
            width, height = img.size
    else:
        width, height = get_video_dimensions(filepath)

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type) VALUES (%s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, media_id, post_type)
    newCursor.execute(query, values)
    newDB.commit()

    os.remove(filepath)
    print(f'[{newCursor.rowcount}]{os.path.basename(filepath)} {file_url}')


def get_video_dimensions(video_path):
    cap = cv2.VideoCapture(video_path)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap.release()
    return width, height


def get_media_type(filename):
    if filename.lower().endswith(".jpg") or filename.lower().endswith(".webp") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png") or filename.lower().endswith(".gif"):
        return 'image'
    if filename.lower().endswith(".mp4") or filename.lower().endswith(".mov"):
        return 'video'


def dump_instagram(folder_path):
    for root, dirs, files in os.walk(folder_path):
        for folder in dirs:
            username = folder
            folder_path = os.path.join(root, folder)

            post_type = 'story' if folder_path.split('\\')[0] == 'stories' else 'post'

            for filename in os.listdir(folder_path):
                media_id = filename.replace('.mp4', '').replace('.jpg', '')
                filepath = os.path.join(folder_path, filename)
                mediatype = get_media_type(filename)
                upload_file(username=username, media_type=mediatype, filepath=filepath, media_id=media_id, post_type=post_type)

if __name__ == '__main__':
    print('Starting processing...')

    newDB, newCursor = config.gen_connection()

    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    newCursor.execute("SELECT media_id FROM media")
    existing_files = [image[0] for image in newCursor.fetchall()]

    dump_instagram('media/posts')
    dump_instagram('media/stories')

    scan_dupes('media/posts')
    scan_dupes('media/stories')

    clean_empty_folders('media/posts')
    clean_empty_folders('media/stories')


    print("Processing completed.")
@@ -1,110 +0,0 @@
from BunnyCDN.Storage import Storage
import os, uuid, config, funcs, cv2
from datetime import datetime
from PIL import Image

def dump_facebook(folder_path):
    for filename in os.listdir(folder_path):
        if os.path.isdir(os.path.join(folder_path, filename)):
            continue

        username = filename.split("'")[0]

        filepath = os.path.join(folder_path, filename)

        mediatype = funcs.get_media_type(filename)
        post_type = funcs.determine_post_type(filepath, mediatype)

        upload_file(username=username, media_type=mediatype, filepath=filepath, post_type=post_type)

    for folder in os.listdir(folder_path):
        if os.path.isdir(os.path.join(folder_path, folder)):
            username = folder

            for filename in os.listdir(os.path.join(folder_path, folder)):
                filepath = os.path.join(folder_path, folder, filename)

                mediatype = funcs.get_media_type(filename)
                post_type = funcs.determine_post_type(filepath, mediatype)

                upload_file(username=username, media_type=mediatype, filepath=filepath, post_type=post_type)

def upload_file(filepath, username, media_type='image', post_type='story', timestamp=None, user_id=None):
    filename = os.path.basename(filepath)
    file_extension = os.path.splitext(filename)[1].lower()

    file_hash = funcs.calculate_file_hash(filepath)

    if file_hash in existing_files:
        print('Duplicate file detected. Removing...')
        os.remove(filepath)
        return False

    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0

    if "FB_IMG" in filename: media_id = filename.split("_")[2].split(".")[0]
    else: media_id = uuid.uuid4().hex

    dirtype = funcs.determine_post_type(filepath, media_type)
    server_path = os.path.join('media', dirtype, username, f'{media_id}{file_extension}')

    obj_storage.PutFile(filepath, server_path)

    file_url = f"https://storysave.b-cdn.net/{server_path}"

    if media_type == 'image':
        with Image.open(filepath) as img:
            width, height = img.size
    else:
        width, height = funcs.get_video_dimensions(filepath)

    thumbnail_url = None
    if media_type == 'video':
        thumbPath = f'temp/{media_id}.jpg'
        cap = cv2.VideoCapture(filepath)
        ret, frame = cap.read()
        cv2.imwrite(thumbPath, frame)
        cap.release()
        obj_storage.PutFile(thumbPath, f'thumbnails/{media_id}.jpg')
        thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{media_id}.jpg"

    post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()

    if post_type == 'stories':
        post_type = 'story'
    else:
        post_type = 'post'

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, user_id, platform, hash, filename, duration, thumbnail) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, post_type, post_date, user_id, 'facebook', file_hash, filename, duration, thumbnail_url)

    try:
        newCursor.execute(query, values)
        newDB.commit()
        print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
    except Exception as e:
        print(f"Database error: {e}")
        return False

    try:
        if newCursor.rowcount > 0:
            os.remove(filepath)
    except Exception as e:
        print(f"Failed to remove local file {filepath}: {e}")

    return True


if __name__ == '__main__':
    print('Starting processing...')

    newDB, newCursor = config.gen_connection()

    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    newCursor.execute("SELECT hash FROM media WHERE platform='facebook' AND hash IS NOT NULL")
    existing_files = [image[0] for image in newCursor.fetchall()]

    dump_facebook('facebook/')

    print("Processing completed.")
@@ -1,82 +0,0 @@
from BunnyCDN.Storage import Storage
from datetime import datetime
import os, config, funcs
from PIL import Image

def dump_instagram(folder_path):
    for filename in os.listdir(folder_path):
        parts = filename.split('_')

        try:
            username = '_'.join(parts[:-2]) # Join all except last two
            timestamp = int(parts[-2]) # Second last is timestamp
            user_id = int(parts[-1].split('.')[0]) # Last part before extension is user_id
        except Exception as e:
            print(f"Invalid filename: {filename}. Error: {e}")
            continue

        filepath = os.path.join(folder_path, filename)

        mediatype = funcs.get_media_type(filename)
        post_type = funcs.determine_post_type(filepath, mediatype)

        UploadMedia(username=username, media_type=mediatype, filepath=filepath, post_type=post_type, timestamp=timestamp, user_id=user_id)


def UploadMedia(filepath, username, media_type='image', post_type='story', timestamp=None, user_id=None):
    if 'tero' in username:
        pass

    filename = os.path.basename(filepath)
    file_extension = os.path.splitext(filename)[1].lower()

    file_hash = funcs.calculate_file_hash(filepath)

    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0

    post_date = datetime.fromtimestamp(int(timestamp)) if timestamp else datetime.now()

    dirtype = funcs.determine_post_type(filepath, media_type)

    server_path = f'media/{dirtype}/{username}/{file_hash}{file_extension}'

    file_url = f"https://storysave.b-cdn.net/{server_path}"

    if file_hash in existing_files:
        print('Duplicate file detected. Removing...')
        os.remove(filepath)
        return True

    obj_storage.PutFile(filepath, server_path)

    if media_type == 'image':
        with Image.open(filepath) as img:
            width, height = img.size
    else:
        width, height = funcs.get_video_dimensions(filepath)

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, user_id, hash, filename, duration) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, post_type, post_date, user_id, file_hash, filename, duration)

    newCursor.execute(query, values)
    newDB.commit()
    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')

    os.remove(filepath)

    return True


if __name__ == '__main__':
    print('Starting processing...')

    newDB, newCursor = config.gen_connection()

    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    newCursor.execute("SELECT hash FROM media WHERE platform='instagram' AND hash IS NOT NULL")
    existing_files = [image[0] for image in newCursor.fetchall()]

    dump_instagram('storysaver/missingdata/')

    print("Processing completed.")
@@ -1,67 +0,0 @@
from BunnyCDN.Storage import Storage
import os, uuid, config, funcs
from datetime import datetime
from PIL import Image

def dump_facebook(folder_path):
    for folder in os.listdir(folder_path):
        if os.path.isdir(os.path.join(folder_path, folder)):
            username = folder

            for filename in os.listdir(os.path.join(folder_path, folder)):
                filepath = os.path.join(folder_path, folder, filename)

                upload_file(username=username, filepath=filepath)

def upload_file(filepath, username):
    filename = os.path.basename(filepath)
    media_id = filename.split('.')[0]

    file_extension = os.path.splitext(filename)[1].lower()

    media_type = funcs.get_media_type(filename)

    file_hash = funcs.calculate_file_hash(filepath)

    duration = funcs.get_video_duration(filepath) if media_type == 'video' else 0

    width, height = funcs.get_video_dimensions(filepath) if media_type == 'video' else Image.open(filepath).size


    dirtype = funcs.determine_post_type(filepath, media_type)
    server_path = os.path.join('media', dirtype, username, f'{media_id}{file_extension}')

    obj_storage.PutFile(filepath, server_path)

    file_url = f"https://storysave.b-cdn.net/{server_path}"

    if file_hash in existing_files:
        print('Duplicate file detected. Removing...')
        os.remove(filepath)
        return False

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, platform, hash, filename, duration, media_id) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, 'tiktok', file_hash, filename, duration, media_id)

    newCursor.execute(query, values)
    newDB.commit()
    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')

    if newCursor.rowcount > 0:
        os.remove(filepath)

    return True

if __name__ == '__main__':
    print('Starting processing...')

    newDB, newCursor = config.gen_connection()

    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    newCursor.execute("SELECT hash FROM media WHERE platform='tiktok' AND hash IS NOT NULL")
    existing_files = [image[0] for image in newCursor.fetchall()]

    dump_facebook('tiktok/')

    print("Processing completed.")
@@ -1,32 +0,0 @@
import os, funcs
from funcs import generate_phash

def get_username(image, ready_images):
    for ready_image in ready_images:
        if os.path.basename(image) in ready_image:
            ready_image = ready_image.replace('\\', '/')
            return ready_image.split('/')[1]
    return None

ready_images = funcs.get_files('ready_to_upload')
ready_images = [image for image in ready_images if not image.endswith('.mp4')]

sorted_images = funcs.get_files('sorted')
sorted_images = [image for image in sorted_images if not image.endswith('.mp4')]

os.makedirs('already_processed', exist_ok=True)

for image in sorted_images:
    image = image.replace('\\', '/')
    username = image.split('/')[1]
    filename = os.path.basename(image)

    for ready_image in ready_images:
        if filename in ready_image:
            username = get_username(image, ready_images)
            newpath = ready_image.replace('ready_to_upload', 'already_processed')
            os.makedirs(os.path.dirname(newpath), exist_ok=True)
            print(f'Moving {image} which is a match for {ready_image} to already_processed')
            os.rename(image, newpath)
            print(f'Moved {ready_image} to already_processed')
            break
@@ -1,56 +0,0 @@
from BunnyCDN.Storage import Storage
import os, config, requests
from moviepy.editor import VideoFileClip

def get_media_type(filename):
    image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".webp"}
    video_extensions = {".mp4", ".mov"}
    extension = os.path.splitext(filename.lower())[1]
    if extension in image_extensions:
        return 'image'
    elif extension in video_extensions:
        return 'video'
    else:
        return 'unknown'

def determine_post_type(media_type):
    # Assuming the post type is directly based on media type.
    return media_type

def get_video_dimensions(filepath):
    with VideoFileClip(filepath) as clip:
        width, height = clip.size
    return width, height

def download_file(url):
    local_filename = url.split('/')[-1]
    # Note: Stream=True to avoid loading the whole file into memory
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename

if __name__ == '__main__':
    newDB, newCursor = config.gen_connection()
    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    posts = open('fucked', 'r')

    for item in posts:
        username, url = item.strip().split('~')
        media_id = url.split('/')[-1].split('.')[0]
        media_type = get_media_type(url)

        query = "INSERT IGNORE INTO media (username, media_type, platform, media_url) VALUES (%s, %s, %s, %s)"
        values = (username, media_type, 'facebook', url)

        try:
            newCursor.execute(query, values)
            newDB.commit()
            print(f'[{newCursor.rowcount}] records updated.{url}')
        except Exception as e:
            print(f"Database error: {e}")

    posts.close()
@@ -1,40 +0,0 @@
import config, os, json
from PIL import Image
import imagehash

def find_file(filename, directory):
    filename = filename.lower().split('.')[0]
    for root, dirs, files in os.walk(directory):
        for file in files:
            if filename in file:
                return os.path.join(root, file)
    return None

def generate_phash(image_path):
    image = Image.open(image_path)
    return str(imagehash.phash(image))

count = 0

cacheDir = 'sorted'
dataPath = 'pins.json'

os.makedirs(cacheDir, exist_ok=True)

medias = json.load(open(dataPath))

for item in medias:
    count += 1

    filepath = item['filepath']
    if os.path.exists(filepath):
        continue

    newfilepath = find_file(os.path.basename(filepath), cacheDir)
    if newfilepath:
        print(f"Found file {newfilepath} for {filepath}")
        item['filepath'] = newfilepath


with open(dataPath, 'w') as f:
    json.dump(medias, f)
@@ -1,28 +0,0 @@
import os, json
from funcs import generate_phash

count = 0
cacheDir = '_sort'
dataPath = 'pins.json'

os.makedirs(cacheDir, exist_ok=True)

medias = json.load(open(dataPath))

for item in medias:
    count += 1
    if item['type'] == 'image':
        filepath = item['filepath']
        if 'phash' in item:
            print(f"Skipping {count}/{len(medias)}: already processed.")
            continue

        if not os.path.exists(filepath):
            print(f"File {filepath} does not exist, skipping.")
            continue
        phash = generate_phash(filepath)
        item['phash'] = phash
        print(f"Processed {count}/{len(medias)}: with pHash {phash}")

with open(dataPath, 'w') as f:
    json.dump(medias, f)
@@ -1,19 +0,0 @@
import config, storysave_api


db, cursor = config.gen_connection()

usernames = []
with open('usernames.txt', 'r') as f:
    for line in f:
        usernames.append(line.strip())

for username in usernames:
    print(f"Username: {username}")

    user_id = storysave_api.get_user_id(username)

    # Update the user_id in the database
    cursor.execute("UPDATE media SET user_id = %s WHERE username = %s AND user_id IS NULL;", [user_id, username])
    db.commit()
    print(f"[{cursor.rowcount}] Updated user_id for {username}")
@@ -1,94 +0,0 @@
from BunnyCDN.Storage import Storage
from moviepy.editor import VideoFileClip
import config
import hashlib
import requests
import os

def file_hash_from_url(url, hash_algo='sha256'):
    h = hashlib.new(hash_algo)

    response = requests.get(url, stream=True)

    if response.status_code == 200:
        for chunk in response.iter_content(8192):
            h.update(chunk)
        return h.hexdigest()
    else:
        raise Exception(f"Failed to download file: Status code {response.status_code}")

def get_video_duration(file_path):
    """
    Returns the duration of the video file in seconds.

    :param file_path: Path to the video file
    :return: Duration in seconds
    """
    try:
        with VideoFileClip(file_path) as video:
            return video.duration
    except:
        return 0

def file_hash(filename, hash_algo='sha256'):
    """
    Compute the hash of a file.

    :param filename: Path to the file.
    :param hash_algo: Hashing algorithm to use (e.g., 'sha256', 'md5').
    :return: Hexadecimal hash string.
    """
    # Create a hash object
    h = hashlib.new(hash_algo)

    # Open the file in binary mode and read in chunks
    with open(filename, 'rb') as file:
        while chunk := file.read(8192):
            h.update(chunk)

    # Return the hexadecimal digest of the hash
    return h.hexdigest()

# the hash of the images are different due to optimizer

#obj_storage = Storage('577cb82d-8176-4ccf-935ce0a574bf-fe4c-4012', 'altpins')
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

db, cursor = config.gen_connection()

cursor.execute("SELECT id, media_id, media_url FROM media WHERE duration = 0 AND media_type = 'video' AND status != 'deleted';")
results = cursor.fetchall()

count = 0
print(f"Found {len(results)} files to process.")

cacheDir = 'cache'
for result in results:
    count += 1
    videoID = result[0]
    mediaID = result[1]
    mediaURL = result[2]
    extension = mediaURL.split('.')[-1]

    serverPath = result[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')

    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if os.path.exists(localFilePath):
        print(f"File already exists: {localFilePath}")
    else:
        obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)

    duration = get_video_duration(localFilePath)

    if duration == 0:
        print(f"Failed to get duration for {localFilePath}")
        continue

    if duration < 1:
        duration = 1

    cursor.execute("UPDATE media SET duration = %s WHERE id = %s;", (duration, result[0]))
    db.commit()

    print(f"[{count}/{len(results)}] {result[1]}: {duration}, {cursor.rowcount}")
@@ -1,47 +0,0 @@
from BunnyCDN.Storage import Storage
import config
import hashlib
import os

def file_hash(filename, hash_algo='sha256'):
    """
    Compute the hash of a file.

    :param filename: Path to the file.
    :param hash_algo: Hashing algorithm to use (e.g., 'sha256', 'md5').
    :return: Hexadecimal hash string.
    """
    h = hashlib.new(hash_algo)

    with open(filename, 'rb') as file:
        while chunk := file.read(8192):
            h.update(chunk)

    return h.hexdigest()


#obj_storage = Storage('577cb82d-8176-4ccf-935ce0a574bf-fe4c-4012', 'altpins')
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

db, cursor = config.gen_connection()

cursor.execute("SELECT id, media_id, media_url FROM media WHERE hash IS NULL;")
results = cursor.fetchall()

count = 0
print(f"Found {len(results)} files to process.")

for result in results:
    count += 1
    serverPath = result[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')

    localFilePath = os.path.join(os.getcwd(), 'temp', os.path.basename(serverPath))
    if not os.path.exists(localFilePath):
        obj_storage.DownloadFile(storage_path=serverPath, download_path=os.path.join(os.getcwd(), 'temp'))

    filehash = file_hash(localFilePath)

    cursor.execute("UPDATE media SET hash = %s WHERE id = %s;", (filehash, result[0]))
    db.commit()

    print(f"[{count}/{len(results)}] {result[1]}: {filehash}, {cursor.rowcount}")
@@ -1,47 +0,0 @@
from BunnyCDN.Storage import Storage
import config, os, funcs
from PIL import Image

# the hash of the images are different due to optimizer

#obj_storage = Storage('577cb82d-8176-4ccf-935ce0a574bf-fe4c-4012', 'altpins')
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

db, cursor = config.gen_connection()

cursor.execute("SELECT id, media_id, media_url FROM media WHERE width = 0;")
results = cursor.fetchall()

count = 0
print(f"Found {len(results)} files to process.")

cacheDir = 'cache'
for result in results:
    count += 1
    videoID = result[0]
    mediaID = result[1]
    mediaURL = result[2]
    extension = mediaURL.split('.')[-1]

    serverPath = result[2].replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')

    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if os.path.exists(localFilePath):
        print(f"File already exists: {localFilePath}")
    else:
        obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)

    mediaType = funcs.get_media_type(localFilePath)

    if mediaType == 'image':
        with Image.open(localFilePath) as img:
            width, height = img.size
    elif mediaType == 'video':
        width, height = funcs.get_video_dimensions(localFilePath)


    cursor.execute("UPDATE media SET width = %s, height=%s WHERE id = %s;", (width, height, videoID))
    db.commit()

    print(f"[{count}/{len(results)}] width: {width}, height: {height} {cursor.rowcount}")
@@ -1,32 +0,0 @@
import config
import os

temp_directory = "cache"
os.makedirs(temp_directory, exist_ok=True)

obj_storage = config.get_storage()
db, cursor = config.gen_connection()

cursor.execute("SELECT id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()

count = 0
print(f"Found {len(results)} files to process.")

for result in results:
    count += 1

    id, media_url = result

    serverPath = media_url.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(os.getcwd(), temp_directory, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        continue

    file_size = os.path.getsize(localFilePath)

    cursor.execute("UPDATE media SET file_size = %s WHERE id = %s;", (file_size, id))
    db.commit()

    print(f"[{count}/{len(results)}] {media_url}: {file_size}, {cursor.rowcount}")
@@ -1,36 +0,0 @@
import config
from funcs import generate_phash

count = 0

storage = config.get_storage()

db, cursor = config.gen_connection()

generate_for = 'media_url'
media_type = 'image'

cursor.execute(f"SELECT id, {generate_for} FROM media WHERE media_type = %s AND phash IS NULL;", [media_type])
medias = cursor.fetchall()

for item in medias:
    count += 1

    itemID = item[0]
    media_url = item[1]

    server_path = media_url.replace('https://storysave.b-cdn.net/', '').replace('\\', '/')
    filepath = storage.DownloadFile(server_path, 'temp')
    if not filepath:
        print(f"Error downloading {server_path}")
        continue

    phash = generate_phash(filepath)
    if not phash:
        print(f"Error generating pHash for {filepath}")
        continue

    cursor.execute("UPDATE media SET phash = %s WHERE id = %s", [phash, itemID])
    db.commit()

    print(f"[{cursor.rowcount}] Processed {count}/{len(medias)}: with pHash {phash}")
@@ -1,39 +0,0 @@
import config, os
from funcs import generate_phash

db, cursor = config.gen_connection()

cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'image' AND phash = 0;")
results = cursor.fetchall()

count = 0
cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)
print(f"Found {len(results)} files to process.")


for result in results:
    count += 1
    itemID = result[0]
    mediaID = result[1]
    if not mediaID:
        print(f"Media ID is null, skipping.")
        continue
    mediaURL = result[2]

    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        print(f"File {localFilePath} does not exist, skipping.")
        continue

    phash = generate_phash(localFilePath)
    if not phash:
        print(f"Error generating pHash for {localFilePath}, skipping.")
        continue

    cursor.execute("UPDATE media SET phash = %s WHERE id = %s", (phash, itemID))
    db.commit()

    print(f"Processed {count}/{len(results)}: {mediaID} with pHash {phash}")
@ -1,74 +0,0 @@
import config, os, threading, queue
from funcs import generate_phash

# Initialize database connection
db, cursor = config.gen_connection()

# Query the media table for unprocessed images
cursor.execute("SELECT id, media_id, media_url FROM media WHERE media_type = 'image' AND phash = '0';")
results = cursor.fetchall()

# Setup cache directory
cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)

print(f"Found {len(results)} files to process.")

# Thread-safe queue for processed media
processed_media_queue = queue.Queue()

def process_media():
    """Thread function to update database with processed pHash values."""
    while True:
        try:
            item = processed_media_queue.get(timeout=10)  # Timeout prevents infinite blocking
            if item is None:  # Sentinel value to exit the loop
                break

            itemID, phash = item
            cursor.execute("UPDATE media SET phash = %s WHERE id = %s", (phash, itemID))
            db.commit()
            print(f"Updated database for ID {itemID} with pHash {phash}.")
        except queue.Empty:
            continue

# Start the database update thread
update_thread = threading.Thread(target=process_media, daemon=True)
update_thread.start()

# Main processing loop for generating pHash
count = 0

for result in results:
    count += 1
    itemID = result[0]
    mediaID = result[1]

    if not mediaID:
        print(f"Media ID is null, skipping.")
        continue

    mediaURL = result[2]
    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        print(f"File {localFilePath} does not exist, skipping.")
        continue

    phash = generate_phash(localFilePath)
    if not phash:
        print(f"Error generating pHash for {localFilePath}, skipping.")
        continue

    # Add the processed media to the queue
    processed_media_queue.put((itemID, phash))
    print(f"Processed {count}/{len(results)}: {mediaID} with pHash {phash}")

# Signal the update thread to stop
processed_media_queue.put(None)

# Wait for the update thread to finish
update_thread.join()

print("Processing completed.")
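Note that the worker thread above reuses the same `db`/`cursor` pair created in the main thread; most MySQL client libraries do not make a single connection safe to share across threads. A sketch of a safer variant, assuming `config.gen_connection()` can simply be called again to obtain a second connection for the updater thread:

import threading, queue
import config

processed_media_queue = queue.Queue()

def process_media():
    # A dedicated connection for this thread avoids sharing one
    # connection object between the main and worker threads.
    thread_db, thread_cursor = config.gen_connection()
    while True:
        item = processed_media_queue.get()
        if item is None:  # sentinel: no more work
            break
        item_id, phash = item
        thread_cursor.execute("UPDATE media SET phash = %s WHERE id = %s", (phash, item_id))
        thread_db.commit()
    thread_cursor.close()
    thread_db.close()

update_thread = threading.Thread(target=process_media, daemon=True)
update_thread.start()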
@ -1,43 +0,0 @@
import os
import json
import config

# Establish database connection
db, cursor = config.gen_connection()

# Fetch rows with file_size = 0
cursor.execute("SELECT id, media_id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()

cacheDir = 'cache'
os.makedirs(cacheDir, exist_ok=True)
print(f"Found {len(results)} files to process.")

update_data = []
for result in results:
    itemID = result[0]
    media_id = result[1]

    if not media_id:
        print(f"Media ID is null for ID {itemID}, skipping.")
        continue

    mediaURL = result[2]
    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        print(f"File {localFilePath} does not exist for ID {itemID}, skipping.")
        continue

    file_size = os.path.getsize(localFilePath)
    update_data.append({"id": itemID, "file_size": file_size})

# Save the results to a JSON file
output_file = "update_data.json"
with open(output_file, 'w') as f:
    json.dump(update_data, f, indent=4)

print(f"Saved {len(update_data)} updates to {output_file}.")
cursor.close()
db.close()
File diff suppressed because it is too large
@ -1,29 +0,0 @@
import json
import config

# Establish database connection
db, cursor = config.gen_connection()

# Load update data from the JSON file
input_file = "update_data.json"
with open(input_file, 'r') as f:
    update_data = json.load(f)

print(f"Loaded {len(update_data)} records to update.")

# Process each record one by one
for count, item in enumerate(update_data, start=1):
    item_id = item["id"]
    file_size = item["file_size"]

    try:
        cursor.execute("UPDATE media SET file_size = %s WHERE id = %s", (file_size, item_id))
        db.commit()
        print(f"Processed {count}/{len(update_data)}: ID {item_id} updated with file size {file_size}.")
    except Exception as e:
        print(f"Error updating ID {item_id}: {e}")
        db.rollback()

print("All updates completed.")
cursor.close()
db.close()
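Committing after every row is the slow path here. If the per-row round-trips become a bottleneck, a batched variant along these lines (a sketch, assuming the same `config.gen_connection()` cursor and `%s` parameter style used above) applies the whole JSON file in one transaction:

import json
import config

db, cursor = config.gen_connection()

with open("update_data.json", "r") as f:
    update_data = json.load(f)

# executemany sends the statement once with all parameter tuples,
# and a single commit keeps the whole update atomic.
params = [(item["file_size"], item["id"]) for item in update_data]
cursor.executemany("UPDATE media SET file_size = %s WHERE id = %s", params)
db.commit()
print(f"Applied {len(params)} updates in one batch.")

cursor.close()
db.close()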
@ -1,31 +0,0 @@
from BunnyCDN.Storage import Storage
import config, os

db, cursor = config.gen_connection()
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

cursor.execute("SELECT id, media_id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()
print(f"Found {len(results)} files to process.")

cacheDir = 'cache'

for result in results:
    itemID = result[0]

    mediaURL = result[2]
    serverPath = mediaURL.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        continue

    file_size = os.path.getsize(localFilePath)

    cursor.execute("UPDATE media SET file_size = %s WHERE id = %s;", (file_size, itemID))
    db.commit()

    print(f"Processed ID {itemID}: updated with file size {file_size}.")

cursor.close()
db.close()
@ -1,112 +0,0 @@
from BunnyCDN.Storage import Storage
from PIL import Image
import os, uuid, cv2, config

def scan_dupes(folder_path):
    for root, dirs, files in os.walk(folder_path):
        for folder in dirs:
            folder_path = os.path.join(root, folder)
            for filename in os.listdir(folder_path):
                media_id = filename.replace('.mp4', '').replace('.jpg', '')
                filepath = os.path.join(folder_path, filename)
                if media_id:
                    try:
                        if int(media_id) in existing_files:
                            print('Duplicate')
                            os.remove(filepath)
                    except ValueError:
                        # Filename stem is not a numeric media id; leave the file alone.
                        pass

def clean_empty_folders(directory):
    for foldername, subfolders, filenames in os.walk(directory, topdown=False):
        for subfolder in subfolders:
            folder_path = os.path.join(foldername, subfolder)
            if not os.listdir(folder_path):
                os.rmdir(folder_path)
                print(f"Removed empty folder: {folder_path}")

def upload_file(filepath, username, media_id=None, media_type='image', post_type='story'):
    filename = os.path.basename(filepath)
    file_extension = filename.split('.')[-1]

    try:
        if int(media_id) in existing_files:
            print('Duplicate')
            os.remove(filepath)
            return True
    except (TypeError, ValueError):
        # No usable numeric media id; fall back to a random one.
        media_id = uuid.uuid4().hex

    dirtype = 'stories' if post_type == 'story' else 'posts'
    server_path = f'users/{dirtype}/{username}/{media_id}.{file_extension}'

    obj_storage.PutFile(filepath, server_path)

    file_url = f"https://storysave.b-cdn.net/{server_path}"

    if media_type == 'image':
        with Image.open(filepath) as img:
            width, height = img.size
    else:
        width, height = get_video_dimensions(filepath)

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type) VALUES (%s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, media_id, post_type)
    newCursor.execute(query, values)
    newDB.commit()

    os.remove(filepath)
    print(f'[{newCursor.rowcount}]{os.path.basename(filepath)} {file_url}')


def get_video_dimensions(video_path):
    cap = cv2.VideoCapture(video_path)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    cap.release()
    return width, height


def get_media_type(filename):
    if filename.lower().endswith((".jpg", ".webp", ".jpeg", ".png", ".gif")):
        return 'image'
    if filename.lower().endswith((".mp4", ".mov")):
        return 'video'


def dump_instagram(folder_path):
    for root, dirs, files in os.walk(folder_path):
        for folder in dirs:
            username = folder
            folder_path = os.path.join(root, folder)

            post_type = 'story' if folder_path.split('\\')[0] == 'stories' else 'post'

            for filename in os.listdir(folder_path):
                media_id = filename.replace('.mp4', '').replace('.jpg', '')
                filepath = os.path.join(folder_path, filename)
                mediatype = get_media_type(filename)
                upload_file(username=username, media_type=mediatype, filepath=filepath, media_id=media_id, post_type=post_type)

if __name__ == '__main__':
    print('Starting processing...')

    newDB, newCursor = config.gen_connection()

    obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

    newCursor.execute("SELECT media_id FROM media")
    existing_files = [image[0] for image in newCursor.fetchall()]

    scan_dupes('media/posts')
    scan_dupes('media/stories')
    scan_dupes('StorySave/')

    dump_instagram('media/posts')
    dump_instagram('media/stories')
    dump_instagram('StorySave/')

    clean_empty_folders('media/posts')
    clean_empty_folders('media/stories')
    clean_empty_folders('StorySave/')

    print("Processing completed.")
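One small change worth noting for both `scan_dupes` and `upload_file` above: `existing_files` is a plain list, so every duplicate check scans it linearly. Loading the ids into a set keeps the same membership check but makes each lookup constant-time; a sketch:

newCursor.execute("SELECT media_id FROM media")
# A set makes `int(media_id) in existing_files` an O(1) lookup
# instead of scanning the whole list for every file.
existing_files = {row[0] for row in newCursor.fetchall()}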
@ -1,33 +0,0 @@
import bunny, json

medias = json.load(open('videos.json', 'r'))
videoIDS = [media['url'].split('/')[-1] for media in medias]

videos = bunny.list_videos()

with open('allVideos.json', 'w') as f:
    json.dump(videos, f, indent=4)

missingVideos = []
for video in videos:
    if video['guid'] in videoIDS:
        continue
    missingVideos.append(video)

datas = []
for video in missingVideos:
    data = {
        'guid': video['guid'],
        'title': video['title'],
        'length': video['length'],
        'width': video['width'],
        'height': video['height'],
        'availableResolutions': video['availableResolutions'],
        'storageSize': video['storageSize'],
        'hasMP4Fallback': video['hasMP4Fallback'],
        'category': video['category'],
    }
    datas.append(data)

with open('missing_videos.json', 'w') as f:
    json.dump(datas, f, indent=4)
@ -1,27 +0,0 @@
from BunnyCDN.Storage import Storage
import os, json

altpins_obj_storage = Storage('577cb82d-8176-4ccf-935ce0a574bf-fe4c-4012', 'altpins')
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')

medias = json.load(open('db_pins.json', 'r'))

count = 0
print(f"Found {len(medias)} files to process.")

cacheDir = 'old_altpins_cache'
for media in medias:
    count += 1
    username = media['title']
    mediaID = media['photo_id']
    mediaURL = media['url']
    extension = mediaURL.split('.')[-1]

    serverPath = mediaURL.replace("https://altpins.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(cacheDir, os.path.basename(serverPath))

    if os.path.exists(localFilePath):
        continue

    altpins_obj_storage.DownloadFile(storage_path=serverPath, download_path=cacheDir)
    print(f"Downloaded {count}/{len(medias)}: {localFilePath}")
@ -1,16 +0,0 @@
import json, bunny, os
from concurrent.futures import ThreadPoolExecutor

medias = json.load(open('missing_videos.json', 'r'))
#videoIDS = [media['url'].split('/')[-1] for media in medias]
videoIDS = [media['guid'] for media in medias]

with ThreadPoolExecutor(max_workers=10) as executor:
    for id in videoIDS:
        filePath = f"MISSING_STREAM_VIDEOS/{id}.zip"

        if os.path.exists(filePath):
            print(f'Video already exists as {filePath}. Skipping...')
            continue

        executor.submit(bunny.download_video, id)
@ -1,29 +0,0 @@
import os, json, config

# Load the data
pins = json.load(open('db_pins.json', 'r'))
files = os.listdir('STORAGE_IMPORTED/')

db, cursor = config.gen_connection()

cursor.execute('SELECT hash FROM media WHERE hash IS NOT NULL;')
existing_hashes = set(row[0] for row in cursor.fetchall())

# Drop pins whose hash is already in the database.
# (Filtering into a new list avoids the skipped-items bug that comes from
# removing entries from a list while iterating over it.)
pins = [pin for pin in pins if pin['hash'] not in existing_hashes]

alreadyImported = []
remaining = []
for pin in pins:
    filepath = pin['filepath']
    username = pin['title']
    filename = os.path.basename(filepath)

    if filename in files:
        print(f"Found {filename} in the imported folder.")
        alreadyImported.append(pin)
    else:
        remaining.append(pin)
pins = remaining

# Save to the file
json.dump(pins, open('db_pins.json', 'w'))
json.dump(alreadyImported, open('db_pins_imported.json', 'w'))
@ -1,14 +0,0 @@
import os, json, bunny

medias = json.load(open('allVideos.json', 'r'))
mp4Medias = [media for media in medias if media['hasMP4Fallback'] == True]

missing = json.load(open('missing_videos.json', 'r'))

count = 0
cacheDir = 'old_mp4fallback_cache'
print(f"Found {len(medias)} files to process.")
for media in mp4Medias:
    count += 1
    filePath = os.path.join(cacheDir, media['guid'] + '.mp4')
@ -1,36 +0,0 @@
import os, json, bunny, config

db, cursor = config.gen_connection()

cursor.execute('SELECT media_id FROM media WHERE media_id IS NOT NULL;')
mediaIDS = cursor.fetchall()

pins = json.load(open('pins.json', 'r'))

videos = json.load(open('db_videos.json', 'r'))
pins = json.load(open('db_pins.json', 'r'))
ids = set(video['id'] for video in videos)

# Drop pins already present in db_videos.
# (Filtering into a new list avoids removing items while iterating.)
pins = [pin for pin in pins if pin['id'] not in ids]

# save to the file
json.dump(pins, open('db_pins.json', 'w'))

medias = json.load(open('allVideos.json', 'r'))
mp4Medias = [media for media in medias if media['hasMP4Fallback'] == True]

missing = json.load(open('missing_videos.json', 'r'))

count = 0
cacheDir = 'old_mp4fallback_cache'
print(f"Found {len(medias)} files to process.")
for media in mp4Medias:
    count += 1
    filePath = os.path.join(cacheDir, media['guid'] + '.mp4')
@ -1,53 +0,0 @@
import os, json, funcs

STORAGE_IMPORTED = 'STORAGE_IMPORTED'
pins = json.load(open('db_pins.json', 'r'))

for pin in pins:
    filename = pin['url'].split('/')[-1]
    filepath = os.path.join(STORAGE_IMPORTED, filename)
    pin['filename'] = filename
    if not pin['hash']:
        pin['hash'] = funcs.calculate_file_hash(filepath)

json.dump(pins, open('db_pins.json', 'w'), indent=4)

files = os.listdir(STORAGE_IMPORTED)

for file in files:
    filepath = os.path.join(STORAGE_IMPORTED, file)
    if os.path.isdir(filepath):
        continue
    fileHash = funcs.calculate_file_hash(filepath)
    extension = os.path.splitext(file)[1]
    if fileHash not in file:
        # Keep the extension so the move step below, which looks for
        # '{hash}.{extension}', can still find the renamed file.
        print(f'Renaming {file} to {fileHash}{extension}')
        os.rename(filepath, os.path.join(STORAGE_IMPORTED, f'{fileHash}{extension}'))

pins_by_username = {}
for pin in pins:
    username = pin['title']
    if username not in pins_by_username:
        pins_by_username[username] = []
    pins_by_username[username].append(pin)

for username, username_pins in pins_by_username.items():
    username_folder = os.path.join(STORAGE_IMPORTED, username)
    os.makedirs(username_folder, exist_ok=True)
    for pin in username_pins:
        photo_id = pin['photo_id']
        photo_url = pin['url']
        fileHash = pin['hash']

        if not fileHash:
            continue

        extension = photo_url.split('.')[-1]
        filename = f'{fileHash}.{extension}'

        filePath = os.path.join(STORAGE_IMPORTED, filename)
        outputPath = os.path.join(STORAGE_IMPORTED, username, filename)

        if os.path.exists(outputPath):
            print(f'File {outputPath} already exists. Skipping...')
            continue

        print(f'Moving {photo_url} to {outputPath}')
        os.rename(filePath, outputPath)
@ -1,57 +0,0 @@
import os
import hashlib

# Directories
fucked_dir = 'tiktoks/fucked/aleksandra'
source_dir = 'tiktoks/waiting_for_process/aleksandraverse'

def hash_file(filepath):
    """Generate MD5 hash of a file."""
    hash_md5 = hashlib.md5()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def get_file_hashes(directory):
    """Generate a dictionary of file hashes for all files in a directory."""
    file_hashes = {}
    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            file_hashes[file_path] = hash_file(file_path)
    return file_hashes

def files_are_identical(file1, file2):
    """Compare two files byte-by-byte."""
    with open(file1, "rb") as f1, open(file2, "rb") as f2:
        while True:
            chunk1 = f1.read(4096)
            chunk2 = f2.read(4096)
            if chunk1 != chunk2:
                return False
            if not chunk1:  # End of file
                return True

def remove_duplicates(fucked_dir, source_files):
    """Remove files in 'fucked' that are identical to those in 'source_files'."""
    for root, _, files in os.walk(fucked_dir):
        for file in files:
            file_path = os.path.join(root, file)
            for source_file in source_files:
                if files_are_identical(file_path, source_file):
                    print(f"Duplicate found. Removing: {file_path}")
                    os.remove(file_path)
                    break

def main():
    print("Scanning source directory for hashes...")
    source_hashes = get_file_hashes(source_dir)

    print("Scanning 'fucked' directory for duplicates...")
    remove_duplicates(fucked_dir, source_hashes)

    print("Cleanup complete.")

if __name__ == "__main__":
    main()
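`remove_duplicates` receives the hash dictionary built by `get_file_hashes` but then falls back to byte-by-byte comparison against every source file, which is quadratic in the number of files. Since the MD5 digests are already computed, a set of digests is enough to spot duplicates; a sketch of that variant, reusing `hash_file` from the same script:

def remove_duplicates_by_hash(fucked_dir, source_hashes):
    """Remove files in fucked_dir whose MD5 digest matches any source file."""
    known_digests = set(source_hashes.values())
    for root, _, files in os.walk(fucked_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if hash_file(file_path) in known_digests:
                print(f"Duplicate found. Removing: {file_path}")
                os.remove(file_path)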
@ -1,49 +0,0 @@
import json, os
from videohash import VideoHash
from moviepy.editor import VideoFileClip

def is_valid_video(file_path):
    try:
        with VideoFileClip(file_path) as video:
            return True
    except Exception as e:
        print(f"Invalid video {file_path}: {str(e)}")
        return False

def load_hashes(file_path):
    try:
        with open(file_path, 'r') as file:
            return json.load(file)
    except FileNotFoundError:
        return {}

def save_hashes(hashes, file_path):
    with open(file_path, 'w') as file:
        json.dump(hashes, file, indent=4)

hashes = load_hashes('video_hashes.json')
video_directory = 'STORAGE'

for username in os.listdir(video_directory):
    user_dir = os.path.join(video_directory, username)
    if not os.path.isdir(user_dir):
        continue

    for video_file in os.listdir(user_dir):
        video_path = os.path.join(user_dir, video_file)
        if not video_file.endswith(('.mp4', '.mkv', '.avi')) or not is_valid_video(video_path):
            continue

        if username in hashes and any(v[0] == video_file for v in hashes[username]):
            continue

        try:
            video_hash = VideoHash(path=video_path)
            if username in hashes:
                hashes[username].append((video_file, video_hash.hash))
            else:
                hashes[username] = [(video_file, video_hash.hash)]
        except Exception as e:
            print(f"Error processing {video_file}: {str(e)}")

save_hashes(hashes, 'video_hashes.json')
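The stored hashes are only ever checked for exact filename matches above; to actually use them for near-duplicate detection, the bit strings that `VideoHash.hash` produces can be compared by Hamming distance. A sketch, assuming two stored hash strings of equal length (the threshold is a tunable assumption, not something prescribed by the videohash library):

def hamming_distance(hash_a, hash_b):
    """Number of differing characters (bits) between two equal-length hash strings."""
    return sum(a != b for a, b in zip(hash_a, hash_b))

def looks_like_duplicate(hash_a, hash_b, threshold=8):
    # Small distances mean the videos are visually similar.
    return hamming_distance(hash_a, hash_b) <= threshold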
@ -1,17 +0,0 @@
import os, config, funcs

db, cursor = config.gen_connection()

cursor.execute("SELECT phash FROM media WHERE phash IS NOT NULL")
phashes = set([x[0] for x in cursor.fetchall()])

files = funcs.get_files("check_if_exists")

for file in files:
    image_phash = funcs.generate_phash(file)

    if image_phash in phashes:
        print(f"File {file} exists in the database")
        os.remove(file)

funcs.cleanEmptyFolders("check_if_exists")
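The exact-match lookup above only catches images whose perceptual hash is identical to a stored one; recompressed or slightly resized copies will slip through. If `generate_phash` stores imagehash-style hex strings, a distance-based check could look like the sketch below (the threshold is an assumption to tune, not a library default):

import imagehash

def is_near_duplicate(candidate_hex, known_hexes, max_distance=5):
    """Return True if candidate is within max_distance bits of any known hash."""
    candidate = imagehash.hex_to_hash(candidate_hex)
    return any(candidate - imagehash.hex_to_hash(known) <= max_distance
               for known in known_hexes)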
@ -1,159 +0,0 @@
|
||||
from snapchat import get_data, get_stories, get_highlight_stories
|
||||
from datetime import datetime
|
||||
import requests
|
||||
import config
|
||||
import json
|
||||
import os
|
||||
|
||||
"""
|
||||
media_url_filename = url.split('/')[-1].split('?')[0]
|
||||
etag = response.headers.get('ETag', '').replace('"', '')
|
||||
filename = f"{username}~{timestamp}-{media_url_filename}~{etag}{extension}"
|
||||
filepath = os.path.join(directory, 'highlights', filename)
|
||||
"""
|
||||
|
||||
directory = "snapchat"
|
||||
data_directory = "data"
|
||||
|
||||
def get_existing_snap_ids(directory):
|
||||
existing_snap_ids = set()
|
||||
for root, _, files in os.walk(directory):
|
||||
for file in files:
|
||||
if '~' not in file:
|
||||
continue
|
||||
|
||||
filename, _ = os.path.splitext(file)
|
||||
snap_id = filename.split('~')[2]
|
||||
existing_snap_ids.add(snap_id)
|
||||
return existing_snap_ids
|
||||
|
||||
def find_duplicate_snap(existing_snaps, snap_id, username):
|
||||
for snap in existing_snaps:
|
||||
if username == snap[2]:
|
||||
if snap_id in snap[1]:
|
||||
return snap
|
||||
return False
|
||||
|
||||
def archive_data(data, username):
|
||||
data_filename = f"{username}~{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"
|
||||
data_filepath = os.path.join(data_directory, data_filename)
|
||||
with open(data_filepath, 'w') as f:
|
||||
f.write(json.dumps(data))
|
||||
print(f"Archived data for {username} at {data_filepath}")
|
||||
|
||||
def get_file_extension(url):
|
||||
response = requests.head(url)
|
||||
if response.status_code != 200:
|
||||
print(f"Failed to access media {url}")
|
||||
return None
|
||||
|
||||
content_type = response.headers.get('Content-Type', '')
|
||||
if 'image' in content_type:
|
||||
return '.jpg'
|
||||
elif 'video' in content_type:
|
||||
return '.mp4'
|
||||
else:
|
||||
print(f"Unknown content type for media {url}")
|
||||
return None
|
||||
|
||||
def extract_file_type(url):
|
||||
file_types = {
|
||||
'400': '.jpg',
|
||||
'1322': '.mp4',
|
||||
'1325': '.mp4',
|
||||
'1034': '.mp4',
|
||||
'1023': '.jpg'
|
||||
}
|
||||
|
||||
base_url = url.split("?")[0] # Remove query string
|
||||
|
||||
snap_data = base_url.split('/')[-1]
|
||||
|
||||
# Extract the file type number
|
||||
data_parts = snap_data.split('.')
|
||||
if len(data_parts) > 1:
|
||||
file_type_number = data_parts[1]
|
||||
if file_type_number in file_types:
|
||||
return file_types[file_type_number]
|
||||
else:
|
||||
print(f"Unexpected URL format: {base_url}")
|
||||
return None
|
||||
|
||||
def download_media(url, filepath):
|
||||
if os.path.exists(filepath):
|
||||
print(f"File {filepath} already exists. Skipping download.")
|
||||
return filepath
|
||||
|
||||
response = requests.get(url)
|
||||
if response.status_code != 200:
|
||||
print(f"Failed to download media {url}")
|
||||
return None
|
||||
|
||||
with open(filepath, 'wb') as f:
|
||||
f.write(response.content)
|
||||
return filepath
|
||||
|
||||
def main():
|
||||
if not os.path.exists(directory):
|
||||
os.makedirs(directory)
|
||||
|
||||
db, cursor = config.gen_connection()
|
||||
|
||||
cursor.execute("SELECT username FROM following WHERE platform = 'snapchat'")
|
||||
usernames = [row[0] for row in cursor.fetchall()]
|
||||
|
||||
cursor.execute("SELECT id, filename, username FROM media WHERE filename IS NOT NULL AND platform = 'snapchat'")
|
||||
existing_medias = cursor.fetchall()
|
||||
|
||||
existing_snap_ids = get_existing_snap_ids(directory)
|
||||
|
||||
for username in usernames:
|
||||
print(f"Getting stories for {username}...")
|
||||
data = get_data(username)
|
||||
if not data:
|
||||
continue
|
||||
|
||||
archive_data(data, username)
|
||||
|
||||
print("Getting stories...")
|
||||
stories = get_stories(data)
|
||||
|
||||
print("Getting highlights...")
|
||||
stories.extend(get_highlight_stories(data))
|
||||
|
||||
for story in stories:
|
||||
snap_id = story['snap_id']
|
||||
url = story['url']
|
||||
timestamp = story['timestamp']
|
||||
|
||||
duplicate_snap = find_duplicate_snap(existing_medias, snap_id, username)
|
||||
if duplicate_snap:
|
||||
print(f"Media {snap_id} already exists. Skipping download.")
|
||||
continue
|
||||
|
||||
# Check if media already exists
|
||||
if snap_id in existing_snap_ids:
|
||||
print(f"Media {snap_id} already exists. Skipping download.")
|
||||
continue
|
||||
|
||||
# Determine file extension using HEAD request.
|
||||
# TODO: find a better way to determine file extension without downloading the file.
|
||||
extension = extract_file_type(url)
|
||||
if not extension:
|
||||
continue
|
||||
|
||||
filename = f"{username}~{timestamp}~{snap_id}{extension}"
|
||||
filepath = os.path.join(directory, filename)
|
||||
|
||||
# Check if file already exists
|
||||
if os.path.exists(filepath):
|
||||
print(f"File {filename} already exists. Skipping download.")
|
||||
continue
|
||||
|
||||
# Download the media
|
||||
filepath = download_media(url, filepath)
|
||||
|
||||
print(f"Downloaded {filename} at {timestamp}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,154 +0,0 @@
|
||||
from datetime import datetime
|
||||
import config
|
||||
import funcs
|
||||
import cv2
|
||||
import os
|
||||
|
||||
directory = 'media/instagram/'
|
||||
|
||||
def UploadMedia(media):
|
||||
media_id = media['media_id']
|
||||
username = media['username']
|
||||
post_date = media['timestamp']
|
||||
user_id = media['user_id']
|
||||
filepath = media['filepath']
|
||||
highlight_id = media['highlight_id']
|
||||
post_type = media['post_type']
|
||||
thumbnail_url = None
|
||||
phash = None
|
||||
|
||||
if media_id and int(media_id) in existing_files:
|
||||
print('Duplicate file detected. Removing...')
|
||||
os.remove(filepath)
|
||||
return True
|
||||
|
||||
filename = os.path.basename(filepath)
|
||||
file_extension = os.path.splitext(filename)[1].lower()
|
||||
|
||||
media_type = funcs.get_media_type(filename)
|
||||
|
||||
file_hash = funcs.calculate_file_hash(filepath)
|
||||
|
||||
width, height = funcs.get_media_dimensions(filepath)
|
||||
|
||||
duration = funcs.get_video_duration(filepath)
|
||||
|
||||
if media_type == 'video':
|
||||
try:
|
||||
thumbPath = f'temp/{media_id}.jpg'
|
||||
cap = cv2.VideoCapture(filepath)
|
||||
ret, frame = cap.read()
|
||||
cv2.imwrite(thumbPath, frame)
|
||||
cap.release()
|
||||
obj_storage.PutFile(thumbPath, f'thumbnails/{media_id}.jpg') # slower
|
||||
thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{media_id}.jpg"
|
||||
phash = funcs.generate_phash(thumbPath)
|
||||
os.remove(thumbPath)
|
||||
except:
|
||||
print('Error generating thumbnail. Skipping...')
|
||||
return False
|
||||
elif media_type == 'image':
|
||||
phash = funcs.generate_phash(filepath)
|
||||
|
||||
if media_id:
|
||||
newFilename = f'{media_id}{file_extension}'
|
||||
else:
|
||||
newFilename = f'{file_hash}{file_extension}'
|
||||
|
||||
server_path = f'media/{post_type}/{username}/{newFilename}'
|
||||
|
||||
file_url = f"https://storysave.b-cdn.net/{server_path}"
|
||||
|
||||
obj_storage.PutFile(filepath, server_path) # slow as fuck
|
||||
|
||||
if highlight_id:
|
||||
newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)", (highlight_id, user_id, media_id))
|
||||
newDB.commit()
|
||||
print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')
|
||||
|
||||
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
|
||||
values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, 'instagram')
|
||||
|
||||
newCursor.execute(query, values) # slower
|
||||
newDB.commit()
|
||||
print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
|
||||
|
||||
os.remove(filepath)
|
||||
|
||||
return True
|
||||
|
||||
def get_user_id(username):
|
||||
username = username.lower()
|
||||
if username in existing_users:
|
||||
return existing_users[username]
|
||||
|
||||
return None
|
||||
|
||||
def get_media():
|
||||
medias = []
|
||||
post_types = {
|
||||
'posts': 'post',
|
||||
'stories': 'story',
|
||||
'profile': 'profile',
|
||||
}
|
||||
|
||||
for post_type in os.listdir(directory):
|
||||
users_dir = os.path.join(directory, post_type)
|
||||
if not os.path.isdir(users_dir):
|
||||
continue
|
||||
users = os.listdir(users_dir)
|
||||
|
||||
for username in users:
|
||||
user_path = os.path.join(directory, post_type, username)
|
||||
if not os.path.isdir(user_path):
|
||||
continue
|
||||
for filename in os.listdir(user_path):
|
||||
if filename.startswith('.'):
|
||||
continue
|
||||
|
||||
data = {}
|
||||
filepath = os.path.join(user_path, filename)
|
||||
|
||||
if 'com.instagram.android__' in filename:
|
||||
timestamp_str = filename.split('__')[-1].split('.')[0]
|
||||
data['timestamp'] = datetime.strptime(timestamp_str, '%Y%m%d%H%M%S%f')
|
||||
else:
|
||||
data['timestamp'] = datetime.now()
|
||||
|
||||
data['post_type'] = post_types[post_type]
|
||||
data['username'] = username
|
||||
data['filepath'] = filepath
|
||||
data['media_id'] = None
|
||||
data['user_id'] = get_user_id(data['username'])
|
||||
data['highlight_id'] = None
|
||||
medias.append(data)
|
||||
|
||||
return medias
|
||||
|
||||
def dump_instagram():
|
||||
medias = get_media()
|
||||
|
||||
for media in medias:
|
||||
UploadMedia(media)
|
||||
existing_files.append(media['media_id'])
|
||||
|
||||
if __name__ == '__main__':
|
||||
print('Starting processing...')
|
||||
|
||||
if not os.listdir(directory):
|
||||
print('No files to process. Exiting...')
|
||||
exit()
|
||||
|
||||
newDB, newCursor = config.gen_connection()
|
||||
|
||||
obj_storage = config.get_storage()
|
||||
|
||||
newCursor.execute("SELECT media_id FROM media WHERE media_id IS NOT NULL")
|
||||
existing_files = [image[0] for image in newCursor.fetchall()]
|
||||
|
||||
newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
|
||||
existing_users = {user[0].lower(): user[1].lower() for user in newCursor.fetchall()}
|
||||
|
||||
dump_instagram()
|
||||
|
||||
print("Processing completed.")
|
||||
@ -1,34 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Image Gallery</title>
    <style>
        .gallery {
            display: flex;
            flex-wrap: wrap;
        }
        .gallery img {
            margin: 10px;
            max-width: 200px;
            height: auto;
        }
        .gallery div {
            text-align: center;
            margin: 10px;
        }
    </style>
</head>
<body>
    <h1>Image Gallery</h1>
    <div class="gallery">
        {% for image in images %}
        <div>
            <h3>{{ image['username'] }}</h3>
            <img src="{{ image['media_url'] }}" alt="Image for {{ image['username'] }}">
        </div>
        {% endfor %}
    </div>
</body>
</html>
@ -1,84 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Media Gallery</title>
|
||||
<style>
|
||||
body {
|
||||
display: flex;
|
||||
justify-content: center;
|
||||
}
|
||||
.container {
|
||||
max-width: 1600px;
|
||||
width: 100%;
|
||||
padding: 20px;
|
||||
}
|
||||
.media-container {
|
||||
column-count: 4;
|
||||
column-gap: 10px;
|
||||
}
|
||||
.media-item {
|
||||
break-inside: avoid;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
img, video {
|
||||
width: 100%;
|
||||
height: auto;
|
||||
display: block;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<h1>Media Gallery</h1>
|
||||
<div class="media-container" id="media-container"></div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
let page = 0;
|
||||
|
||||
async function loadMore() {
|
||||
const response = await fetch(`/load-more?page=${page}`);
|
||||
const mediaFiles = await response.json();
|
||||
const container = document.getElementById('media-container');
|
||||
|
||||
mediaFiles.forEach(file => {
|
||||
const mediaItem = document.createElement('div');
|
||||
mediaItem.className = 'media-item';
|
||||
|
||||
if (file.endsWith('.png') || file.endsWith('.jpg') || file.endsWith('.jpeg') || file.endsWith('.gif')) {
|
||||
const img = document.createElement('img');
|
||||
img.src = `/media/${file}`;
|
||||
img.alt = file;
|
||||
mediaItem.appendChild(img);
|
||||
} else if (file.endsWith('.mp4') || file.endsWith('.mkv') || file.endsWith('.mov')) {
|
||||
const video = document.createElement('video');
|
||||
video.controls = false;
|
||||
video.autoplay = true;
|
||||
video.muted = true;
|
||||
video.loop = true;
|
||||
const source = document.createElement('source');
|
||||
source.src = `/media/${file}`;
|
||||
source.type = 'video/mp4';
|
||||
video.appendChild(source);
|
||||
mediaItem.appendChild(video);
|
||||
}
|
||||
|
||||
container.appendChild(mediaItem);
|
||||
});
|
||||
|
||||
page += 1;
|
||||
}
|
||||
|
||||
window.addEventListener('scroll', () => {
|
||||
if (window.innerHeight + window.scrollY >= document.body.offsetHeight) {
|
||||
loadMore();
|
||||
}
|
||||
});
|
||||
|
||||
// Initial load
|
||||
loadMore();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@ -1,32 +0,0 @@
from flask import Flask, render_template, send_from_directory, jsonify, request
import os

app = Flask(__name__)
media_dir = 'storysaver'
MEDIA_PER_PAGE = 20

def get_media_files(start, count):
    media_files = []
    for root, dirs, files in os.walk(media_dir):
        for filename in files:
            if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.mp4', '.mkv', '.mov')):
                file_path = os.path.relpath(os.path.join(root, filename), media_dir)
                media_files.append(file_path)
    return media_files[start:start + count]

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/media/<path:filename>')
def media(filename):
    return send_from_directory(media_dir, filename)

@app.route('/load-more')
def load_more():
    page = int(request.args.get('page', 0))
    media_files = get_media_files(page * MEDIA_PER_PAGE, MEDIA_PER_PAGE)
    return jsonify(media_files)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=True)
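`get_media_files` rescans the whole media tree on every `/load-more` request, so the cost grows with both library size and scroll depth. One way to avoid that, sketched below, is to build the list once and reuse it, at the cost of new files only appearing after a restart or an explicit cache clear:

from functools import lru_cache

@lru_cache(maxsize=1)
def all_media_files():
    """Walk the media directory once and cache the resulting list."""
    media_files = []
    for root, dirs, files in os.walk(media_dir):
        for filename in files:
            if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.mp4', '.mkv', '.mov')):
                media_files.append(os.path.relpath(os.path.join(root, filename), media_dir))
    return media_files

def get_media_files(start, count):
    return all_media_files()[start:start + count]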
@ -1,133 +0,0 @@
|
||||
from BunnyCDN.Storage import Storage
|
||||
from PIL import Image
|
||||
import os, uuid, cv2, config
|
||||
import hashlib
|
||||
from moviepy.editor import VideoFileClip
|
||||
|
||||
def scan_dupes(folder_path):
|
||||
newCursor.execute("SELECT hash FROM media")
|
||||
existing_files = [image[0] for image in newCursor.fetchall()]
|
||||
|
||||
for root, dirs, files in os.walk(folder_path):
|
||||
for folder in dirs:
|
||||
folder_path = os.path.join(root, folder)
|
||||
for filename in os.listdir(folder_path):
|
||||
media_id = filename.replace('.mp4', '').replace('.jpg', '')
|
||||
filepath = os.path.join(folder_path, filename)
|
||||
if media_id:
|
||||
fileHash = calculate_file_hash(filepath)
|
||||
if fileHash in existing_files:
|
||||
print(f'Duplicate')
|
||||
os.remove(filepath)
|
||||
|
||||
def clean_empty_folders(directory):
|
||||
for foldername, subfolders, filenames in os.walk(directory, topdown=False):
|
||||
for subfolder in subfolders:
|
||||
folder_path = os.path.join(foldername, subfolder)
|
||||
if not os.listdir(folder_path):
|
||||
os.rmdir(folder_path)
|
||||
print(f"Removed empty folder: {folder_path}")
|
||||
|
||||
def upload_file(filepath, username, media_type='image', post_type = 'story'):
|
||||
filename = os.path.basename(filepath)
|
||||
file_extension = filename.split('.')[-1]
|
||||
dirtype = 'stories' if post_type == 'story' else 'posts'
|
||||
|
||||
#dirtype = 'profile'
|
||||
|
||||
fileHash = calculate_file_hash(filepath)
|
||||
|
||||
try:
|
||||
if int(media_id) in existing_files:
|
||||
print(f'Duplicate')
|
||||
os.remove(filepath)
|
||||
return True
|
||||
except: media_id = uuid.uuid4().hex
|
||||
|
||||
server_path = f'users/{dirtype}/{username}/{media_id}.{file_extension}'
|
||||
|
||||
obj_storage.PutFile(filepath, server_path)
|
||||
|
||||
file_url = f"https://storysave.b-cdn.net/{server_path}"
|
||||
|
||||
duration = 0
|
||||
if media_type == 'image':
|
||||
try:
|
||||
with Image.open(filepath) as img:
|
||||
width, height = img.size
|
||||
except:
|
||||
os.remove(filepath)
|
||||
return
|
||||
else:
|
||||
width, height = get_video_dimensions(filepath)
|
||||
duration = get_video_duration(filepath)
|
||||
|
||||
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, hash, filename, media_id, duration) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
|
||||
values = (username, media_type, file_url, width, height, post_type, fileHash, filename, media_id, duration)
|
||||
newCursor.execute(query, values)
|
||||
newDB.commit()
|
||||
|
||||
os.remove(filepath)
|
||||
print(f'[{newCursor.rowcount}]{os.path.basename(filepath)} {file_url}')
|
||||
|
||||
|
||||
def get_video_dimensions(video_path):
|
||||
cap = cv2.VideoCapture(video_path)
|
||||
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
||||
cap.release()
|
||||
return width, height
|
||||
|
||||
def get_video_duration(file_path):
|
||||
"""
|
||||
Returns the duration of the video file in seconds.
|
||||
|
||||
:param file_path: Path to the video file
|
||||
:return: Duration in seconds
|
||||
"""
|
||||
with VideoFileClip(file_path) as video:
|
||||
return video.duration
|
||||
|
||||
def get_media_type(filename):
|
||||
if filename.lower().endswith(".jpg") or filename.lower().endswith(".webp") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png") or filename.lower().endswith(".gif"):
|
||||
return 'image'
|
||||
if filename.lower().endswith(".mp4") or filename.lower().endswith(".mov"):
|
||||
return 'video'
|
||||
|
||||
|
||||
def dump_instagram(folder_path):
|
||||
for root, dirs, files in os.walk(folder_path):
|
||||
for folder in dirs:
|
||||
username = folder
|
||||
folder_path = os.path.join(root, folder)
|
||||
|
||||
post_type = 'post' if 'post' in folder_path.lower() else 'story'
|
||||
|
||||
for filename in os.listdir(folder_path):
|
||||
filepath = os.path.join(folder_path, filename)
|
||||
mediatype = get_media_type(filename)
|
||||
upload_file(username=username, media_type=mediatype, filepath=filepath, post_type=post_type)
|
||||
|
||||
def calculate_file_hash(file_path, hash_func='sha256'):
|
||||
h = hashlib.new(hash_func)
|
||||
|
||||
with open(file_path, 'rb') as file:
|
||||
chunk = 0
|
||||
while chunk != b'':
|
||||
chunk = file.read(8192)
|
||||
h.update(chunk)
|
||||
|
||||
return h.hexdigest()
|
||||
|
||||
if __name__ == '__main__':
|
||||
print('Starting processing...')
|
||||
|
||||
newDB, newCursor = config.gen_connection()
|
||||
|
||||
obj_storage = Storage('345697f9-d9aa-4a6b-a5ec8bffc16d-ceaf-453e', 'storysave')
|
||||
|
||||
storiesPath = 'StorySave/'
|
||||
|
||||
dump_instagram(storiesPath)
|
||||
|
||||
print("Processing completed.")
|
||||
@ -0,0 +1,424 @@
|
||||
import requests
|
||||
import hashlib
|
||||
|
||||
access_key = "ccd3f9d4-9e6f-4bd2-8f594402b5a7-3646-48fe"
|
||||
video_library_id = 106867
|
||||
|
||||
def create_video(title):
|
||||
url = f"https://video.bunnycdn.com/library/{video_library_id}/videos"
|
||||
|
||||
payload = f"{{\"title\":\"{title}\"}}"
|
||||
headers = {
|
||||
"accept": "application/json",
|
||||
"content-type": "application/*+json",
|
||||
"AccessKey": access_key
|
||||
}
|
||||
|
||||
response = requests.post(url, data=payload, headers=headers)
|
||||
|
||||
return response
|
||||
|
||||
def generate_signature(library_id, api_key, expiration_time, video_id):
|
||||
signature = hashlib.sha256((library_id + api_key + str(expiration_time) + video_id).encode()).hexdigest()
|
||||
return signature
|
||||
|
||||
def upload_video_process(file_path, video_id):
|
||||
url = f"https://video.bunnycdn.com/library/{video_library_id}/videos/{video_id}"
|
||||
|
||||
headers = {
|
||||
"accept": "application/json",
|
||||
"AccessKey": access_key
|
||||
}
|
||||
|
||||
with open(file_path, "rb") as file:
|
||||
file_data = file.read()
|
||||
|
||||
response = requests.put(url, headers=headers, data=file_data)
|
||||
|
||||
return response.status_code
|
||||
|
||||
def upload_video(file_path, title=None):
|
||||
video_item = create_video(title)
|
||||
if video_item.status_code != 200:
|
||||
return False
|
||||
|
||||
video_id = video_item.json()['guid']
|
||||
upload_video_process(file_path, video_id)
|
||||
|
||||
return {
|
||||
"embed_link": f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/playlist.m3u8",
|
||||
"animated_thumbnail": f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/preview.webp",
|
||||
"default_thumbnail": f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/thumbnail.jpg",
|
||||
}
|
||||
|
||||
|
||||
def upload_video_recurbate(videoInfo):
|
||||
title = f"{videoInfo['username']} {videoInfo['platform']}"
|
||||
video_item = create_video(title)
|
||||
if video_item.status_code != 200:
|
||||
return False
|
||||
|
||||
video_id = video_item.json()['guid']
|
||||
upload_video_process(videoInfo['filename'], video_id)
|
||||
|
||||
videoInfo["embed_link"] = f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/playlist.m3u8"
|
||||
videoInfo["animated_thumbnail"] = f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/preview.webp"
|
||||
videoInfo["default_thumbnail"] = f"https://vz-58ca89f1-986.b-cdn.net/{video_id}/thumbnail.jpg"
|
||||
|
||||
return True
|
||||
|
||||
def delete_video(video_id):
|
||||
video_id = video_id.replace('https://vz-58ca89f1-986.b-cdn.net/', '').replace('/playlist.m3u8', '')
|
||||
|
||||
url = f"https://video.bunnycdn.com/library/{video_library_id}/videos/{video_id}"
|
||||
|
||||
headers = {
|
||||
"accept": "application/json",
|
||||
"AccessKey": access_key
|
||||
}
|
||||
|
||||
response = requests.delete(url, headers=headers)
|
||||
|
||||
return response.status_code
|
||||
|
||||
def list_videos():
|
||||
url = f"https://video.bunnycdn.com/library/{video_library_id}/videos?page=1&itemsPerPage=2147483647&orderBy=date"
|
||||
|
||||
headers = {
|
||||
"accept": "application/json",
|
||||
"AccessKey": access_key
|
||||
}
|
||||
|
||||
response = requests.get(url, headers=headers)
|
||||
|
||||
return response.json()['items']
|
||||
|
||||
def get_heatmap(video_id):
|
||||
url = "https://video.bunnycdn.com/library/libraryId/videos/videoId/heatmap"
|
||||
url = url.replace('libraryId', str(video_library_id)).replace('videoId', str(video_id))
|
||||
|
||||
headers = {
|
||||
"accept": "application/json",
|
||||
"AccessKey": access_key
|
||||
}
|
||||
|
||||
response = requests.get(url, headers=headers).json()
|
||||
|
||||
return response
|
||||
|
||||
def get_video(video_id):
|
||||
url = "https://video.bunnycdn.com/library/libraryId/videos/videoId"
|
||||
url = url.replace('libraryId', str(video_library_id)).replace('videoId', str(video_id))
|
||||
|
||||
headers = {
|
||||
"accept": "application/json",
|
||||
"AccessKey": access_key
|
||||
}
|
||||
|
||||
response = requests.get(url, headers=headers).json()
|
||||
|
||||
return response
|
||||
|
||||
|
||||
import os
|
||||
import requests
|
||||
from requests.exceptions import HTTPError
|
||||
from urllib import parse
|
||||
|
||||
class Storage:
|
||||
def __init__(self, api_key, storage_zone, storage_zone_region="de"):
|
||||
"""
|
||||
Creates an object for using BunnyCDN Storage API
|
||||
Parameters
|
||||
----------
|
||||
api_key : String
|
||||
Your bunnycdn storage
|
||||
Apikey/FTP password of
|
||||
storage zone
|
||||
|
||||
storage_zone : String
|
||||
Name of your storage zone
|
||||
|
||||
storage_zone_region(optional parameter) : String
|
||||
The storage zone region code
|
||||
as per BunnyCDN
|
||||
"""
|
||||
self.headers = {
|
||||
# headers to be passed in HTTP requests
|
||||
"AccessKey": api_key,
|
||||
"Content-Type": "application/json",
|
||||
"Accept": "applcation/json",
|
||||
}
|
||||
|
||||
# applying constraint that storage_zone must be specified
|
||||
assert storage_zone != "", "storage_zone is not specified/missing"
|
||||
|
||||
# For generating base_url for sending requests
|
||||
if storage_zone_region == "de" or storage_zone_region == "":
|
||||
self.base_url = "https://storage.bunnycdn.com/" + storage_zone + "/"
|
||||
else:
|
||||
self.base_url = (
|
||||
"https://"
|
||||
+ storage_zone_region
|
||||
+ ".storage.bunnycdn.com/"
|
||||
+ storage_zone
|
||||
+ "/"
|
||||
)
|
||||
|
||||
def DownloadFile(self, storage_path, download_path=os.getcwd()):
|
||||
"""
|
||||
This function will get the files and subfolders of storage zone mentioned in path
|
||||
and download it to the download_path location mentioned
|
||||
Parameters
|
||||
----------
|
||||
storage_path : String
|
||||
The path of the directory
|
||||
(including file name and excluding storage zone name)
|
||||
from which files are to be retrieved
|
||||
download_path : String
|
||||
The directory on local server to which downloaded file must be saved
|
||||
Note:For download_path instead of '\' '\\' should be used example: C:\\Users\\XYZ\\OneDrive
|
||||
"""
|
||||
|
||||
assert (
|
||||
storage_path != ""
|
||||
), "storage_path must be specified" # to make sure storage_path is not null
|
||||
# to build correct url
|
||||
if storage_path[0] == "/":
|
||||
storage_path = storage_path[1:]
|
||||
if storage_path[-1] == "/":
|
||||
storage_path = storage_path[:-1]
|
||||
url = self.base_url + parse.quote(storage_path)
|
||||
file_name = url.split("/")[-1] # For storing file name
|
||||
|
||||
# to return appropriate help messages if file is present or not and download file if present
|
||||
try:
|
||||
response = requests.get(url, headers=self.headers, stream=True)
|
||||
response.raise_for_status()
|
||||
except HTTPError as http:
|
||||
return {
|
||||
"status": "error",
|
||||
"HTTP": response.status_code,
|
||||
"msg": f"Http error occured {http}",
|
||||
}
|
||||
except Exception as err:
|
||||
return {
|
||||
"status": "error",
|
||||
"HTTP": response.status_code,
|
||||
"msg": f"error occured {err}",
|
||||
}
|
||||
else:
|
||||
download_path = os.path.join(download_path, file_name)
|
||||
# Downloading file
|
||||
with open(download_path, "wb") as file:
|
||||
|
||||
for chunk in response.iter_content(chunk_size=1024):
|
||||
if chunk:
|
||||
file.write(chunk)
|
||||
return {
|
||||
"status": "success",
|
||||
"HTTP": response.status_code,
|
||||
"msg": "File downloaded Successfully",
|
||||
}
|
||||
|
||||
def PutFile(
|
||||
self,
|
||||
file_name,
|
||||
storage_path=None,
|
||||
local_upload_file_path=os.getcwd(),
|
||||
):
|
||||
|
||||
"""
|
||||
This function uploads files to your BunnyCDN storage zone
|
||||
Parameters
|
||||
----------
|
||||
storage_path : String
|
||||
The path of directory in storage zone
|
||||
(including the name of file as desired and excluding storage zone name)
|
||||
to which file is to be uploaded
|
||||
file_name : String
|
||||
The name of the file as stored in local server
|
||||
local_upload_file_path : String
|
||||
The path of file as stored in local server(excluding file name)
|
||||
from where file is to be uploaded
|
||||
Examples
|
||||
--------
|
||||
file_name : 'ABC.txt'
|
||||
local_upload_file_path : 'C:\\User\\Sample_Directory'
|
||||
storage_path : '<Directory name in storage zone>/<file name as to be uploaded on storage zone>.txt'
|
||||
#Here .txt because the file being uploaded in example is txt
|
||||
"""
|
||||
local_upload_file_path = os.path.join(local_upload_file_path, file_name)
|
||||
|
||||
# to build correct url
|
||||
if storage_path is not None and storage_path != "":
|
||||
if storage_path[0] == "/":
|
||||
storage_path = storage_path[1:]
|
||||
if storage_path[-1] == "/":
|
||||
storage_path = storage_path[:-1]
|
||||
url = self.base_url + parse.quote(storage_path)
|
||||
else:
|
||||
url = self.base_url + parse.quote(file_name)
|
||||
with open(local_upload_file_path, "rb") as file:
|
||||
file_data = file.read()
|
||||
response = requests.put(url, data=file_data, headers=self.headers)
|
||||
try:
|
||||
response.raise_for_status()
|
||||
except HTTPError as http:
|
||||
return {
|
||||
"status": "error",
|
||||
"HTTP": response.status_code,
|
||||
"msg": f"Upload Failed HTTP Error Occured: {http}",
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"status": "success",
|
||||
"HTTP": response.status_code,
|
||||
"msg": "The File Upload was Successful",
|
||||
}
|
||||
|
||||
def DeleteFile(self, storage_path=""):
|
||||
"""
|
||||
This function deletes a file or folder mentioned in the storage_path from the storage zone
|
||||
Parameters
|
||||
----------
|
||||
storage_path : The directory path to your file (including file name) or folder which is to be deleted.
|
||||
If this is the root of your storage zone, you can ignore this parameter.
|
||||
"""
|
||||
# Add code below
|
||||
assert (
|
||||
storage_path != ""
|
||||
), "storage_path must be specified" # to make sure storage_path is not null
|
||||
# to build correct url
|
||||
if storage_path[0] == "/":
|
||||
storage_path = storage_path[1:]
|
||||
url = self.base_url + parse.quote(storage_path)
|
||||
|
||||
try:
|
||||
response = requests.delete(url, headers=self.headers)
|
||||
response.raise_for_status()
|
||||
except HTTPError as http:
|
||||
return {
|
||||
"status": "error",
|
||||
"HTTP": response.raise_for_status(),
|
||||
"msg": f"HTTP Error occured: {http}",
|
||||
}
|
||||
except Exception as err:
|
||||
return {
|
||||
"status": "error",
|
||||
"HTTP": response.status_code,
|
||||
"msg": f"Object Delete failed ,Error occured:{err}",
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"status": "success",
|
||||
"HTTP": response.status_code,
|
||||
"msg": "Object Successfully Deleted",
|
||||
}
|
||||
|
||||
def GetStoragedObjectsList(self, storage_path=None):
|
||||
"""
|
||||
This functions returns a list of files and directories located in given storage_path.
|
||||
Parameters
|
||||
----------
|
||||
storage_path : The directory path that you want to list.
|
||||
"""
|
||||
# to build correct url
|
||||
if storage_path is not None:
|
||||
if storage_path[0] == "/":
|
||||
storage_path = storage_path[1:]
|
||||
if storage_path[-1] != "/":
|
||||
url = self.base_url + parse.quote(storage_path) + "/"
|
||||
else:
|
||||
url = self.base_url
|
||||
# Sending GET request
|
||||
try:
|
||||
response = requests.get(url, headers=self.headers)
|
||||
response.raise_for_status()
|
||||
except HTTPError as http:
|
||||
return {
|
||||
"status": "error",
|
||||
"HTTP": response.status_code,
|
||||
"msg": f"http error occured {http}",
|
||||
}
|
||||
else:
|
||||
storage_list = []
|
||||
for dictionary in response.json():
|
||||
temp_dict = {}
|
||||
for key in dictionary:
|
||||
if key == "ObjectName" and dictionary["IsDirectory"] is False:
|
||||
temp_dict["File_Name"] = dictionary[key]
|
||||
if key == "ObjectName" and dictionary["IsDirectory"]:
|
||||
temp_dict["Folder_Name"] = dictionary[key]
|
||||
storage_list.append(temp_dict)
|
||||
return storage_list
|
||||
|
||||
def MoveFile(self, old_path, new_path):
|
||||
"""
|
||||
Moves a file by downloading from the old path and uploading to the new path,
|
||||
then deleting from the old path. Uses existing PutFile and DeleteFile methods.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
old_path : str
|
||||
The current path (relative to storage zone root) of the file to move.
|
||||
new_path : str
|
||||
The new path (relative to storage zone root) for the file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict
|
||||
A dictionary containing 'status', 'msg', and optionally 'HTTP'.
|
||||
"""
|
||||
# Validate arguments
|
||||
if not old_path or not new_path:
|
||||
return {
|
||||
"status": "error",
|
||||
"msg": "Both old_path and new_path must be provided."
|
||||
}
|
||||
|
||||
# 1. Download from old_path to a temporary local directory
|
||||
# If you already have the file locally, you can skip this download step.
|
||||
download_response = self.DownloadFile(old_path, download_path="temp")
|
||||
if download_response.get("status") != "success":
|
||||
return {
|
||||
"status": "error",
|
||||
"msg": f"Failed to download file for moving. Reason: {download_response.get('msg', 'unknown')}",
|
||||
"HTTP": download_response.get("HTTP")
|
||||
}
|
||||
|
||||
# Extract the filename from old_path to know what we downloaded
|
||||
filename = os.path.basename(old_path)
|
||||
|
||||
# 2. Upload to new_path using existing PutFile
|
||||
# We'll assume new_path includes the desired filename. If it does not, adjust logic.
|
||||
put_response = self.PutFile(
|
||||
file_name=filename,
|
||||
storage_path=new_path, # e.g. "folder/newfile.jpg"
|
||||
local_upload_file_path="temp" # where we downloaded it
|
||||
)
|
||||
if put_response.get("status") != "success":
|
||||
return {
|
||||
"status": "error",
|
||||
"msg": f"Failed to upload file to new path. Reason: {put_response.get('msg', 'unknown')}",
|
||||
"HTTP": put_response.get("HTTP")
|
||||
}
|
||||
|
||||
# 3. Delete the original file using existing DeleteFile
|
||||
delete_response = self.DeleteFile(old_path)
|
||||
if delete_response.get("status") != "success":
|
||||
return {
|
||||
"status": "error",
|
||||
"msg": f"Failed to delete old file. Reason: {delete_response.get('msg', 'unknown')}",
|
||||
"HTTP": delete_response.get("HTTP")
|
||||
}
|
||||
|
||||
# (Optional) Clean up the local temp file
|
||||
local_temp_path = os.path.join("temp", filename)
|
||||
if os.path.exists(local_temp_path):
|
||||
os.remove(local_temp_path)
|
||||
|
||||
return {
|
||||
"status": "success",
|
||||
"msg": f"File successfully moved from '{old_path}' to '{new_path}'."
|
||||
}
|
||||
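A short usage sketch for the `MoveFile` helper added above. Note it assumes the Storage class defined in this file (the stock BunnyCDN package does not ship MoveFile); the key and paths are placeholders, not values from this repository:

# Assumes the Storage class defined above; key and paths are placeholders.
storage = Storage("YOUR_STORAGE_API_KEY", "storysave")

result = storage.MoveFile(
    old_path="users/stories/olduser/abc123.jpg",
    new_path="users/stories/newuser/abc123.jpg",
)
print(result["status"], result["msg"])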
@ -0,0 +1,145 @@
|
||||
import os
|
||||
from funcs import calculate_file_hash, get_media_dimensions, generate_phash
|
||||
import config
|
||||
|
||||
# --- Configuration & Constants ---
|
||||
BASE_URL = "https://cdn.altpins.com/"
|
||||
TEMP_DIR = os.path.join(os.getcwd(), 'temp')
|
||||
CACHE_DIR = os.path.join(os.getcwd(), 'cache')
|
||||
|
||||
os.makedirs(TEMP_DIR, exist_ok=True)
|
||||
os.makedirs(CACHE_DIR, exist_ok=True)
|
||||
|
||||
def normalize_server_path(media_url, replace_all=True):
|
||||
"""
|
||||
Remove the BASE_URL from media_url and normalize slashes.
|
||||
If replace_all is True, replace double slashes and backslashes.
|
||||
"""
|
||||
path = media_url.replace(BASE_URL, '')
|
||||
if replace_all:
|
||||
path = path.replace('//', '/').replace('\\', '/')
|
||||
else:
|
||||
path = path.replace('\\', '/')
|
||||
return path
|
||||
|
||||
def update_hashes(cursor, db, obj_storage):
|
||||
cursor.execute("SELECT id, media_id, media_url FROM media WHERE hash IS NULL;")
|
||||
results = cursor.fetchall()
|
||||
total = len(results)
|
||||
print(f"Found {total} files to process for hash updating.")
|
||||
|
||||
for idx, (record_id, media_id, media_url) in enumerate(results, start=1):
|
||||
server_path = normalize_server_path(media_url)
|
||||
local_file = os.path.join(TEMP_DIR, os.path.basename(server_path))
|
||||
|
||||
if not os.path.exists(local_file):
|
||||
obj_storage.DownloadFile(storage_path=server_path, download_path=TEMP_DIR)
|
||||
|
||||
filehash = calculate_file_hash(local_file)
|
||||
cursor.execute("UPDATE media SET hash = %s WHERE id = %s;", (filehash, record_id))
|
||||
db.commit()
|
||||
print(f"[{idx}/{total}] {media_id}: {filehash}, Rows affected: {cursor.rowcount}")
|
||||
|
||||
def update_dimensions(cursor, db, obj_storage):
|
||||
cursor.execute("SELECT id, media_id, media_url FROM media WHERE width = 0 OR height = 0;")
|
||||
results = cursor.fetchall()
|
||||
total = len(results)
|
||||
print(f"Found {total} files to process for dimensions updating.")
|
||||
|
||||
for idx, (record_id, media_id, media_url) in enumerate(results, start=1):
|
||||
server_path = normalize_server_path(media_url)
|
||||
local_file = os.path.join(CACHE_DIR, os.path.basename(server_path))
|
||||
|
||||
if not os.path.exists(local_file):
|
||||
obj_storage.DownloadFile(storage_path=server_path, download_path=CACHE_DIR)
|
||||
|
||||
# Optionally, you could get the media type if needed:
|
||||
width, height = get_media_dimensions(local_file)
|
||||
|
||||
if width == 0 or height == 0:
|
||||
print(f"Error getting dimensions for {media_url}")
|
||||
continue
|
||||
|
||||
cursor.execute("UPDATE media SET width = %s, height = %s WHERE id = %s;", (width, height, record_id))
|
||||
db.commit()
|
||||
print(f"[{idx}/{total}] {media_id}: width: {width}, height: {height}, Rows affected: {cursor.rowcount}")
|
||||
|
||||
def update_file_size(cursor, db, obj_storage):
|
||||
cursor.execute("SELECT id, media_url FROM media WHERE file_size = 0 AND status != 'deleted';")
|
||||
results = cursor.fetchall()
|
||||
total = len(results)
|
||||
print(f"Found {total} files to process for file size updating.")
|
||||
|
||||
for idx, (record_id, media_url) in enumerate(results, start=1):
|
||||
server_path = normalize_server_path(media_url)
|
||||
local_file = os.path.join(TEMP_DIR, os.path.basename(server_path))
|
||||
|
||||
if not os.path.exists(local_file):
|
||||
obj_storage.DownloadFile(storage_path=server_path, download_path=TEMP_DIR)
|
||||
|
||||
file_size = os.path.getsize(local_file)
|
||||
cursor.execute("UPDATE media SET file_size = %s WHERE id = %s;", (file_size, record_id))
|
||||
db.commit()
|
||||
print(f"[{idx}/{total}] {media_url}: {file_size} bytes, Rows affected: {cursor.rowcount}")
|
||||
|
||||
def update_phash(cursor, db, obj_storage):
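# Backfill perceptual hashes for image rows that lack one; pHash makes near-duplicate images detectable.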
|
||||
generate_for = 'media_url'
|
||||
media_type = 'image'
|
||||
cursor.execute(f"SELECT id, {generate_for} FROM media WHERE media_type = %s AND phash IS NULL AND status != 'deleted';", [media_type])
|
||||
medias = cursor.fetchall()
|
||||
total = len(medias)
|
||||
print(f"Found {total} files to process for pHash updating.")
|
||||
|
||||
for idx, (record_id, media_url) in enumerate(medias, start=1):
|
||||
server_path = normalize_server_path(media_url, replace_all=False)
|
||||
local_file = os.path.join(TEMP_DIR, os.path.basename(server_path))
|
||||
|
||||
if not os.path.exists(local_file):
|
||||
obj_storage.DownloadFile(storage_path=server_path, download_path=TEMP_DIR)
|
||||
|
||||
phash = generate_phash(local_file)
|
||||
if not phash:
|
||||
print(f"Error generating pHash for {local_file}")
|
||||
continue
|
||||
|
||||
cursor.execute("UPDATE media SET phash = %s WHERE id = %s", [phash, record_id])
|
||||
db.commit()
|
||||
print(f"[{idx}/{total}] Processed record {record_id} with pHash: {phash}")
|
||||
|
||||
def update_user_ids(cursor, db):
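# Fill in missing Instagram user_ids: reuse the single distinct user_id already stored for that username; usernames with zero or multiple candidates are skipped.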
|
||||
cursor.execute("SELECT DISTINCT username FROM media WHERE user_id IS NULL AND platform = 'instagram';")
|
||||
usernames = [username[0] for username in cursor.fetchall()]
|
||||
total = len(usernames)
|
||||
print(f"Found {total} usernames to process for user_id updating.")
|
||||
|
||||
for idx, username in enumerate(usernames, start=1):
|
||||
print(f"[{idx}/{total}] Username: {username}")
|
||||
|
||||
cursor.execute("SELECT DISTINCT user_id FROM media WHERE username = %s AND user_id IS NOT NULL;", [username])
|
||||
possible_user_ids = [user_id for user_id, in cursor.fetchall()]
|
||||
|
||||
if len(possible_user_ids) == 0:
|
||||
print(f"No user_id found for {username}")
|
||||
continue
|
||||
|
||||
if len(possible_user_ids) > 1:
|
||||
print(f"Multiple user_ids found for {username}: {possible_user_ids}")
|
||||
continue
|
||||
|
||||
user_id = possible_user_ids[0]
|
||||
cursor.execute("UPDATE media SET user_id = %s WHERE username = %s AND user_id IS NULL;", [user_id, username])
|
||||
db.commit()
|
||||
print(f"[{idx}/{total}] Updated user_id for {username}, Rows affected: {cursor.rowcount}")
|
||||
|
||||
def main():
|
||||
obj_storage = config.get_storage()
|
||||
db, cursor = config.gen_connection()
|
||||
|
||||
update_hashes(cursor, db, obj_storage)
|
||||
update_dimensions(cursor, db, obj_storage)
|
||||
update_file_size(cursor, db, obj_storage)
|
||||
update_phash(cursor, db, obj_storage)
|
||||
update_user_ids(cursor, db)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@ -0,0 +1,78 @@
|
||||
import os
|
||||
import config
|
||||
import logging
|
||||
|
||||
# Set up logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
handlers=[
|
||||
logging.StreamHandler() # or use logging.FileHandler('script.log')
|
||||
]
|
||||
)
|
||||
|
||||
# Prepare database connection
|
||||
db, cursor = config.gen_connection()
|
||||
|
||||
# Ensure local temp directory exists
|
||||
TEMP_DIR = "temp"
|
||||
os.makedirs(TEMP_DIR, exist_ok=True)
|
||||
|
||||
URL_PREFIX = "https://cdn.altpins.com/"
|
||||
|
||||
# Retrieve records from database
|
||||
query = """
|
||||
SELECT id, date, media_url, platform, username, hash
|
||||
FROM media
|
||||
WHERE media_url like '%none%';
|
||||
"""
|
||||
cursor.execute(query)
|
||||
rows = cursor.fetchall()
|
||||
|
||||
# Initialize Bunny.net Storage (credentials redacted)
|
||||
obj_storage = config.get_custom_storage()
|
||||
|
||||
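# For each row whose URL still contains 'none', rebuild the filename from the stored hash, move the file to its new storage path, and update the DB record.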
count = 0
|
||||
total = len(rows)
|
||||
for row in rows:
|
||||
count += 1
|
||||
pin_id, date, media_url, platform, username, file_hash = row
|
||||
logging.info(f"[{count}/{total}] Processing screenshot ID: {pin_id}")
|
||||
|
||||
serverPath = media_url.replace(URL_PREFIX, "").split("?")[0]
|
||||
|
||||
filename = os.path.basename(serverPath)
|
||||
filename = filename.replace("none", file_hash).replace("None", file_hash)
|
||||
|
||||
filepath = os.path.join(TEMP_DIR, filename)
|
||||
|
||||
# 2. Build date components (note: formatted_date is not used in the new path below)
|
||||
year = date.year
|
||||
month = str(date.month).zfill(2)
|
||||
day = str(date.day).zfill(2)
|
||||
formatted_date = os.path.join(str(year), month, day)
|
||||
|
||||
# Build the new storage path under the user's stories folder
|
||||
newPath = os.path.join("media", "stories", username, filename)
|
||||
new_media_url = f"{URL_PREFIX}{newPath}"
|
||||
|
||||
# 3. Move file to new path
|
||||
logging.info(f"Moving screenshot from {serverPath} to {newPath}")
|
||||
status = obj_storage.MoveFile(serverPath, newPath)
|
||||
|
||||
if status['status'] != 'success':
|
||||
logging.error(f"Failed to move file {serverPath} to {newPath}. Error: {status.get('msg', status['status'])}")
|
||||
continue
|
||||
|
||||
# 4. Update DB
|
||||
logging.info(f"Updating DB record {pin_id} to new URL\n{new_media_url}\nhttps://altpins.com/pin/{pin_id}")
|
||||
cursor.execute("UPDATE media SET media_url = %s WHERE id = %s", [new_media_url, pin_id])
|
||||
db.commit()
|
||||
|
||||
logging.info(f"Successfully processed screenshot {pin_id}")
|
||||
|
||||
|
||||
# Close the DB connection
|
||||
cursor.close()
|
||||
db.close()
|
||||
logging.info("All done!")
|
||||
@ -1 +0,0 @@
|
||||
gAAAAABmRUff7c9t9gngWj_2cwvaTBrUDJ_JUyYVUfG-p3SvDV7qOSHddJ4eHADiJeRtJNtY9UxkohSB5I1MmLahAb_hxxwIVA==
|
||||
@ -0,0 +1,41 @@
|
||||
from storysave_api import get_hd_profile_picture
|
||||
import config, funcs, os, time
|
||||
|
||||
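# Hashes of Instagram's default empty profile picture: downloads whose pHash matches known_phashes are deleted below (known_hashes lists the corresponding file hashes).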
known_phashes = {'e7c51a904b69d366': 'default empty profile picture',
|
||||
'cb3ce46194c335dc': 'default empty profile picture',
|
||||
}
|
||||
|
||||
known_hashes = {
|
||||
'09c3cf34d4f117d99fa6285f4bfd3a0d888d7ab2cbca665b16097f6b93ca0de6' : 'default empty profile picture',
|
||||
'2b9c0914d8f3f0aa6cf86705df70b7b21e9ca2f9013a346463788e7cebd0158f' : 'default empty profile picture',
|
||||
}
|
||||
|
||||
db, cursor = config.gen_connection()
|
||||
|
||||
cursor.execute("SELECT DISTINCT username, user_id, favorite FROM following WHERE user_id IS NOT NULL AND platform = 'instagram' ORDER BY favorite DESC;")
|
||||
usernames = cursor.fetchall()
|
||||
|
||||
for username, user_id, favorite in usernames:
|
||||
profilepicurl = get_hd_profile_picture(user_id=user_id)
|
||||
if not profilepicurl:
|
||||
print(f'Failed for {username}')
|
||||
continue
|
||||
|
||||
filename = os.path.basename(profilepicurl).split('?')[0]
|
||||
user_dir = os.path.join('media', 'instagram', 'profile', username)
|
||||
filepath = os.path.join(user_dir, filename)
|
||||
|
||||
filepath = funcs.download_file(profilepicurl, filepath)
|
||||
|
||||
if not filepath:
|
||||
continue
|
||||
|
||||
phash = funcs.generate_phash(filepath)
|
||||
if phash in known_phashes:
|
||||
print(f"Profile picture for {username} is the default empty profile picture.")
|
||||
os.remove(filepath)
|
||||
continue
|
||||
|
||||
print(f"Downloaded profile picture for {username}.")
|
||||
|
||||
time.sleep(1)
|
||||
@ -0,0 +1,91 @@
|
||||
from watchdog.events import FileSystemEventHandler
|
||||
from watchdog.observers import Observer
|
||||
import shutil
|
||||
import os
|
||||
from funcs import get_media_dimensions
|
||||
|
||||
media_dir = "media"
|
||||
stories_dir = os.path.join(media_dir, "stories")
|
||||
posts_dir = os.path.join(media_dir, "posts")
|
||||
|
||||
os.makedirs(stories_dir, exist_ok=True)
|
||||
os.makedirs(posts_dir, exist_ok=True)
|
||||
|
||||
|
||||
def is_story(width, height, tolerance=0.02):
|
||||
if width == 0 or height == 0:
|
||||
return False
|
||||
ratio = min(width, height) / max(width, height)
|
||||
return abs(ratio - (9 / 16)) <= (9 / 16 * tolerance)
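# e.g. is_story(1080, 1920) -> True (exact 9:16); is_story(1080, 1350) -> False (4:5 feed post)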
|
||||
|
||||
|
||||
def determine_post_type(filepath):
|
||||
lower = filepath.lower()
|
||||
if "posts" in lower:
|
||||
return "posts"
|
||||
try:
|
||||
width, height = get_media_dimensions(filepath)
|
||||
except Exception as e:
|
||||
print(f"Error getting dimensions for {filepath}: {e}")
|
||||
return None
|
||||
return "stories" if is_story(width, height) else "posts"
|
||||
|
||||
|
||||
class DownloadHandler(FileSystemEventHandler):
|
||||
def process_file(self, file_path):
|
||||
file = os.path.basename(file_path)
|
||||
|
||||
# Ignore incomplete or weird temp names
|
||||
if "crdownload" in file or file.count("~") != 3:
|
||||
return
|
||||
|
||||
if not os.path.exists(file_path):
|
||||
return
|
||||
|
||||
post_type = determine_post_type(file_path)
|
||||
if post_type == "posts":
|
||||
dest_dir = posts_dir
|
||||
elif post_type == "stories":
|
||||
dest_dir = stories_dir
|
||||
else:
|
||||
print(f"Could not determine post type for {file}. Skipping...")
|
||||
return
|
||||
|
||||
output_path = os.path.join(dest_dir, file)
|
||||
|
||||
if os.path.exists(output_path):
|
||||
print(f"File already exists {output_path}. Removing...")
|
||||
os.remove(file_path)
|
||||
return
|
||||
|
||||
shutil.move(file_path, output_path)
|
||||
print(f"Moved {file_path} → {output_path}")
|
||||
|
||||
def on_created(self, event):
|
||||
if not event.is_directory:
|
||||
self.process_file(event.src_path)
|
||||
|
||||
def on_moved(self, event):
|
||||
if not event.is_directory:
|
||||
self.process_file(event.dest_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
download_path = os.path.join(os.path.expanduser("~"), "Downloads")
|
||||
event_handler = DownloadHandler()
|
||||
|
||||
# Initial scan for files already in Downloads
|
||||
for f in os.listdir(download_path):
|
||||
full_path = os.path.join(download_path, f)
|
||||
if os.path.isfile(full_path):
|
||||
event_handler.process_file(full_path)
|
||||
|
||||
observer = Observer()
|
||||
observer.schedule(event_handler, download_path, recursive=False)
|
||||
observer.start()
|
||||
|
||||
try:
|
||||
observer.join()
|
||||
except KeyboardInterrupt:
|
||||
observer.stop()
|
||||
observer.join()
|
||||
@ -1,37 +0,0 @@
|
||||
{
|
||||
"uuids": {
|
||||
"phone_id": "53c03380-c7b9-44ab-b10e-1b585e8e428b",
|
||||
"uuid": "2a9c7a37-c902-4332-8a32-1fd903acd991",
|
||||
"client_session_id": "2b0a28f0-86c4-4cd4-b044-c4effd953cc9",
|
||||
"advertising_id": "d330f041-56f1-4f45-906d-d3740717f0b1",
|
||||
"android_device_id": "android-df5a2572f9762ff7",
|
||||
"request_id": "35de6403-02e2-46b4-a02c-403cea1fe9c6",
|
||||
"tray_session_id": "ed1874f7-cb8d-4ed6-bea8-13c53b9c3d67"
|
||||
},
|
||||
"mid": "ZwOR_QABAAGgkEbeoytBO3EL-dgC",
|
||||
"ig_u_rur": null,
|
||||
"ig_www_claim": null,
|
||||
"authorization_data": {
|
||||
"ds_user_id": "1587432849",
|
||||
"sessionid": "1587432849%3Ak5q9QqmHia2WWq%3A18%3AAYcDFsLKMiFCtVhCcqYl7KZrFLw5IOSgf1pNfQZYLA"
|
||||
},
|
||||
"cookies": {},
|
||||
"last_login": 1728287241.130515,
|
||||
"device_settings": {
|
||||
"app_version": "269.0.0.18.75",
|
||||
"android_version": 26,
|
||||
"android_release": "8.0.0",
|
||||
"dpi": "480dpi",
|
||||
"resolution": "1080x1920",
|
||||
"manufacturer": "OnePlus",
|
||||
"device": "devitron",
|
||||
"model": "6T Dev",
|
||||
"cpu": "qcom",
|
||||
"version_code": "314665256"
|
||||
},
|
||||
"user_agent": "Instagram 269.0.0.18.75 Android (26/8.0.0; 480dpi; 1080x1920; OnePlus; 6T Dev; devitron; qcom; en_US; 314665256)",
|
||||
"country": "US",
|
||||
"country_code": 1,
|
||||
"locale": "en_US",
|
||||
"timezone_offset": -14400
|
||||
}
|
||||
File diff suppressed because it is too large
@ -1,96 +0,0 @@
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
import json
|
||||
|
||||
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"}
|
||||
|
||||
def get_data(username):
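# Fetch the public Snapchat profile page and return the JSON blob embedded in its __NEXT_DATA__ script tag.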
|
||||
url = f"https://www.snapchat.com/add/{username}"
|
||||
response = requests.get(url, headers=headers)
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
data_script = soup.find("script", id="__NEXT_DATA__")
|
||||
if not data_script:
|
||||
print(f"No data found for {username}.")
|
||||
return None
|
||||
data = json.loads(data_script.string)
|
||||
return data
|
||||
|
||||
def get_all_users_data(usernames):
|
||||
all_data = {}
|
||||
|
||||
# Define a helper function for threading
|
||||
def fetch_data(username):
|
||||
return username, get_data(username)
|
||||
|
||||
# Use ThreadPoolExecutor for concurrent fetching
|
||||
with ThreadPoolExecutor() as executor:
|
||||
futures = {executor.submit(fetch_data, username): username for username in usernames}
|
||||
|
||||
for future in as_completed(futures):
|
||||
username = futures[future]
|
||||
try:
|
||||
username, data = future.result()
|
||||
all_data[username] = data
|
||||
except Exception as e:
|
||||
print(f"Error fetching data for {username}: {e}")
|
||||
all_data[username] = None
|
||||
|
||||
return all_data
|
||||
|
||||
def parse_stories(stories):
|
||||
parsed_stories = []
|
||||
|
||||
for story in stories:
|
||||
parsed_story = parse_story(story)
|
||||
parsed_stories.append(parsed_story)
|
||||
|
||||
return parsed_stories
|
||||
|
||||
def get_stories(data):
|
||||
try:
|
||||
stories = data['props']['pageProps']['story']['snapList']
|
||||
return parse_stories(stories)
|
||||
except KeyError:
|
||||
return []
|
||||
|
||||
def get_highlights(data):
|
||||
highlights = []
|
||||
page_props = data.get('props', {}).get('pageProps', {})
|
||||
# Possible keys that might contain highlights
|
||||
possible_highlight_keys = ['curatedHighlights', 'savedHighlights', 'highlights']
|
||||
for key in possible_highlight_keys:
|
||||
highlight_data = page_props.get(key, [])
|
||||
if highlight_data:
|
||||
highlights.extend(highlight_data)
|
||||
return highlights
|
||||
|
||||
def parse_story(story):
|
||||
original_snap_id = story.get('snapId', {}).get('value', '')
|
||||
snap_url = story.get('snapUrls', {}).get('mediaUrl', '')
|
||||
timestamp = story.get('timestampInSec', {}).get('value', '')
|
||||
|
||||
return {
|
||||
"original_snap_id": original_snap_id,
|
||||
"snap_id": get_snap_id(snap_url),
|
||||
"url": snap_url,
|
||||
"timestamp": timestamp,
|
||||
"platform": "snapchat",
|
||||
"type": "story",
|
||||
}
|
||||
|
||||
def get_snap_id(url):
|
||||
return url.split('/')[-1].split('.')[0]
|
||||
|
||||
def get_highlight_stories(data):
|
||||
stories = []
|
||||
highlights = get_highlights(data)
|
||||
|
||||
for highlight in highlights:
|
||||
snap_list = highlight.get('snapList', [])
|
||||
|
||||
for snap in snap_list:
|
||||
story = parse_story(snap)
|
||||
stories.append(story)
|
||||
|
||||
return stories
|
||||
@ -1,270 +0,0 @@
|
||||
from snapchat import get_stories, get_highlight_stories, get_all_users_data
|
||||
from datetime import datetime
|
||||
from uuid import uuid4
|
||||
import requests
|
||||
import config
|
||||
import funcs
|
||||
import json
|
||||
import cv2
|
||||
import os
|
||||
|
||||
directory = "snapchat"
|
||||
data_directory = "data"
|
||||
|
||||
def find_duplicate_snap(existing_snaps, snap_id, username):
|
||||
"""
|
||||
Find a snap in the existing_snaps list fetched from the database.
|
||||
"""
|
||||
for snap in existing_snaps:
|
||||
if username == snap[2]:
|
||||
if snap_id in snap[1]:
|
||||
return snap
|
||||
return False
|
||||
|
||||
def archive_data(data, username):
|
||||
data_filename = f"{username}~{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"
|
||||
data_filepath = os.path.join(data_directory, data_filename)
|
||||
with open(data_filepath, 'w') as f:
|
||||
f.write(json.dumps(data))
|
||||
|
||||
def get_file_extension(url):
|
||||
response = requests.head(url)
|
||||
if response.status_code != 200:
|
||||
print(f"Failed to access media {url}")
|
||||
return None
|
||||
|
||||
content_type = response.headers.get('Content-Type', '')
|
||||
if 'image' in content_type:
|
||||
return '.jpg'
|
||||
elif 'video' in content_type:
|
||||
return '.mp4'
|
||||
else:
|
||||
print(f"Unknown content type for media {url}")
|
||||
return None
|
||||
|
||||
def extract_file_type(url):
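# The numeric segment in a snap media URL appears to encode the media type: 400/1023 -> image, 1322/1325/1034 -> video.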
|
||||
file_types = {
|
||||
'400': '.jpg',
|
||||
'1322': '.mp4',
|
||||
'1325': '.mp4',
|
||||
'1034': '.mp4',
|
||||
'1023': '.jpg'
|
||||
}
|
||||
|
||||
base_url = url.split("?")[0] # Remove query string
|
||||
|
||||
snap_data = base_url.split('/')[-1]
|
||||
|
||||
# Extract the file type number
|
||||
data_parts = snap_data.split('.')
|
||||
if len(data_parts) > 1:
|
||||
file_type_number = data_parts[1]
|
||||
if file_type_number in file_types:
|
||||
return file_types[file_type_number]
|
||||
else:
|
||||
print(f"Unexpected URL format: {base_url}")
|
||||
return None
|
||||
|
||||
|
||||
def download_media(url, filepath):
|
||||
if os.path.exists(filepath):
|
||||
# File already exists, skip download and return the filepath as if it was downloaded.
|
||||
return filepath
|
||||
|
||||
response = requests.get(url)
|
||||
if response.status_code != 200:
|
||||
print(f"Failed to download media {url}")
|
||||
return None
|
||||
|
||||
with open(filepath, 'wb') as f:
|
||||
f.write(response.content)
|
||||
return filepath
|
||||
|
||||
def get_snapchat_stories():
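# Collect download-ready snaps: fetch public data for every followed Snapchat user, skip snaps already stored, and return the remaining stories sorted oldest-to-newest.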
|
||||
os.makedirs(directory, exist_ok=True)
|
||||
os.makedirs(data_directory, exist_ok=True)
|
||||
|
||||
cursor.execute("SELECT username FROM following WHERE platform = 'snapchat' ORDER BY id DESC")
|
||||
usernames = [row[0] for row in cursor.fetchall()]
|
||||
|
||||
cursor.execute("SELECT id, filename, username FROM media WHERE filename IS NOT NULL AND platform = 'snapchat' ORDER BY id DESC")
|
||||
existing_medias = cursor.fetchall()
|
||||
|
||||
snapchat_users_data = get_all_users_data(usernames)
|
||||
|
||||
ready_stories = []
|
||||
|
||||
for username in usernames:
|
||||
print(f"Getting stories for {username}...")
|
||||
|
||||
data = snapchat_users_data.get(username)
|
||||
if not data:
|
||||
print(f"Failed to get data for {username}. Skipping.")
|
||||
continue
|
||||
|
||||
archive_data(data, username)
|
||||
|
||||
stories = get_stories(data)
|
||||
|
||||
stories.extend(get_highlight_stories(data))
|
||||
|
||||
for story in stories:
|
||||
snap_id = story['snap_id']
|
||||
url = story['url']
|
||||
timestamp = story['timestamp']
|
||||
|
||||
duplicate_snap = find_duplicate_snap(existing_medias, snap_id, username)
|
||||
if duplicate_snap:
|
||||
# Snap already exists in the database
|
||||
continue
|
||||
|
||||
# Determine file extension using HEAD request.
|
||||
extension = extract_file_type(url)
|
||||
if not extension:
|
||||
print(f"Failed to determine file extension for {url}. Skipping.")
|
||||
continue
|
||||
|
||||
filename = f"{username}~{timestamp}~{snap_id}{extension}"
|
||||
filepath = os.path.join(directory, filename)
|
||||
|
||||
media = {
|
||||
'username': username,
|
||||
'timestamp': timestamp,
|
||||
'filepath': filepath,
|
||||
'snap_id': snap_id,
|
||||
'original_snap_id': story['original_snap_id'],
|
||||
'media_url': url,
|
||||
}
|
||||
|
||||
ready_stories.append(media)
|
||||
print(f"Media {snap_id} ready for download.")
|
||||
|
||||
# sort ready_stories by timestamp from oldest to newest
|
||||
ready_stories.sort(key=lambda x: x['timestamp'])
|
||||
|
||||
return ready_stories
|
||||
|
||||
def get_snapchat_files():
|
||||
stories = funcs.get_files(directory)
|
||||
stories = [get_media_data(filepath) for filepath in stories]
|
||||
stories = [story for story in stories if story]
|
||||
return stories
|
||||
|
||||
def main():
|
||||
ready_stories = get_snapchat_stories()
|
||||
stories_from_files = get_snapchat_files()
|
||||
|
||||
ready_stories.extend(stories_from_files)
|
||||
|
||||
download_stories(ready_stories)
|
||||
|
||||
def download_stories(stories):
|
||||
for story in stories:
|
||||
# Download the media
|
||||
filepath = story['filepath']
|
||||
url = story['media_url']
|
||||
filename = os.path.basename(filepath)
|
||||
timestamp = story['timestamp']
|
||||
|
||||
filepath = download_media(url, filepath)
|
||||
print(f"Downloaded {filename} at {timestamp}")
|
||||
|
||||
if not filepath:
|
||||
continue
|
||||
|
||||
story['filepath'] = filepath
|
||||
|
||||
UploadMedia(story)
|
||||
|
||||
def UploadMedia(media):
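# Upload pipeline for a single snap: compute the file hash, dimensions and duration, derive a pHash (from a first-frame thumbnail for videos), push the file to storage, and insert a media row.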
|
||||
username = media['username']
|
||||
timestamp = media['timestamp']
|
||||
filepath = media['filepath']
|
||||
filename = os.path.basename(filepath)
|
||||
snap_id = media['snap_id']
|
||||
original_snap_id = media['original_snap_id']
|
||||
thumbnail_url = None
|
||||
phash = None
|
||||
|
||||
media_type = funcs.get_media_type(filename)
|
||||
|
||||
file_hash = funcs.calculate_file_hash(filepath)
|
||||
|
||||
post_date = datetime.fromtimestamp(int(timestamp))
|
||||
|
||||
width, height = funcs.get_media_dimensions(filepath)
|
||||
|
||||
duration = funcs.get_video_duration(filepath)
|
||||
|
||||
if media_type == 'image':
|
||||
phash = funcs.generate_phash(filepath)
|
||||
elif media_type == 'video':
|
||||
try:
|
||||
thumb_path = generate_thumbnail(filepath)
|
||||
obj_storage.PutFile(thumb_path, f'thumbnails/{filename}')
|
||||
thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{filename}"
|
||||
phash = funcs.generate_phash(thumb_path)
|
||||
os.remove(thumb_path)
|
||||
except:
|
||||
print('Error generating thumbnail. Skipping...')
|
||||
return False
|
||||
|
||||
server_path = f'media/snaps/{username}/{filename}'
|
||||
file_url = f"https://storysave.b-cdn.net/{server_path}"
|
||||
|
||||
obj_storage.PutFile(filepath, server_path)
|
||||
|
||||
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, hash, filename, duration, thumbnail, phash, platform, snap_id, original_snap_id) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
|
||||
values = (username, media_type, file_url, width, height, 'story', post_date, file_hash, filename, duration, thumbnail_url, phash, 'snapchat', snap_id, original_snap_id)
|
||||
|
||||
cursor.execute(query, values)
|
||||
db.commit()
|
||||
print(f'[{cursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
|
||||
|
||||
os.remove(filepath)
|
||||
|
||||
return True
|
||||
|
||||
def generate_thumbnail(filepath):
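# Grab the first frame of the video with OpenCV and save it as a JPEG thumbnail in temp/.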
|
||||
thumb_path = f'temp/{uuid4()}.jpg'
|
||||
cap = cv2.VideoCapture(filepath)
|
||||
ret, frame = cap.read()
|
||||
cv2.imwrite(thumb_path, frame)
|
||||
cap.release()
|
||||
return thumb_path
|
||||
|
||||
def get_media_data(filepath):
|
||||
filename = os.path.basename(filepath)
|
||||
parts = filename.split('~')
|
||||
if len(parts) < 3:
|
||||
return False
|
||||
|
||||
username = parts[0]
|
||||
timestamp = parts[1]
|
||||
snap_id = parts[2]
|
||||
snap_id = os.path.splitext(snap_id)[0]
|
||||
|
||||
data = {'username': username, 'timestamp': timestamp, 'filepath': filepath, 'snap_id': snap_id, 'original_snap_id': None, 'media_url': None}
|
||||
# data = {'username': username, 'timestamp': timestamp, 'filepath': filepath, 'snap_id': None, 'original_snap_id': snap_id, 'media_url': None}
|
||||
|
||||
return data
|
||||
|
||||
def process_snap_ids(filenames):
|
||||
snap_ids = []
|
||||
for filename in filenames:
|
||||
snap_id = filename.split('~')[2]
|
||||
snap_id = os.path.splitext(snap_id)[0]
|
||||
if snap_id not in snap_ids:
|
||||
snap_ids.append(snap_id)
|
||||
|
||||
return snap_ids
|
||||
|
||||
if __name__ == '__main__':
|
||||
print('Starting snappy...')
|
||||
|
||||
db, cursor = config.gen_connection()
|
||||
obj_storage = config.get_storage()
|
||||
|
||||
main()
|
||||
|
||||
print("Processing completed.")
|
||||
@ -1,147 +0,0 @@
|
||||
from datetime import datetime
|
||||
from uuid import uuid4
|
||||
import funcs
|
||||
import config
|
||||
import cv2
|
||||
import os
|
||||
|
||||
|
||||
media_directory = "media/ready_for_upload"
|
||||
platform = "instagram"
|
||||
|
||||
working_directory = os.path.join(media_directory, platform)
|
||||
|
||||
def UploadMedia(media):
|
||||
username = media['username']
|
||||
user_id = media['user_id']
|
||||
filepath = media['filepath']
|
||||
platform = media['platform']
|
||||
|
||||
media_id = media['media_id']
|
||||
|
||||
thumbnail_url = None
|
||||
phash = None
|
||||
|
||||
filename = os.path.basename(filepath)
|
||||
file_extension = os.path.splitext(filename)[1].lower()
|
||||
|
||||
media_type = funcs.get_media_type(filename)
|
||||
if not media_type:
|
||||
print(f'Error determining media type for {filename}. Skipping...')
|
||||
return False
|
||||
|
||||
post_type = funcs.determine_post_type(filepath)
|
||||
if not post_type:
|
||||
print(f'Error determining post type for {filename}. Skipping...')
|
||||
return False
|
||||
|
||||
file_hash = funcs.calculate_file_hash(filepath)
|
||||
|
||||
post_date = datetime.now()
|
||||
|
||||
width, height = funcs.get_media_dimensions(filepath)
|
||||
|
||||
duration = funcs.get_video_duration(filepath)
|
||||
|
||||
if media_type == 'image':
|
||||
phash = funcs.generate_phash(filepath)
|
||||
elif media_type == 'video':
|
||||
try:
|
||||
thumb_path = generate_thumbnail(filepath)
|
||||
obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg') # this might be a problem in case of duplicate hashes
|
||||
thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{file_hash}.jpg"
|
||||
phash = funcs.generate_phash(thumb_path)
|
||||
os.remove(thumb_path)
|
||||
except:
|
||||
print('Error generating thumbnail. Skipping...')
|
||||
return False
|
||||
|
||||
newFilename = f'{file_hash}{file_extension}'
|
||||
server_path = f'media/{post_type}/{username}/{newFilename}'
|
||||
|
||||
file_url = f"https://storysave.b-cdn.net/{server_path}"
|
||||
|
||||
obj_storage.PutFile(filepath, server_path) # slow: synchronous upload of the full file
|
||||
|
||||
post_type = 'story' if post_type == 'stories' else 'post'
|
||||
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform, media_id) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
|
||||
values = (username, media_type, file_url, width, height, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, platform, media_id)
|
||||
|
||||
newCursor.execute(query, values) # slower
|
||||
newDB.commit()
|
||||
print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
|
||||
|
||||
os.remove(filepath)
|
||||
|
||||
return True
|
||||
|
||||
def generate_thumbnail(filepath):
|
||||
thumb_path = f'temp/{uuid4()}.jpg'
|
||||
cap = cv2.VideoCapture(filepath)
|
||||
ret, frame = cap.read()
|
||||
cv2.imwrite(thumb_path, frame)
|
||||
cap.release()
|
||||
return thumb_path
|
||||
|
||||
def get_user_id(username):
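# Look up a previously seen user_id for this username; existing_users is loaded from the database in the __main__ block.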
|
||||
username = username.lower()
|
||||
if username in existing_users:
|
||||
return existing_users[username]
|
||||
|
||||
return None
|
||||
|
||||
def get_media(folder_path):
|
||||
medias = []
|
||||
|
||||
user_folders = os.listdir(folder_path)
|
||||
for user_folder in user_folders:
|
||||
user_folder_path = os.path.join(folder_path, user_folder)
|
||||
|
||||
if not os.path.isdir(user_folder_path):
|
||||
continue
|
||||
|
||||
files = os.listdir(user_folder_path)
|
||||
for filename in files:
|
||||
filepath = os.path.join(folder_path, user_folder, filename)
|
||||
|
||||
# skip file if its hidden
|
||||
if filename.startswith('.'):
|
||||
continue
|
||||
|
||||
try:
|
||||
media_id = filename.split('.')[0]
|
||||
media_id = int(media_id)
|
||||
except:
|
||||
media_id = None
|
||||
|
||||
media = {
|
||||
'username': user_folder,
|
||||
'filepath': filepath,
|
||||
'user_id': get_user_id(user_folder),
|
||||
'media_id': media_id,
|
||||
'platform': platform
|
||||
}
|
||||
|
||||
medias.append(media)
|
||||
|
||||
return medias
|
||||
|
||||
def dump_instagram(folder_path):
|
||||
medias = get_media(folder_path)
|
||||
|
||||
for media in medias:
|
||||
UploadMedia(media)
|
||||
|
||||
if __name__ == '__main__':
|
||||
print('Starting processing...')
|
||||
|
||||
newDB, newCursor = config.gen_connection()
|
||||
|
||||
obj_storage = config.get_storage()
|
||||
|
||||
newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
|
||||
existing_users = {user[0].lower(): user[1].lower() for user in newCursor.fetchall()}
|
||||
|
||||
dump_instagram(working_directory)
|
||||
|
||||
print("Processing completed.")
|
||||
@ -1,142 +0,0 @@
|
||||
from datetime import datetime
|
||||
import config
|
||||
import funcs
|
||||
import cv2
|
||||
import os
|
||||
|
||||
directory = 'storysaver'
|
||||
|
||||
def UploadMedia(media):
|
||||
media_id = media['media_id']
|
||||
username = media['username']
|
||||
post_date = media['timestamp']
|
||||
user_id = media['user_id']
|
||||
filepath = media['filepath']
|
||||
highlight_id = media['highlight_id']
|
||||
post_type = media['post_type']
|
||||
thumbnail_url = None
|
||||
phash = None
|
||||
|
||||
if media_id and int(media_id) in existing_files:
|
||||
print('Duplicate file detected. Removing...')
|
||||
os.remove(filepath)
|
||||
return True
|
||||
|
||||
filename = os.path.basename(filepath)
|
||||
file_extension = os.path.splitext(filename)[1].lower()
|
||||
|
||||
media_type = funcs.get_media_type(filename)
|
||||
|
||||
file_hash = funcs.calculate_file_hash(filepath)
|
||||
|
||||
width, height = funcs.get_media_dimensions(filepath)
|
||||
|
||||
duration = funcs.get_video_duration(filepath)
|
||||
|
||||
if media_type == 'video':
|
||||
try:
|
||||
thumbPath = f'temp/{media_id}.jpg'
|
||||
cap = cv2.VideoCapture(filepath)
|
||||
ret, frame = cap.read()
|
||||
cv2.imwrite(thumbPath, frame)
|
||||
cap.release()
|
||||
obj_storage.PutFile(thumbPath, f'thumbnails/{media_id}.jpg') # slower
|
||||
thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{media_id}.jpg"
|
||||
phash = funcs.generate_phash(thumbPath)
|
||||
os.remove(thumbPath)
|
||||
except:
|
||||
print('Error generating thumbnail. Skipping...')
|
||||
return False
|
||||
elif media_type == 'image':
|
||||
phash = funcs.generate_phash(filepath)
|
||||
|
||||
if media_id:
|
||||
newFilename = f'{media_id}{file_extension}'
|
||||
else:
|
||||
newFilename = f'{file_hash}{file_extension}'
|
||||
|
||||
server_path = f'media/{post_type}/{username}/{newFilename}'
|
||||
|
||||
file_url = f"https://storysave.b-cdn.net/{server_path}"
|
||||
|
||||
obj_storage.PutFile(filepath, server_path) # slow: synchronous upload of the full file
|
||||
|
||||
if highlight_id:
|
||||
newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)", (highlight_id, user_id, media_id))
|
||||
newDB.commit()
|
||||
print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')
|
||||
|
||||
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
|
||||
values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, 'instagram')
|
||||
|
||||
newCursor.execute(query, values) # slower
|
||||
newDB.commit()
|
||||
print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
|
||||
|
||||
os.remove(filepath)
|
||||
|
||||
return True
|
||||
|
||||
def get_user_id(username):
|
||||
username = username.lower()
|
||||
if username in existing_users:
|
||||
return existing_users[username]
|
||||
|
||||
return None
|
||||
|
||||
def get_media():
|
||||
medias = []
|
||||
post_types = {
|
||||
'posts': 'post',
|
||||
'stories': 'story',
|
||||
'profile': 'profile',
|
||||
}
|
||||
|
||||
for post_type in os.listdir('media'):
|
||||
users = os.listdir(f'media/{post_type}')
|
||||
for user in users:
|
||||
user_path = f'media/{post_type}/{user}'
|
||||
for filename in os.listdir(user_path):
|
||||
data = {}
|
||||
filepath = os.path.join(user_path, filename)
|
||||
|
||||
data['post_type'] = post_types[post_type]
|
||||
data['username'] = user
|
||||
data['timestamp'] = filename.split('__')[-1].split('.')[0] if 'com.instagram.android__' in filename else datetime.now()
|
||||
if 'com.instagram.android__' in filename:
|
||||
data['timestamp'] = datetime.strptime(data['timestamp'], '%Y%m%d%H%M%S%f')
|
||||
data['filepath'] = filepath
|
||||
data['media_id'] = None
|
||||
data['user_id'] = get_user_id(data['username'])
|
||||
data['highlight_id'] = None
|
||||
medias.append(data)
|
||||
|
||||
return medias
|
||||
|
||||
def dump_instagram():
|
||||
medias = get_media()
|
||||
|
||||
for media in medias:
|
||||
UploadMedia(media)
|
||||
existing_files.append(media['media_id'])
|
||||
|
||||
if __name__ == '__main__':
|
||||
print('Starting processing...')
|
||||
|
||||
if not os.listdir(directory):
|
||||
print('No files to process. Exiting...')
|
||||
exit()
|
||||
|
||||
newDB, newCursor = config.gen_connection()
|
||||
|
||||
obj_storage = config.get_storage()
|
||||
|
||||
newCursor.execute("SELECT media_id FROM media WHERE media_id IS NOT NULL")
|
||||
existing_files = [image[0] for image in newCursor.fetchall()]
|
||||
|
||||
newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
|
||||
existing_users = {user[0].lower(): user[1].lower() for user in newCursor.fetchall()}
|
||||
|
||||
dump_instagram()
|
||||
|
||||
print("Processing completed.")
|
||||
@ -1,140 +0,0 @@
|
||||
from datetime import datetime
|
||||
from uuid import uuid4
|
||||
import funcs
|
||||
import config
|
||||
import cv2
|
||||
import os
|
||||
|
||||
directory = 'processed_tiktoks'
|
||||
|
||||
def UploadMedia(media):
|
||||
platform = 'TikTok'
|
||||
username = media['username']
|
||||
filepath = media['filepath']
|
||||
file_size = os.path.getsize(filepath)
|
||||
thumbnail_url = None
|
||||
phash = None
|
||||
|
||||
filename = os.path.basename(filepath)
|
||||
file_extension = os.path.splitext(filename)[1].lower()
|
||||
|
||||
media_type = funcs.get_media_type(filename)
|
||||
if not media_type:
|
||||
print(f'Error determining media type for {filename}. Skipping...')
|
||||
return False
|
||||
|
||||
post_type = funcs.determine_post_type(filepath)
|
||||
if not post_type:
|
||||
print(f'Error determining post type for {filename}. Skipping...')
|
||||
return False
|
||||
|
||||
file_hash = funcs.calculate_file_hash(filepath)
|
||||
if file_hash in existing_hashes:
|
||||
print(f'File {filename} already exists. Skipping...')
|
||||
return False
|
||||
|
||||
post_date = datetime.now()
|
||||
|
||||
width, height = funcs.get_media_dimensions(filepath)
|
||||
|
||||
duration = funcs.get_video_duration(filepath)
|
||||
|
||||
if media_type == 'image':
|
||||
phash = funcs.generate_phash(filepath)
|
||||
elif media_type == 'video':
|
||||
try:
|
||||
thumb_path = generate_thumbnail(filepath)
|
||||
obj_storage.PutFile(thumb_path, f'thumbnails/{file_hash}.jpg') # this might be a problem in case of duplicate hashes
|
||||
thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{file_hash}.jpg"
|
||||
phash = funcs.generate_phash(thumb_path)
|
||||
os.remove(thumb_path)
|
||||
except:
|
||||
print('Error generating thumbnail. Skipping...')
|
||||
return False
|
||||
|
||||
newFilename = f'{file_hash}{file_extension}'
|
||||
server_path = f'media/tiktoks/{username}/{newFilename}'
|
||||
|
||||
file_url = f"https://storysave.b-cdn.net/{server_path}"
|
||||
|
||||
obj_storage.PutFile(filepath, server_path) # slow: synchronous upload of the full file
|
||||
|
||||
post_type = 'story' if post_type == 'stories' else 'post'
|
||||
query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, post_type, date, hash, filename, duration, thumbnail, phash, platform, file_size) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
|
||||
values = (username, media_type, file_url, width, height, post_type, post_date, file_hash, filename, duration, thumbnail_url, phash, platform, file_size)
|
||||
|
||||
newCursor.execute(query, values) # slower
|
||||
newDB.commit()
|
||||
print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')
|
||||
|
||||
os.remove(filepath)
|
||||
|
||||
return True
|
||||
|
||||
def generate_thumbnail(filepath):
|
||||
thumb_path = f'temp/{uuid4()}.jpg'
|
||||
cap = cv2.VideoCapture(filepath)
|
||||
ret, frame = cap.read()
|
||||
cv2.imwrite(thumb_path, frame)
|
||||
cap.release()
|
||||
return thumb_path
|
||||
|
||||
def get_media_data(filepath):
|
||||
filename = os.path.basename(filepath)
|
||||
parts = filename.split('~')
|
||||
|
||||
if len(parts) == 3:
|
||||
username, title, tiktok_id = parts
|
||||
elif len(parts) == 2:
|
||||
username, title = parts
|
||||
tiktok_id = None
|
||||
else:
|
||||
return False
|
||||
|
||||
data = {'username': username, 'filepath': filepath, 'tiktok_id': tiktok_id, 'title': title}
|
||||
|
||||
return data
|
||||
|
||||
def get_media(folder_path):
|
||||
medias = []
|
||||
|
||||
users = os.listdir(folder_path)
|
||||
for user in users:
|
||||
user_folder = os.path.join(folder_path, user)
|
||||
if not os.path.isdir(user_folder):
|
||||
print(f"Skipping {user}")
|
||||
continue
|
||||
|
||||
files = os.listdir(user_folder)
|
||||
for filename in files:
|
||||
filepath = os.path.join(user_folder, filename)
|
||||
|
||||
data = get_media_data(filepath)
|
||||
if data:
|
||||
medias.append(data)
|
||||
|
||||
return medias
|
||||
|
||||
def dump_instagram(folder_path):
|
||||
medias = get_media(folder_path)
|
||||
|
||||
for media in medias:
|
||||
UploadMedia(media)
|
||||
|
||||
if __name__ == '__main__':
|
||||
print('Starting processing...')
|
||||
|
||||
if not os.listdir(directory):
|
||||
print('No files to process. Exiting...')
|
||||
exit()
|
||||
|
||||
newDB, newCursor = config.gen_connection()
|
||||
|
||||
obj_storage = config.get_storage()
|
||||
|
||||
newCursor.execute("SELECT hash FROM media WHERE hash IS NOT NULL AND platform = 'TikTok'")
|
||||
existing_hashes = [row[0] for row in newCursor.fetchall()]
|
||||
|
||||
dump_instagram(directory)
|
||||
|
||||
print("Processing completed.")
|
||||
@ -1,58 +0,0 @@
|
||||
from uuid import uuid4
|
||||
import uuid
|
||||
import os
|
||||
|
||||
def is_valid_uuid(uuid_to_test, version=4):
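# Round-trip check: the string is a valid UUID only if parsing and re-serializing it reproduces the original text.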
|
||||
try:
|
||||
uuid_obj = uuid.UUID(uuid_to_test, version=version)
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
return str(uuid_obj) == uuid_to_test
|
||||
|
||||
source_dir = 'tiktoks/'
|
||||
processed_dir = 'processed_tiktoks'
|
||||
|
||||
os.makedirs(processed_dir, exist_ok=True)
|
||||
|
||||
users = os.listdir(source_dir)
|
||||
|
||||
for user in users:
|
||||
user_dir = os.path.join(source_dir, user)
|
||||
if not os.path.isdir(user_dir):
|
||||
print(f"Skipping {user}")
|
||||
continue
|
||||
|
||||
for file in os.listdir(user_dir):
|
||||
filename = os.path.splitext(file)[0]
|
||||
filepath = os.path.join(user_dir, file)
|
||||
file_ext = os.path.splitext(file)[1]
|
||||
|
||||
tiktok_id = str(uuid4())
|
||||
username = user
|
||||
|
||||
if is_valid_uuid(filename):
|
||||
title = ''
|
||||
tiktok_id = filename
|
||||
elif 'masstik' in file or 'masstiktok' in file:
|
||||
|
||||
title = filename.split('_')[-1]
|
||||
else:
|
||||
title = filename
|
||||
|
||||
|
||||
print("="*100)
|
||||
title = title.encode('utf-8', 'ignore').decode('utf-8')
|
||||
print(f"Username: {username}\nTitle: {title}")
|
||||
|
||||
new_filename = f"{username}~{title}~{tiktok_id}{file_ext}"
|
||||
new_filepath = os.path.join(processed_dir, username, new_filename)
|
||||
|
||||
os.makedirs(os.path.dirname(new_filepath), exist_ok=True)
|
||||
if not os.path.exists(new_filepath):
|
||||
os.rename(filepath, new_filepath)
|
||||
print(f"Renamed {file} to {new_filepath}")
|
||||
else:
|
||||
print("File with the same name already exists. Renaming aborted.")
|
||||
|
||||
print("="*100)
|
||||
@ -1,38 +0,0 @@
|
||||
from watchdog.events import FileSystemEventHandler
|
||||
from watchdog.observers import Observer
|
||||
import shutil
|
||||
import time
|
||||
import os
|
||||
|
||||
class DownloadHandler(FileSystemEventHandler):
|
||||
def process_file(self, file_path):
|
||||
file = os.path.basename(file_path)
|
||||
if 'crdownload' not in file and file.count('~') == 3:
|
||||
print(f'Moving {file}...')
|
||||
outputPath = os.path.join('storysaver', file)
|
||||
try:
|
||||
shutil.move(file_path, outputPath)
|
||||
except Exception as e:
|
||||
print(f'Failed to move file: {e}')
|
||||
|
||||
def on_created(self, event):
|
||||
if not event.is_directory and 'crdownload' not in event.src_path:
|
||||
self.process_file(event.src_path)
|
||||
|
||||
def on_moved(self, event):
|
||||
if not event.is_directory and 'crdownload' not in event.dest_path:
|
||||
self.process_file(event.dest_path)
|
||||
|
||||
if __name__ == "__main__":
|
||||
downloadPath = os.path.join(os.path.expanduser('~'), 'Downloads')
|
||||
event_handler = DownloadHandler()
|
||||
observer = Observer()
|
||||
observer.schedule(event_handler, downloadPath, recursive=False)
|
||||
observer.start()
|
||||
|
||||
try:
|
||||
while True:
|
||||
time.sleep(1) # Add a 1-second sleep to reduce CPU usage
|
||||
except KeyboardInterrupt:
|
||||
observer.stop()
|
||||
observer.join()
|
||||
@ -0,0 +1,143 @@
|
||||
import os
|
||||
import time
|
||||
import requests
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
|
||||
# --- Configuration ---
|
||||
USERNAME = "maorshabakov" # your Instagram username
|
||||
PASSWORD = "PeyxCU%MD*Zq9p" # your Instagram password
|
||||
TARGET_USER = "cata.leyah" # the username of the profile to scrape
|
||||
DOWNLOAD_DIR = "downloads" # directory to save media
|
||||
SCROLL_PAUSE_TIME = 2 # seconds to wait after each scroll
|
||||
|
||||
# --- Helper functions ---
|
||||
def login_instagram(driver, username, password):
|
||||
driver.get("https://www.instagram.com/accounts/login/")
|
||||
time.sleep(3) # wait for the login page to load
|
||||
|
||||
# Accept cookies if prompted (may need to adjust for your region)
|
||||
try:
|
||||
accept_button = driver.find_element(By.XPATH, "//button[text()='Allow all cookies']")
|
||||
accept_button.click()
|
||||
time.sleep(2)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# check if already logged in by checking if the current url has been redirected to the home page
|
||||
if driver.current_url == "https://www.instagram.com/":
|
||||
print("Already logged in.")
|
||||
return
|
||||
|
||||
# Enter username and password
|
||||
username_input = driver.find_element(By.NAME, "username")
|
||||
password_input = driver.find_element(By.NAME, "password")
|
||||
username_input.send_keys(username)
|
||||
password_input.send_keys(password)
|
||||
password_input.send_keys(Keys.RETURN)
|
||||
time.sleep(5) # wait for login to complete
|
||||
|
||||
def scroll_to_load_posts(driver, post_count=12):
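# Scroll the profile page until at least post_count unique post links have been collected or the page height stops growing.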
|
||||
post_links = dict()
|
||||
|
||||
last_height = driver.execute_script("return document.body.scrollHeight")
|
||||
while True:
|
||||
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
||||
time.sleep(SCROLL_PAUSE_TIME)
|
||||
new_height = driver.execute_script("return document.body.scrollHeight")
|
||||
|
||||
new_posts = get_post_links(driver)
|
||||
for link in new_posts:
|
||||
if link not in post_links:
|
||||
post_links[link] = True
|
||||
|
||||
if len(post_links) >= post_count:
|
||||
break
|
||||
|
||||
if new_height == last_height:
|
||||
break
|
||||
last_height = new_height
|
||||
|
||||
def get_post_links(driver):
|
||||
# Find all post links on the profile page.
|
||||
# Instagram posts are links with hrefs that contain '/p/'
|
||||
post_elements = driver.find_elements(By.XPATH, "//a[contains(@href, '/p/')]")
|
||||
links = [elem.get_attribute("href") for elem in post_elements]
|
||||
# Remove duplicates
|
||||
return list(set(links))
|
||||
|
||||
def download_media(url, download_folder, filename):
|
||||
response = requests.get(url, stream=True)
|
||||
if response.status_code == 200:
|
||||
filepath = os.path.join(download_folder, filename)
|
||||
with open(filepath, 'wb') as f:
|
||||
for chunk in response.iter_content(1024):
|
||||
f.write(chunk)
|
||||
print(f"Downloaded: {filename}")
|
||||
else:
|
||||
print(f"Failed to download: {url}")
|
||||
|
||||
def extract_media_url(driver):
|
||||
# Try to get video first
|
||||
try:
|
||||
video = driver.find_element(By.TAG_NAME, "video")
|
||||
media_url = video.get_attribute("src")
|
||||
if media_url:
|
||||
return media_url, "mp4"
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to image extraction
|
||||
try:
|
||||
# Sometimes the post image is inside a div with role="button"
|
||||
image = driver.find_element(By.XPATH, "//img[contains(@src, 'scontent')]")
|
||||
media_url = image.get_attribute("src")
|
||||
if media_url:
|
||||
return media_url, "jpg"
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None, None
|
||||
|
||||
# --- Main script ---
|
||||
def main():
|
||||
os.makedirs(DOWNLOAD_DIR, exist_ok=True)
|
||||
|
||||
chrome_options = Options()
|
||||
chrome_options.add_argument("--user-data-dir=.profiles/thenigga")
|
||||
driver = webdriver.Chrome(options=chrome_options)
|
||||
driver.maximize_window()
|
||||
|
||||
try:
|
||||
# Log in to Instagram
|
||||
login_instagram(driver, USERNAME, PASSWORD)
|
||||
|
||||
# Navigate to the target user's profile
|
||||
driver.get(f"https://www.instagram.com/{TARGET_USER}/")
|
||||
time.sleep(5) # let the page load
|
||||
|
||||
# Scroll down to load all posts
|
||||
scroll_to_load_posts(driver)
|
||||
|
||||
# Gather all post links from the profile page
|
||||
post_links = get_post_links(driver)
|
||||
print(f"Found {len(post_links)} posts.")
|
||||
|
||||
# Process each post
|
||||
for idx, post_link in enumerate(post_links):
|
||||
driver.get(post_link)
|
||||
time.sleep(3) # wait for post to load
|
||||
|
||||
# click download button where div class post-download-all-button
|
||||
download_button = driver.find_element(By.XPATH, "//div[@class='post-download-all-button']")
|
||||
driver.execute_script("arguments[0].click();", download_button)
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
finally:
|
||||
driver.quit()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||