cleanup
parent 4d23278033
commit 48d2330193
@@ -0,0 +1,35 @@
import os
import json
import gzip

data_dir = 'data'
data_compressed_dir = 'data_compressed'
os.makedirs(data_compressed_dir, exist_ok=True)

def compress_file(filepath, output_file):
    # Returns the output path on success, or None so the caller can report failure.
    try:
        with open(filepath, 'r') as f:
            data = json.load(f)
        compress_data(data, output_file)
        return output_file
    except (OSError, json.JSONDecodeError) as e:
        print(f'Error compressing {filepath}: {e}')
        return None

def compress_data(data, output_file):
    with gzip.open(output_file, 'wb') as f:
        f.write(json.dumps(data).encode('utf-8'))
    return output_file


data_files = os.listdir(data_dir)
for file in data_files:
    if not file.endswith('.json'):
        continue

    filepath = f'{data_dir}/{file}'
    output_file = f'{data_compressed_dir}/{file}.gz'
    output_file = compress_file(filepath, output_file)
    if output_file:
        print(f'Compressed {file} to {output_file}')
        os.remove(filepath)
    else:
        print(f'Failed to compress {file}')

print('Data compression completed')
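For reference, a minimal sketch of reading one of the compressed files back, using only the standard library (the path below is a hypothetical example):

import gzip
import json

# Open the gzip file in text mode and parse the JSON payload back into a Python object.
with gzip.open('data_compressed/example.json.gz', 'rt', encoding='utf-8') as f:  # hypothetical path
    data = json.load(f)

print(type(data))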
@@ -0,0 +1,87 @@
from funcs import get_files
from PIL import Image
import imagehash
import config
import os

def generate_image_phash(filepath, hash_size=8):
    try:
        # Open the image using PIL
        pil_image = Image.open(filepath)

        # Compute the pHash using the imagehash library
        phash = imagehash.phash(pil_image, hash_size=hash_size)
        return phash
    except Exception as e:
        print(f"Error processing image {filepath}: {e}")
        return None

def are_phashes_duplicates(phash1, phash2, threshold=5):
    try:
        # Compute the Hamming distance between the pHashes
        distance = phash1 - phash2
        return distance <= threshold
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False

def find_duplicate_phash(phash, existing_medias, threshold=5):
    # existing_medias rows are (id, phash, username) tuples from the query below.
    for media in existing_medias:
        existing_phash_str = media[1]

        # Convert the stored pHash hex string back to an ImageHash object
        existing_phash = imagehash.hex_to_hash(existing_phash_str)

        # Check whether the current pHash is a duplicate
        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

def get_media_by_hash(hash, existing_medias):
    for media in existing_medias:
        existing_hash = media[1]
        if hash == existing_hash:
            return media
    return None

def get_media_by_id(media_id, existing_medias):
    # Expects rows where index 1 holds the media_id.
    for media in existing_medias:
        existing_media_id = media[1]
        if media_id == existing_media_id:
            return media
    return None

def get_data_by_filename(filename, data):
    for item in data:
        if filename in item['filepath']:
            return item
    return None


# Database connection
db, cursor = config.gen_connection()

# Fetch existing media with pHashes (assuming media are images; adjust media_type if needed)
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL AND media_id IS NULL;", ['image'])
existing_medias = cursor.fetchall()

directory = 'check_if_exists/'  # Directory containing user images
files = [file for file in get_files(directory) if file.endswith(('.jpg', '.jpeg', '.png'))]
for filepath in files:
    image_filename = os.path.basename(filepath)

    # Generate the pHash for the image
    phash = generate_image_phash(filepath, hash_size=8)
    if phash is None:
        continue

    # Check whether the image duplicates any media in the database
    duplicate_media = find_duplicate_phash(phash, existing_medias)
    if duplicate_media:
        print(f'Duplicate found: https://altpins.com/pin/{duplicate_media[0]}')
        print(f'Duplicate image path: {filepath}')
        newpath = os.path.join('duplicates', duplicate_media[2], image_filename)
        os.makedirs(os.path.dirname(newpath), exist_ok=True)
        os.rename(filepath, newpath)
        print(f'Moved {image_filename} to duplicates/')
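A minimal sketch of the pHash round trip this script relies on: hashes are stored as hex strings, restored with imagehash.hex_to_hash, and compared by subtraction, which returns the Hamming distance in bits ('photo.jpg' is a hypothetical file):

from PIL import Image
import imagehash

# Compute a pHash and serialize it as the hex string kept in the database.
phash = imagehash.phash(Image.open('photo.jpg'), hash_size=8)  # hypothetical file
phash_str = str(phash)

# Restore the ImageHash object and compare; subtraction counts differing bits.
restored = imagehash.hex_to_hash(phash_str)
print(phash - restored)          # 0: identical hashes
print((phash - restored) <= 5)   # True: within the duplicate threshold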
@@ -0,0 +1,79 @@
from funcs import get_files
from PIL import Image
import imagehash
import config
import cv2
import os

def get_video_phash(filepath, hash_size=8):
    # Grab the first frame of the video and hash it.
    cap = cv2.VideoCapture(filepath)
    ret, frame = cap.read()
    cap.release()

    if not ret:
        print(f"Error reading frame from {filepath}")
        return None

    # Resize the frame to a standard size
    standard_size = (320, 240)
    resized_frame = cv2.resize(frame, standard_size, interpolation=cv2.INTER_AREA)

    # Convert the OpenCV image (BGR) to a PIL Image (RGB)
    image_rgb = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(image_rgb)

    # Compute the pHash
    phash = imagehash.phash(pil_image, hash_size=hash_size)

    return phash

def are_phashes_duplicates(phash1, phash2, threshold=5):
    # Compute the Hamming distance between the pHashes
    try:
        distance = phash1 - phash2
    except TypeError as e:
        print(f"Error comparing pHashes: {e}")
        return False

    return distance <= threshold

def get_media_by_phash(phash, existing_medias, threshold=5):
    # existing_medias rows are (id, phash, username) tuples from the query below.
    for media in existing_medias:
        existing_phash_str = media[1]

        existing_phash = imagehash.hex_to_hash(existing_phash_str)

        if are_phashes_duplicates(phash, existing_phash, threshold=threshold):
            return media
    return None

# Database connection
db, cursor = config.gen_connection()

# Directory containing user videos
directory = 'check_if_exists/'

# Fetch existing videos with pHashes
cursor.execute("SELECT id, phash, username FROM media WHERE media_type = %s AND phash IS NOT NULL;", ['video'])
existing_medias = cursor.fetchall()

# Build a list of all video files
files = [file for file in get_files(directory) if file.endswith(('.mp4', '.avi', '.mov'))]


for filepath in files:
    video_filename = os.path.basename(filepath)

    phash = get_video_phash(filepath, hash_size=8)
    if phash is None:
        continue

    duplicate_media = get_media_by_phash(phash, existing_medias, threshold=5)
    if duplicate_media:
        print(f'Duplicate URL found: https://altpins.com/pin/{duplicate_media[0]}')
        print(f'Duplicate video path: {filepath}')
        newpath = os.path.join('duplicates', duplicate_media[2], video_filename)
        os.makedirs(os.path.dirname(newpath), exist_ok=True)
        os.rename(filepath, newpath)
        print(f'Moved {filepath} to duplicates/')
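The script hashes only the first frame, which can miss duplicates whose opening frames differ. A hedged sketch of sampling the middle frame instead, assuming the OpenCV backend supports frame seeking on these files:

import cv2

def get_middle_frame(filepath):
    # Seek to the middle of the video before grabbing a frame.
    cap = cv2.VideoCapture(filepath)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if frame_count > 0:
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count // 2)
    ret, frame = cap.read()
    cap.release()
    return frame if ret else None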
@@ -0,0 +1,19 @@
import config, storysave_api


db, cursor = config.gen_connection()

usernames = []
with open('usernames.txt', 'r') as f:
    for line in f:
        usernames.append(line.strip())

for username in usernames:
    print(f"Username: {username}")

    user_id = storysave_api.get_user_id(username)
    if not user_id:
        # Skip usernames the API could not resolve.
        continue

    # Update the user_id in the database
    cursor.execute("UPDATE media SET user_id = %s WHERE username = %s AND user_id IS NULL;", [user_id, username])
    db.commit()
    print(f"[{cursor.rowcount}] Updated user_id for {username}")
@@ -0,0 +1,32 @@
import config
import os

temp_directory = "cache"
os.makedirs(temp_directory, exist_ok=True)

obj_storage = config.get_storage()
db, cursor = config.gen_connection()

cursor.execute("SELECT id, media_url FROM media WHERE file_size = 0;")
results = cursor.fetchall()

count = 0
print(f"Found {len(results)} files to process.")

for result in results:
    count += 1

    media_id, media_url = result

    # Map the CDN URL back to a local cache path.
    serverPath = media_url.replace("https://storysave.b-cdn.net/", '').replace('//', '/').replace('\\', '/')
    localFilePath = os.path.join(os.getcwd(), temp_directory, os.path.basename(serverPath))

    if not os.path.exists(localFilePath):
        continue

    file_size = os.path.getsize(localFilePath)

    cursor.execute("UPDATE media SET file_size = %s WHERE id = %s;", (file_size, media_id))
    db.commit()

    print(f"[{count}/{len(results)}] {media_url}: {file_size}, {cursor.rowcount}")
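Rows whose files are missing from the local cache directory are skipped. A hedged sketch of a fallback that asks the CDN for the size instead, assuming it answers HEAD requests with a Content-Length header:

import requests

def get_remote_file_size(media_url):
    # A HEAD request avoids downloading the file just to learn its size.
    response = requests.head(media_url, timeout=10, allow_redirects=True)
    size = response.headers.get('Content-Length')
    return int(size) if size is not None else None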
@@ -0,0 +1,154 @@
from datetime import datetime
import config
import funcs
import cv2
import os

directory = 'media/instagram/'
os.makedirs('temp', exist_ok=True)  # video thumbnails are written here before upload

def UploadMedia(media):
    media_id = media['media_id']
    username = media['username']
    post_date = media['timestamp']
    user_id = media['user_id']
    filepath = media['filepath']
    highlight_id = media['highlight_id']
    post_type = media['post_type']
    thumbnail_url = None
    phash = None

    if media_id and int(media_id) in existing_files:
        print('Duplicate file detected. Removing...')
        os.remove(filepath)
        return True

    filename = os.path.basename(filepath)
    file_extension = os.path.splitext(filename)[1].lower()

    media_type = funcs.get_media_type(filename)

    file_hash = funcs.calculate_file_hash(filepath)

    width, height = funcs.get_media_dimensions(filepath)

    duration = funcs.get_video_duration(filepath)

    if media_type == 'video':
        try:
            # Grab the first frame as a thumbnail, upload it, and hash it.
            thumbPath = f'temp/{media_id}.jpg'
            cap = cv2.VideoCapture(filepath)
            ret, frame = cap.read()
            cv2.imwrite(thumbPath, frame)
            cap.release()
            obj_storage.PutFile(thumbPath, f'thumbnails/{media_id}.jpg')  # slower
            thumbnail_url = f"https://storysave.b-cdn.net/thumbnails/{media_id}.jpg"
            phash = funcs.generate_phash(thumbPath)
            os.remove(thumbPath)
        except Exception as e:
            print(f'Error generating thumbnail: {e}. Skipping...')
            return False
    elif media_type == 'image':
        phash = funcs.generate_phash(filepath)

    if media_id:
        newFilename = f'{media_id}{file_extension}'
    else:
        newFilename = f'{file_hash}{file_extension}'

    server_path = f'media/{post_type}/{username}/{newFilename}'

    file_url = f"https://storysave.b-cdn.net/{server_path}"

    obj_storage.PutFile(filepath, server_path)  # slow: network upload

    if highlight_id:
        newCursor.execute("INSERT IGNORE INTO highlights (highlight_id, user_id, media_id) VALUES (%s, %s, %s)", (highlight_id, user_id, media_id))
        newDB.commit()
        print(f'[{newCursor.rowcount}] added highlight {highlight_id} to user {user_id}')

    query = "INSERT IGNORE INTO media (username, media_type, media_url, width, height, media_id, post_type, date, user_id, hash, filename, duration, thumbnail, phash, platform) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (username, media_type, file_url, width, height, media_id, post_type, post_date, user_id, file_hash, filename, duration, thumbnail_url, phash, 'instagram')

    newCursor.execute(query, values)  # slower
    newDB.commit()
    print(f'[{newCursor.rowcount}] records updated. File {filename} uploaded to {file_url}')

    os.remove(filepath)

    return True

def get_user_id(username):
    username = username.lower()
    if username in existing_users:
        return existing_users[username]

    return None

def get_media():
    medias = []
    post_types = {
        'posts': 'post',
        'stories': 'story',
        'profile': 'profile',
    }

    for post_type in os.listdir(directory):
        if post_type not in post_types:
            # Skip directories that do not map to a known post type.
            continue
        users_dir = os.path.join(directory, post_type)
        if not os.path.isdir(users_dir):
            continue
        users = os.listdir(users_dir)

        for username in users:
            user_path = os.path.join(directory, post_type, username)
            if not os.path.isdir(user_path):
                continue
            for filename in os.listdir(user_path):
                if filename.startswith('.'):
                    continue

                data = {}
                filepath = os.path.join(user_path, filename)

                # Filenames saved by the app embed the capture timestamp.
                if 'com.instagram.android__' in filename:
                    timestamp_str = filename.split('__')[-1].split('.')[0]
                    data['timestamp'] = datetime.strptime(timestamp_str, '%Y%m%d%H%M%S%f')
                else:
                    data['timestamp'] = datetime.now()

                data['post_type'] = post_types[post_type]
                data['username'] = username
                data['filepath'] = filepath
                data['media_id'] = None
                data['user_id'] = get_user_id(data['username'])
                data['highlight_id'] = None
                medias.append(data)

    return medias

def dump_instagram():
    medias = get_media()

    for media in medias:
        UploadMedia(media)
        existing_files.append(media['media_id'])

if __name__ == '__main__':
    print('Starting processing...')

    if not os.listdir(directory):
        print('No files to process. Exiting...')
        exit()

    newDB, newCursor = config.gen_connection()

    obj_storage = config.get_storage()

    newCursor.execute("SELECT media_id FROM media WHERE media_id IS NOT NULL")
    existing_files = [image[0] for image in newCursor.fetchall()]

    newCursor.execute("SELECT DISTINCT username, user_id FROM media WHERE user_id IS NOT NULL")
    existing_users = {user[0].lower(): user[1] for user in newCursor.fetchall()}  # map lowercased username -> user_id

    dump_instagram()

    print("Processing completed.")
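For clarity, a worked example of the timestamp parsing done in get_media(), using a hypothetical filename in the com.instagram.android__ format:

from datetime import datetime

# Hypothetical filename; real exports follow the same pattern.
filename = 'com.instagram.android__20240501123045123456.jpg'

timestamp_str = filename.split('__')[-1].split('.')[0]   # '20240501123045123456'
timestamp = datetime.strptime(timestamp_str, '%Y%m%d%H%M%S%f')
print(timestamp)  # 2024-05-01 12:30:45.123456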