optimization

main
oscar 3 months ago
parent ea7d8a4635
commit ecfa2f8745

@ -16,6 +16,19 @@ FF_QUALITY = "80"
os.makedirs(THUMB_DIR, exist_ok=True) os.makedirs(THUMB_DIR, exist_ok=True)
# Folders probed, in this order, by find_video_file() when resolving a bare file name.
VIDEO_DIRS = [
"U:/encoded",
"U:/count_sorted",
"E:/streamaster/downloaded"
]
def find_video_file(filename: str) -> str | None:
for directory in VIDEO_DIRS:
candidate = os.path.join(directory, filename)
if os.path.exists(candidate):
return candidate
return None
# ───────── DB HELPER ───────── # # ───────── DB HELPER ───────── #
def db_get_videos(): def db_get_videos():
conn, cur = get_local_db_connection() conn, cur = get_local_db_connection()
@ -24,8 +37,9 @@ def db_get_videos():
SELECT SELECT
video_id, username, site AS platform, video_id, username, site AS platform,
filepath, size, duration, gender, filepath, size, duration, gender,
created_at, updated_at created_at, updated_at, thumbnail
FROM videos FROM videos
WHERE status != 'missing'
""") """)
rows = cur.fetchall() rows = cur.fetchall()
cur.close(); conn.close() cur.close(); conn.close()
@ -51,18 +65,32 @@ def _gen_thumb_cmd(src: str, dest: str):
def generate_thumbnails_for_videos(videos):
    """Attach a thumbnail path to each video dict and render the missing ones.

    For every entry whose ``filepath`` is set and exists on disk, the
    entry's ``"thumbnail"`` key is set to the path returned by
    ``_hashed_thumb_path``. Thumbnails that are not on disk yet are
    rendered in parallel by running the command from ``_gen_thumb_cmd``.

    Args:
        videos: Iterable of dict-like rows with ``video_id`` and
            ``filepath`` keys. The dicts are modified in place.
    """
    tasks = []
    for v in videos:
        video_id = v.get("video_id")
        filepath = v.get("filepath")
        thumb_path = _hashed_thumb_path(video_id)

        if not filepath:
            print(f"⚠️ Skipping {video_id}: missing filepath")
            continue
        if not os.path.exists(filepath):
            print(f"⚠️ Skipping {video_id}: file not found → {filepath}")
            continue

        if not os.path.exists(thumb_path):
            tasks.append((filepath, thumb_path))
        v["thumbnail"] = thumb_path

    if not tasks:
        return

    def _render(task):
        # The exit status is not inspected and ffmpeg's output is discarded.
        return subprocess.run(
            _gen_thumb_cmd(*task),
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

    # os.cpu_count() may return None; fall back to a single core.
    workers = (os.cpu_count() or 1) * 2
    with ThreadPoolExecutor(max_workers=workers) as exe:
        # list() drains the iterator so exceptions raised in workers surface here.
        list(exe.map(_render, tasks))
# ───────── CACHE BUILDER ───────── # # ───────── CACHE BUILDER ───────── #
def build_cache(): def build_cache():
@ -93,7 +121,7 @@ def build_cache():
video_map[key] = vids video_map[key] = vids
generate_thumbnails_for_videos(videos) # generate_thumbnails_for_videos(videos)
return { return {
"timestamp": time.time(), "timestamp": time.time(),

@ -0,0 +1,8 @@
import requests
def get_data(username, timeout=10):
    """Fetch the bio-context JSON for *username* from the Chaturbate API.

    Args:
        username: Account name; it is percent-encoded before being placed
            in the URL path.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely.

    Returns:
        The decoded JSON body. The HTTP status is deliberately not
        checked, so error payloads are returned to the caller as-is.

    Raises:
        requests.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    from urllib.parse import quote

    url = f"https://chaturbate.com/api/biocontext/{quote(str(username), safe='')}"
    response = requests.get(url, timeout=timeout)
    return response.json()

@ -0,0 +1,211 @@
import os
from config import get_local_db_connection
from funcs import get_duration, get_file_size_in_mb, calculate_file_hash
from tqdm import tqdm
import os, hashlib, subprocess, json
from config import get_local_db_connection
from concurrent.futures import ThreadPoolExecutor
# Root folder under which _hashed_thumb_path() places thumbnail files.
THUMB_DIR = "static/thumbnails"
# Width given to ffmpeg's scale filter in _gen_thumb_cmd(); the height is passed as -1.
THUMB_WIDTH = 640
# Value handed to ffmpeg's -q:v option; kept as a string because it goes straight into argv.
FF_QUALITY = "80"
# Root folders that get_all_video_files() walks recursively looking for .mp4 files.
VIDEO_DIRS = [
"U:/streamaster",
"E:/streamaster/downloaded"
]
def get_all_video_files(video_dirs=None):
    """Index every ``.mp4`` file found under the video roots by file name.

    Args:
        video_dirs: Root folders to walk recursively. When omitted, the
            module-level ``VIDEO_DIRS`` list is used.

    Returns:
        A dict mapping bare file name to full path. If the same name is
        found more than once, the path encountered last wins. Roots that
        do not exist contribute nothing.
    """
    roots = VIDEO_DIRS if video_dirs is None else video_dirs
    index = {}
    for base in roots:
        for folder, _, filenames in os.walk(base):
            for filename in filenames:
                # The suffix match is case-sensitive.
                if filename.endswith(".mp4"):
                    index[filename] = os.path.join(folder, filename)
    return index
def find_video_path(filename: str):
    """Look up *filename* in the on-disk video index.

    The index is the module-level ``all_videos`` dict. The script entry
    point builds it up front; if it has not been built yet (for example
    when this module is imported instead of run), it is built on first
    use by calling ``get_all_video_files()``.

    Args:
        filename: Bare file name, e.g. ``"<video_id>.mp4"``.

    Returns:
        The full path recorded for that name, or ``None`` if unknown.
    """
    global all_videos
    try:
        index = all_videos
    except NameError:
        # Not bound yet: build the index once and cache it on the module.
        index = all_videos = get_all_video_files()
    return index.get(filename)
def mark_missing_videos(cursor, conn):
    """Set status to 'missing' for videos whose file is not in the disk index.

    Only rows not already marked missing are examined, and rows without a
    stored filepath are left alone. The lookup uses the base name of the
    stored filepath via ``find_video_path``.

    Args:
        cursor: DB cursor returning dict-like rows.
        conn: Connection used to commit each status change.
    """
    cursor.execute("SELECT video_id, filepath FROM videos WHERE status != 'missing'")
    rows = cursor.fetchall()

    for row in tqdm(rows, desc="Scanning for missing videos..."):
        video_id, filepath = row["video_id"], row["filepath"]
        if not filepath:
            continue

        filename = os.path.basename(filepath)
        if find_video_path(filename):
            continue

        print(f"🚫 Missing: {filename}")
        cursor.execute("UPDATE videos SET status = 'missing' WHERE video_id = %s", (video_id,))
        # Commit per row so finished work is kept if the scan is interrupted.
        conn.commit()
def fill_missing_filepaths(cursor, conn):
    """Point each video's stored filepath at where its file currently lives.

    For every row not marked missing, ``"<video_id>.mp4"`` is looked up
    with ``find_video_path``. When a file is found and its path (with
    forward slashes) differs from the stored one, the row is updated.

    Args:
        cursor: DB cursor returning dict-like rows.
        conn: Connection used to commit each change.
    """
    cursor.execute("SELECT video_id, filepath FROM videos WHERE status != 'missing'")
    rows = cursor.fetchall()

    for row in tqdm(rows, desc="Updating filepaths..."):
        video_id = row["video_id"]
        located = find_video_path(f"{video_id}.mp4")
        if not located:
            # File not found on disk: keep whatever path is stored rather
            # than overwriting it with NULL.
            continue

        normalized = located.replace("\\", "/")
        if normalized == row["filepath"]:
            continue

        cursor.execute("UPDATE videos SET filepath = %s WHERE video_id = %s", (normalized, video_id))
        # Commit per row so finished work is kept if the run is interrupted.
        conn.commit()
def fill_missing_hashes(cursor, conn):
    """Store a content hash for non-missing videos whose hash is NULL or empty.

    Rows whose filepath is unset or does not exist on disk are skipped.

    Args:
        cursor: DB cursor returning dict-like rows.
        conn: Connection used to commit each change.
    """
    cursor.execute("SELECT video_id, filepath FROM videos WHERE (hash IS NULL OR hash = '') AND status != 'missing'")
    rows = cursor.fetchall()

    for row in tqdm(rows, desc="Updating hashes..."):
        video_id, filepath = row["video_id"], row["filepath"]
        if not filepath or not os.path.exists(filepath):
            continue

        file_hash = calculate_file_hash(filepath)
        cursor.execute("UPDATE videos SET hash = %s WHERE video_id = %s", (file_hash, video_id))
        # Commit per row so finished work is kept if the run is interrupted.
        conn.commit()
def fill_missing_sizes(cursor, conn):
    """Store the file size for non-missing videos whose size is recorded as 0.

    The value comes from ``get_file_size_in_mb``. Rows whose filepath is
    unset or does not exist on disk are skipped.

    Args:
        cursor: DB cursor returning dict-like rows.
        conn: Connection used to commit each change.
    """
    cursor.execute("SELECT video_id, filepath FROM videos WHERE size = 0 AND status != 'missing'")
    rows = cursor.fetchall()

    for row in tqdm(rows, desc="Updating sizes..."):
        video_id, filepath = row["video_id"], row["filepath"]
        if not filepath or not os.path.exists(filepath):
            continue

        size_value = get_file_size_in_mb(filepath)
        cursor.execute("UPDATE videos SET size = %s WHERE video_id = %s", (size_value, video_id))
        # Commit per row so finished work is kept if the run is interrupted.
        conn.commit()
def fill_missing_durations(cursor, conn):
    """Store the duration for non-missing videos whose duration is recorded as 0.

    The value comes from ``get_duration``. Rows whose filepath is unset or
    does not exist on disk are skipped.

    Args:
        cursor: DB cursor returning dict-like rows.
        conn: Connection used to commit each change.
    """
    cursor.execute("SELECT video_id, filepath FROM videos WHERE duration = 0 AND status != 'missing'")
    rows = cursor.fetchall()

    for row in tqdm(rows, desc="Updating durations..."):
        video_id, filepath = row["video_id"], row["filepath"]
        if not filepath or not os.path.exists(filepath):
            continue

        length = get_duration(filepath)
        cursor.execute("UPDATE videos SET duration = %s WHERE video_id = %s", (length, video_id))
        # Commit per row so finished work is kept if the run is interrupted.
        conn.commit()
def fill_missing_gender(cursor, conn):
    """Fill in ``gender`` for every (username, site) pair that lacks one.

    For each pair, a gender already stored on another row of the same
    pair is reused. Otherwise ``chaturbate.get_data`` is queried and its
    ``sex`` field is mapped to 'Female', 'Couple' or 'Trans'. Pairs that
    cannot be resolved are skipped and left NULL.

    Args:
        cursor: DB cursor returning dict-like rows.
        conn: Connection used to commit each change.
    """
    import chaturbate

    cursor.execute("SELECT DISTINCT username, site FROM videos WHERE gender IS NULL AND status != 'missing'")
    accounts = cursor.fetchall()

    for account in tqdm(accounts, desc="Updating genders..."):
        username, site = account["username"], account["site"]

        # Prefer a gender already stored on another row of the same account.
        cursor.execute("SELECT gender FROM videos WHERE username = %s AND site = %s AND gender IS NOT NULL LIMIT 1", (username, site))
        known = cursor.fetchone()

        if known:
            gender_str = known['gender']
        else:
            data = chaturbate.get_data(username)
            if not data:
                continue
            if data.get('status') == 401:
                continue

            sex = data.get('sex')
            if not sex:
                # Other error payloads carry no 'sex' field; skip them
                # instead of failing the whole run with a KeyError.
                continue

            # Checked in this order; values matching none are reported and skipped.
            if 'woman' in sex:
                gender_str = 'Female'
            elif 'couple' in sex:
                gender_str = 'Couple'
            elif 'trans' in sex:
                gender_str = 'Trans'
            else:
                print(f"fuck?: {sex}")
                continue

        cursor.execute("UPDATE videos SET gender = %s WHERE username = %s AND site = %s", (gender_str, username, site))
        conn.commit()
        print(f"[{cursor.rowcount}] ✅ Updated gender for {username} on {site}")
def generate_thumbnails_for_videos(cursor, conn):
    """Render missing thumbnails and record their paths in the database.

    Considers non-missing videos whose ``thumbnail`` column is NULL and
    whose file exists on disk. Thumbnails not yet on disk are rendered in
    parallel with ffmpeg. A path is written to the database only if the
    thumbnail file exists afterwards, so a failed ffmpeg run leaves the
    column NULL and the video is retried on the next run.

    Args:
        cursor: DB cursor returning dict-like rows.
        conn: Connection used to commit each change.
    """
    cursor.execute("SELECT video_id, filepath FROM videos WHERE status != 'missing' AND thumbnail IS NULL")
    videos = cursor.fetchall()

    to_render = []   # (source video, thumbnail path) pairs that need ffmpeg
    to_record = []   # (video_id, thumbnail path) pairs to persist afterwards
    for v in videos:
        video_id = v.get("video_id")
        filepath = v.get("filepath")
        if not filepath or not os.path.exists(filepath):
            continue

        # Resolved only after the checks above: _hashed_thumb_path creates
        # the target folder as a side effect.
        thumb_path = _hashed_thumb_path(video_id)
        if not os.path.exists(thumb_path):
            to_render.append((filepath, thumb_path))
        to_record.append((video_id, thumb_path))

    if to_render:
        def _render(task):
            return subprocess.run(
                _gen_thumb_cmd(*task),
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )

        # os.cpu_count() may return None; fall back to a single core.
        workers = (os.cpu_count() or 1) * 2
        with ThreadPoolExecutor(max_workers=workers) as exe:
            # list() drains the iterator so exceptions raised in workers surface here.
            list(exe.map(_render, to_render))

    for video_id, thumb_path in to_record:
        if not os.path.exists(thumb_path):
            # ffmpeg produced nothing for this video; do not store a dead path.
            continue
        stored_path = thumb_path.replace("\\", "/")
        cursor.execute("UPDATE videos SET thumbnail = %s WHERE video_id = %s", (stored_path, video_id))
        conn.commit()
def _hashed_thumb_path(video_id: str):
h = hashlib.md5(video_id.encode()).hexdigest()
sub1, sub2 = h[:2], h[2:4]
path = os.path.join(THUMB_DIR, sub1, sub2)
os.makedirs(path, exist_ok=True)
return os.path.join(path, f"{video_id}.webp")
def _gen_thumb_cmd(src: str, dest: str):
    """Build the ffmpeg argument list that writes one thumbnail frame.

    Args:
        src: Path of the input video.
        dest: Path of the image file to write.

    Returns:
        A list of strings suitable for ``subprocess.run``.
    """
    video_filter = f"thumbnail,scale={THUMB_WIDTH}:-1"

    command = ["ffmpeg", "-y", "-loglevel", "error"]
    command += ["-ss", "0", "-i", src]
    command += ["-vframes", "1"]
    command += ["-vf", video_filter]
    command += ["-q:v", FF_QUALITY]
    command.append(dest)
    return command
if __name__ == '__main__':
    # Entry point: run the enabled cleanup passes against the local DB.
    conn, cursor = get_local_db_connection()
    try:
        # Index of files on disk, read by find_video_path().
        all_videos = get_all_video_files()

        print("🔍 Scanning for missing data...")
        # Commented-out calls are passes that are currently switched off.
        fill_missing_filepaths(cursor, conn)
        # mark_missing_videos(cursor, conn)
        # fill_missing_hashes(cursor, conn)
        fill_missing_sizes(cursor, conn)
        fill_missing_durations(cursor, conn)
        # fill_missing_gender(cursor, conn)
        generate_thumbnails_for_videos(cursor, conn)
    finally:
        # Release the DB handles even when a pass raises.
        cursor.close()
        conn.close()

    print("✅ All cleanup tasks completed.")

@ -398,3 +398,9 @@ def get_videos_matched(video_dirs, data_dirs):
parsed_videos, unmatched = match_data_to_video_fast(videos, data) parsed_videos, unmatched = match_data_to_video_fast(videos, data)
return parsed_videos, unmatched return parsed_videos, unmatched
def calculate_file_hash(file_path, chunk_size=1024 * 1024):
    """Return the SHA-256 hex digest of the file at *file_path*.

    The file is read in fixed-size chunks so that hashing a large file
    does not require loading all of it into memory.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration.

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import hashlib

    digest = hashlib.sha256()
    with open(file_path, 'rb') as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()
Loading…
Cancel
Save