optimization

3 months ago · ecfa2f8745
parent ea7d8a4635
commit ecfa2f8745
4 changed files with 258 additions and 5 deletions
--- a/app.py
+++ b/app.py
@@ -16,6 +16,19 @@ FF_QUALITY         = "80"
 # Create the thumbnail directory if it does not already exist.
 os.makedirs(THUMB_DIR, exist_ok=True)
 # Directories probed, in this order, by find_video_file() when resolving a filename.
 VIDEO_DIRS = [
    "U:/encoded",
    "U:/count_sorted",
    "E:/streamaster/downloaded"
 ]
 def find_video_file(filename: str) -> str | None:
    """Return the first existing ``<dir>/<filename>`` among VIDEO_DIRS.

    The directories are tried in list order; ``None`` is returned when the
    name exists in none of them.
    """
    candidates = (os.path.join(root, filename) for root in VIDEO_DIRS)
    return next((path for path in candidates if os.path.exists(path)), None)
 # ───────── DB HELPER ───────── #
 def db_get_videos():
    conn, cur = get_local_db_connection()
@@ -24,8 +37,9 @@ def db_get_videos():
        SELECT
            video_id, username, site AS platform,
            filepath, size, duration, gender,
-            created_at, updated_at
+            created_at, updated_at, thumbnail
        FROM videos
        WHERE status != 'missing'
    """)
    rows = cur.fetchall()
    cur.close(); conn.close()
@@ -51,18 +65,32 @@ def _gen_thumb_cmd(src: str, dest: str):
 def generate_thumbnails_for_videos(videos):
    tasks = []
    for v in videos:
-        video_id   = v["video_id"]
+        video_id   = v.get("video_id")
        filepath = v.get("filepath")
        thumb_path = _hashed_thumb_path(video_id)
        if not filepath:
            print(f"⚠️ Skipping {video_id}: missing filepath")
            continue
        if not os.path.exists(filepath):
            print(f"⚠️ Skipping {video_id}: file not found → {filepath}")
            continue
        if not os.path.exists(thumb_path):
-            tasks.append((v["filepath"], thumb_path))
+            tasks.append((filepath, thumb_path))
        v["thumbnail"] = thumb_path
    if tasks:
        with ThreadPoolExecutor(max_workers=os.cpu_count() * 2) as exe:
-            list(exe.map(lambda t: subprocess.run(_gen_thumb_cmd(*t), stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL), tasks))
+            list(exe.map(lambda t: subprocess.run(
                _gen_thumb_cmd(*t),
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL
            ), tasks))
 # ───────── CACHE BUILDER ───────── #
 def build_cache():
@@ -93,7 +121,7 @@ def build_cache():
        video_map[key] = vids
-    generate_thumbnails_for_videos(videos)
+    # generate_thumbnails_for_videos(videos)
    return {
        "timestamp": time.time(),
--- a/chaturbate.py
+++ b/chaturbate.py
@@ -0,0 +1,8 @@
 import requests
 def get_data(username, timeout=10):
    """Fetch the Chaturbate ``biocontext`` JSON payload for *username*.

    Args:
        username: Account name interpolated into the API URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection,
            which would hang any loop calling this function.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: on connection failure or timeout.
        ValueError: if the response body is not valid JSON.
    """
    url = f"https://chaturbate.com/api/biocontext/{username}"
    response = requests.get(url, timeout=timeout)
    return response.json()
--- a/cleanup.py
+++ b/cleanup.py
@@ -0,0 +1,211 @@
 import os
 from config import get_local_db_connection
 from funcs import get_duration, get_file_size_in_mb, calculate_file_hash
 from tqdm import tqdm
 import os, hashlib, subprocess, json
 from config import get_local_db_connection
 from concurrent.futures import ThreadPoolExecutor
 # Root directory under which _hashed_thumb_path() builds thumbnail paths.
 THUMB_DIR          = "static/thumbnails"
 # Width used in ffmpeg's scale filter by _gen_thumb_cmd(); the height is given as -1.
 THUMB_WIDTH        = 640
 # Value passed to ffmpeg's -q:v option; a string because it goes straight into the argv list.
 FF_QUALITY         = "80"
 # Directory trees walked by get_all_video_files() when indexing .mp4 files.
 VIDEO_DIRS = [
    "U:/streamaster",
    "E:/streamaster/downloaded"
 ]
 def get_all_video_files():
    """Index every ``.mp4`` file found beneath the VIDEO_DIRS trees.

    Returns:
        A dict mapping bare filename to its full path. When the same filename
        occurs more than once, the path seen last during the walk wins.
    """
    index = {}
    for base_dir in VIDEO_DIRS:
        for dirpath, _subdirs, names in os.walk(base_dir):
            index.update(
                (name, os.path.join(dirpath, name))
                for name in names
                if name.endswith(".mp4")
            )
    return index
 def find_video_path(filename: str):
    """Look up *filename* in the module-level ``all_videos`` index.

    Returns the stored path, or ``None`` when the filename is not indexed.
    ``all_videos`` is only assigned by the script entry point of this file.
    """
    return all_videos.get(filename)
 def mark_missing_videos(cursor, conn):
    """Flag videos whose file is no longer present in the on-disk index.

    Every row not already marked ``missing`` is checked by the basename of its
    stored filepath; rows whose basename is absent from the index get
    ``status = 'missing'``. Rows without a filepath are left untouched.
    Each update is committed immediately.
    """
    cursor.execute("SELECT video_id, filepath FROM videos WHERE status != 'missing'")
    rows = cursor.fetchall()

    with tqdm(rows, desc="Scanning for missing videos...") as progress:
        for row in rows:
            progress.update(1)

            # Rows expose their columns in SELECT order: video_id, filepath.
            row_id, stored_path = row.values()
            if not stored_path:
                continue

            basename = os.path.basename(stored_path)
            if find_video_path(basename):
                continue

            print(f"🚫 Missing: {basename}")
            cursor.execute("UPDATE videos SET status = 'missing' WHERE video_id = %s", (row_id,))
            conn.commit()
 def fill_missing_filepaths(cursor, conn):
    """Sync each video's stored filepath with where the file actually lives.

    For every row not marked ``missing``, ``<video_id>.mp4`` is looked up in
    the on-disk index. When found, the path (with backslashes converted to
    forward slashes) replaces the stored filepath unless it is already equal.
    Each update is committed immediately.
    """
    cursor.execute("SELECT video_id, filepath FROM videos WHERE status != 'missing'")
    rows = cursor.fetchall()

    with tqdm(rows, desc="Updating filepaths...") as progress:
        for row in rows:
            progress.update(1)

            # Rows expose their columns in SELECT order: video_id, filepath.
            row_id, stored_path = row.values()
            located = find_video_path(f"{row_id}.mp4")
            if not located:
                continue

            normalized = located.replace("\\", "/")
            if normalized != stored_path:
                cursor.execute("UPDATE videos SET filepath = %s WHERE video_id = %s", (normalized, row_id))
                conn.commit()
 def fill_missing_hashes(cursor, conn):
    """Compute and store the file hash for rows that have none.

    Only rows with a NULL/empty hash that are not marked ``missing`` are
    considered, and only when their filepath exists on disk. Each update is
    committed immediately.
    """
    cursor.execute("SELECT video_id, filepath FROM videos WHERE (hash IS NULL OR hash = '') AND status != 'missing'")
    rows = cursor.fetchall()

    with tqdm(rows, desc="Updating hashes...") as progress:
        for row in rows:
            progress.update(1)

            # Rows expose their columns in SELECT order: video_id, filepath.
            row_id, stored_path = row.values()
            if not stored_path or not os.path.exists(stored_path):
                continue

            digest = calculate_file_hash(stored_path)
            cursor.execute("UPDATE videos SET hash = %s WHERE video_id = %s", (digest, row_id))
            conn.commit()
 def fill_missing_sizes(cursor, conn):
    """Store the file size for rows whose ``size`` is still 0.

    Only rows not marked ``missing`` whose filepath exists on disk are
    updated; the value comes from ``get_file_size_in_mb``. Each update is
    committed immediately.
    """
    cursor.execute("SELECT video_id, filepath FROM videos WHERE size = 0 AND status != 'missing'")
    rows = cursor.fetchall()

    with tqdm(rows, desc="Updating sizes...") as progress:
        for row in rows:
            progress.update(1)

            # Rows expose their columns in SELECT order: video_id, filepath.
            row_id, stored_path = row.values()
            if not stored_path or not os.path.exists(stored_path):
                continue

            size_mb = get_file_size_in_mb(stored_path)
            cursor.execute("UPDATE videos SET size = %s WHERE video_id = %s", (size_mb, row_id))
            conn.commit()
 def fill_missing_durations(cursor, conn):
    """Store the duration for rows whose ``duration`` is still 0.

    Only rows not marked ``missing`` whose filepath exists on disk are
    updated; the value comes from ``get_duration``. Each update is committed
    immediately.
    """
    cursor.execute("SELECT video_id, filepath FROM videos WHERE duration = 0 AND status != 'missing'")
    rows = cursor.fetchall()

    with tqdm(rows, desc="Updating durations...") as progress:
        for row in rows:
            progress.update(1)

            # Rows expose their columns in SELECT order: video_id, filepath.
            row_id, stored_path = row.values()
            if not stored_path or not os.path.exists(stored_path):
                continue

            length = get_duration(stored_path)
            cursor.execute("UPDATE videos SET duration = %s WHERE video_id = %s", (length, row_id))
            conn.commit()
 def fill_missing_gender(cursor, conn):
    """Fill in ``gender`` for every (username, site) pair that lacks one.

    For each distinct pair with a NULL gender (and not marked ``missing``):

    1. Reuse a gender already stored on another row of the same pair.
    2. Otherwise query ``chaturbate.get_data`` and map its ``sex`` field to
       ``Female`` / ``Couple`` / ``Trans``.

    Pairs are skipped when the API returns nothing, reports status 401,
    carries no ``sex`` field, or reports a value that maps to none of the
    three labels. Each update is committed immediately.
    """
    import chaturbate

    cursor.execute("SELECT DISTINCT username, site FROM videos WHERE gender IS NULL AND status != 'missing'")
    accounts = cursor.fetchall()

    with tqdm(accounts, desc="Updating genders...") as pbar:
        for account in accounts:
            pbar.update(1)
            # Rows expose their columns in SELECT order: username, site.
            username, site = account.values()

            # Prefer a gender already recorded for this account over a network call.
            cursor.execute("SELECT gender FROM videos WHERE username = %s AND site = %s AND gender IS NOT NULL LIMIT 1", (username, site))
            known = cursor.fetchone()

            if known:
                gender_str = known['gender']
            else:
                profile = chaturbate.get_data(username)
                if not profile:
                    continue
                if 'status' in profile and profile['status'] == 401:
                    continue

                # Error payloads other than 401 may carry no 'sex' key at all;
                # skip them instead of crashing the whole run with a KeyError.
                sex = profile.get('sex')
                if not sex:
                    print(f"⚠️ No 'sex' field in API response for {username}; skipping")
                    continue

                if 'woman' in sex:
                    gender_str = 'Female'
                elif 'couple' in sex:
                    gender_str = 'Couple'
                elif 'trans' in sex:
                    gender_str = 'Trans'
                else:
                    print(f"fuck?: {sex}")
                    continue

            cursor.execute("UPDATE videos SET gender = %s WHERE username = %s AND site = %s", (gender_str, username, site))
            conn.commit()
            print(f"[{cursor.rowcount}] ✅ Updated gender for {username} on {site}")
 def generate_thumbnails_for_videos(cursor, conn):
    """Generate missing thumbnails and record their paths in the database.

    Selects rows that are not ``missing`` and have no thumbnail stored. Rows
    without a filepath, or whose file does not exist, are skipped. For the
    rest, a thumbnail is rendered with ffmpeg unless one is already on disk,
    and the path (forward slashes) is written to ``videos.thumbnail``.

    Differences from a naive implementation worth knowing:
    - Paths are recorded even when nothing had to be rendered, so thumbnails
      already present on disk are picked up.
    - A path is only recorded when the thumbnail file actually exists
      afterwards, so a failed ffmpeg run does not store a dangling path.
    - The shard directory is only created for videos that are processed.
    """
    cursor.execute("SELECT video_id, filepath FROM videos WHERE status != 'missing' AND thumbnail IS NULL")
    videos = cursor.fetchall()

    render_tasks = []   # (source video, thumbnail destination) pairs for ffmpeg
    to_record = []      # (video_id, thumbnail path) pairs to write to the DB
    for v in videos:
        video_id = v.get("video_id")
        filepath = v.get("filepath")
        if not filepath or not os.path.exists(filepath):
            continue
        thumb_path = _hashed_thumb_path(video_id)
        if not os.path.exists(thumb_path):
            render_tasks.append((filepath, thumb_path))
        to_record.append((video_id, thumb_path))

    def _render(task):
        # Run one ffmpeg invocation, discarding its console output.
        return subprocess.run(
            _gen_thumb_cmd(*task),
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

    if render_tasks:
        # os.cpu_count() may return None when the count is undeterminable.
        workers = (os.cpu_count() or 1) * 2
        with ThreadPoolExecutor(max_workers=workers) as exe:
            list(exe.map(_render, render_tasks))

    for video_id, thumb_path in to_record:
        if not os.path.exists(thumb_path):
            print(f"⚠️ Thumbnail generation failed for {video_id}")
            continue
        stored_path = thumb_path.replace("\\", "/")
        cursor.execute("UPDATE videos SET thumbnail = %s WHERE video_id = %s", (stored_path, video_id))
        conn.commit()
 def _hashed_thumb_path(video_id: str):
    """Return the sharded ``.webp`` thumbnail path for *video_id*.

    The first four hex characters of the MD5 of the id select a two-level
    subdirectory under THUMB_DIR; that directory is created if needed.
    """
    digest = hashlib.md5(video_id.encode()).hexdigest()
    shard_dir = os.path.join(THUMB_DIR, digest[:2], digest[2:4])
    os.makedirs(shard_dir, exist_ok=True)
    return os.path.join(shard_dir, f"{video_id}.webp")
 def _gen_thumb_cmd(src: str, dest: str):
    """Build the ffmpeg argv that writes a single-frame thumbnail.

    Args:
        src: Path of the input video.
        dest: Path of the thumbnail file to write.

    Returns:
        The command as a list of strings, suitable for ``subprocess.run``.
    """
    video_filter = f"thumbnail,scale={THUMB_WIDTH}:-1"
    cmd = ["ffmpeg", "-y", "-loglevel", "error"]
    cmd += ["-ss", "0", "-i", src]
    cmd += ["-vframes", "1"]
    cmd += ["-vf", video_filter]
    cmd += ["-q:v", FF_QUALITY]
    cmd.append(dest)
    return cmd
 if __name__ == '__main__':
    # Script entry point: run the enabled cleanup passes against the local DB.
    conn, cursor = get_local_db_connection()
    try:
        # Module-level index read by find_video_path(); must exist before any pass runs.
        all_videos = get_all_video_files()
        print("🔍 Scanning for missing data...")
        fill_missing_filepaths(cursor, conn)
        # mark_missing_videos(cursor, conn)
        # fill_missing_hashes(cursor, conn)
        fill_missing_sizes(cursor, conn)
        fill_missing_durations(cursor, conn)
        # fill_missing_gender(cursor, conn)
        generate_thumbnails_for_videos(cursor, conn)
    finally:
        # Release the DB handles even when one of the passes raises.
        cursor.close()
        conn.close()
    print("✅ All cleanup tasks completed.")
--- a/funcs.py
+++ b/funcs.py
@@ -398,3 +398,9 @@ def get_videos_matched(video_dirs, data_dirs):
    parsed_videos, unmatched = match_data_to_video_fast(videos, data)
    return parsed_videos, unmatched
 def calculate_file_hash(file_path):
    """Return the SHA-256 hex digest of the file at *file_path*.

    The file is read in fixed-size chunks so that hashing a large video does
    not require loading the whole file into memory.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The lowercase hexadecimal SHA-256 digest as a string.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    import hashlib

    chunk_size = 1024 * 1024  # 1 MiB per read
    digest = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at the empty bytes returned on EOF.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()