small update

2 months ago · c79ec8e179
parent 9ac5555876
commit c79ec8e179
3 changed files with 288 additions and 47 deletions
--- a/chaturbate.py
+++ b/chaturbate.py
@ -1,8 +0,0 @@
-import requests
-
-
-def get_data(username):
-    url = f"https://chaturbate.com/api/biocontext/{username}"
-    data = requests.get(url)
-    data = data.json()
-    return data
--- a/cleanup.py
+++ b/cleanup.py
@ -3,7 +3,7 @@ from config import get_local_db_connection
 from funcs import get_duration, get_file_size_in_mb, calculate_file_hash
 from tqdm import tqdm

-import os, hashlib, subprocess, json
+import os, hashlib, subprocess
 from config import get_local_db_connection
 from concurrent.futures import ThreadPoolExecutor

@ -12,8 +12,8 @@ THUMB_WIDTH        = 640
 FF_QUALITY         = "80"

 VIDEO_DIRS = [
-    "U:/streamaster",
-    "E:/streamaster/downloaded"
+    "U:/streamaster/",
+    "E:/streamaster/streamaster/downloaded"
 ]

 def get_all_video_files():
@ -45,20 +45,29 @@ def mark_missing_videos(cursor, conn):
                conn.commit()

 def fill_missing_filepaths(cursor, conn):
-    cursor.execute("SELECT video_id, filepath FROM videos WHERE status != 'missing'")
+    cursor.execute("SELECT id, filepath, status FROM videos")
    videos = cursor.fetchall()

    with tqdm(videos, desc="Updating filepaths...") as pbar:
        for vid in videos:
            pbar.update(1)
-            video_id, filepath = vid.values()
-            filename = f"{video_id}.mp4"
+
+            filepath = vid['filepath']
+            if not filepath:
+                continue
+            filename = os.path.basename(filepath)
+            status = vid['status']
+            
            path = find_video_path(filename)
-            if path:
+            
+            if not path:
+                continue
+            
            path = path.replace("\\", "/")
-                if path == filepath:
+            if path == filepath and status != 'missing':
                continue
-                cursor.execute("UPDATE videos SET filepath = %s WHERE video_id = %s", (path, video_id))
+            
+            cursor.execute("UPDATE videos SET filepath = %s, status = 'active' WHERE id = %s", (path, vid['id']))
            conn.commit()

 def fill_missing_hashes(cursor, conn):
@ -88,7 +97,7 @@ def fill_missing_sizes(cursor, conn):
                conn.commit()

 def fill_missing_durations(cursor, conn):
-    cursor.execute("SELECT video_id, filepath FROM videos WHERE duration = 0 AND status != 'missing'")
+    cursor.execute("SELECT video_id, filepath FROM videos WHERE duration = 0 AND status != 'missing' ORDER BY size ASC")
    videos = cursor.fetchall()

    with tqdm(videos, desc="Updating durations...") as pbar:
@ -97,11 +106,21 @@ def fill_missing_durations(cursor, conn):
            video_id, filepath = vid.values()
            if filepath and os.path.exists(filepath):
                duration = get_duration(filepath)
+                if duration <= 0:
+                    print(f"🚫 Failed to get duration for {filepath}")
+                    os.remove(filepath)
+                    continue
                cursor.execute("UPDATE videos SET duration = %s WHERE video_id = %s", (duration, video_id))
                conn.commit()

 def fill_missing_gender(cursor, conn):
-    import chaturbate
+    def get_data(username):
+        import requests
+        url = f"https://chaturbate.com/api/biocontext/{username}"
+        data = requests.get(url)
+        data = data.json()
+        return data
+    
    cursor.execute("SELECT DISTINCT username, site FROM videos WHERE gender IS NULL AND status != 'missing'")
    videos = cursor.fetchall()
    
@ -113,13 +132,13 @@ def fill_missing_gender(cursor, conn):
            cursor.execute("SELECT gender FROM videos WHERE username = %s AND site = %s AND gender IS NOT NULL LIMIT 1", (username, site))
            gender = cursor.fetchone()
            if not gender:
-                gender = chaturbate.get_data(username)
-                if not gender:
+                data = get_data(username)
+                if not data:
                    continue
-                if 'status' in gender:
-                    if gender['status'] == 401:
+                if 'status' in data:
+                    if data['status'] == 401:
                        continue
-                gender = gender['sex']
+                gender = data['sex']
                if 'woman' in gender:
                    gender_str = 'Female'
                elif 'couple' in gender:
@ -141,19 +160,19 @@ def generate_thumbnails_for_videos(cursor, conn):
    videos = cursor.fetchall()
    
    tasks = []
+    with tqdm(videos, desc="Generating thumbnails...") as pbar:
        for v in videos:
+            pbar.update(1)
            video_id   = v.get("video_id")
            filepath = v.get("filepath")
-        thumb_path = _hashed_thumb_path(video_id)

            if not filepath:
-            # print(f"⚠️ Skipping {video_id}: missing filepath")
                continue

            if not os.path.exists(filepath):
-            # print(f"⚠️ Skipping {video_id}: file not found → {filepath}")
                continue

+            thumb_path = _hashed_thumb_path(video_id)
            if not os.path.exists(thumb_path):
                tasks.append((filepath, thumb_path))

@ -193,18 +212,20 @@ def _gen_thumb_cmd(src: str, dest: str):

 if __name__ == '__main__':
    conn, cursor = get_local_db_connection()
-    all_videos = get_all_video_files()

    print("🔍 Scanning for missing data...")

+    if True:
+        all_videos = get_all_video_files()
        fill_missing_filepaths(cursor, conn)
        mark_missing_videos(cursor, conn)
-    # fill_missing_durations(cursor, conn)
-    # fill_missing_sizes(cursor, conn)
-    # generate_thumbnails_for_videos(cursor, conn)
        
-    # fill_missing_hashes(cursor, conn)
+    generate_thumbnails_for_videos(cursor, conn)
+    fill_missing_sizes(cursor, conn)
+    fill_missing_durations(cursor, conn)
+
    # fill_missing_gender(cursor, conn)
+    # fill_missing_hashes(cursor, conn)
    

    cursor.close()
--- a/superencoderav.py
+++ b/superencoderav.py
@ -0,0 +1,228 @@
+import os, shutil
+import ffmpeg
+from tqdm import tqdm
+
+def is_av1(filepath):
+    """
+    Report whether any video stream of `filepath` has the codec name 'av1'.
+
+    A failed probe (ffmpeg.Error) is printed and counted as "not AV1".
+    """
+    try:
+        streams = ffmpeg.probe(filepath)['streams']
+        # Only video streams that actually carry a codec name can match.
+        return any(
+            entry['codec_type'] == 'video' and entry.get('codec_name') == 'av1'
+            for entry in streams
+        )
+    except ffmpeg.Error as e:
+        print(f"Error probing {filepath}: {e}")
+    return False
+
+def get_video_info(filepath):
+    """
+    Returns (bitrate_in_kbps, (width, height)) for the specified video file.
+
+    The bitrate is the container-level `bit_rate` from the probe, converted
+    from bits/sec to kbps; width and height come from the first video stream.
+
+    Returns (None, (None, None)) when probing fails, when the file has no
+    video stream, or when the bitrate or dimensions are missing or not
+    numeric, so the caller can skip the file instead of crashing.
+    """
+    try:
+        probe = ffmpeg.probe(filepath)
+    except ffmpeg.Error as e:
+        print(f"Error getting video info for {filepath}: {e}")
+        return None, (None, None)
+
+    video_stream = next(
+        (stream for stream in probe.get('streams', []) if stream.get('codec_type') == 'video'),
+        None
+    )
+    if not video_stream:
+        return None, (None, None)
+
+    try:
+        # Convert from bits/sec to kbps
+        bitrate_kbps = int(probe['format']['bit_rate']) // 1000
+        width = video_stream['width']
+        height = video_stream['height']
+    except (KeyError, TypeError, ValueError) as e:
+        # The probe succeeded but did not report a usable bitrate/resolution.
+        print(f"Incomplete video info for {filepath}: {e!r}")
+        return None, (None, None)
+    return bitrate_kbps, (width, height)
+
+def get_files(folder):
+    """
+    Recursively gather all video files (.mp4, .mkv, .avi, .mov) in `folder`.
+
+    Directories named exactly "encoded" are not descended into, so files that
+    were already moved there are not picked up again. Only the directory name
+    is checked, not the whole path, so an input folder whose own path happens
+    to contain the word "encoded" is still scanned.
+
+    Returns the file paths sorted by file size (smallest to largest).
+    """
+    video_extensions = ('.mp4', '.mkv', '.avi', '.mov')
+    all_files = []
+    for root, dirnames, filenames in os.walk(folder):
+        # Prune in place so os.walk never enters an "encoded" directory.
+        dirnames[:] = [name for name in dirnames if name != "encoded"]
+        all_files.extend(
+            os.path.join(root, filename)
+            for filename in filenames
+            if filename.lower().endswith(video_extensions)
+        )
+    return sorted(all_files, key=os.path.getsize)
+
+def parse_text_for_print(text):
+    """
+    Shorten `text` for console output: anything longer than 100 characters is
+    cut to its first 100 characters followed by '...'; shorter text is
+    returned unchanged.
+    """
+    limit = 100
+    if len(text) <= limit:
+        return text
+    return text[:limit] + '...'
+
+def get_target_bitrate(width, height):
+    """
+    Choose a target bitrate (in kbps) based on the video resolution.
+
+    The resolution is compared against the tiers below from smallest to
+    largest and the first tier it fits into wins. The comparison uses the
+    longer and shorter side rather than width/height, so a portrait video
+    (e.g. 1080x1920) lands in the same tier as its landscape counterpart.
+
+    Returns 2500 when the video is larger than every tier.
+    """
+    # (max long side, max short side, bitrate in kbps), smallest tier first
+    tiers = (
+        (854, 480, 1000),
+        (1280, 720, 1500),
+        (1920, 1080, 3000),
+        (2560, 1440, 5000),
+        (3840, 2160, 12000),
+    )
+
+    long_side = max(width, height)
+    short_side = min(width, height)
+
+    for max_long, max_short, bitrate in tiers:
+        if long_side <= max_long and short_side <= max_short:
+            return bitrate
+
+    return 2500
+
+def get_fps(filepath):
+    """
+    Get the frames per second (FPS) of the input video.
+
+    Reads `r_frame_rate` (a "numerator/denominator" string) from the first
+    video stream and returns it as a float.
+
+    Returns None when probing fails, when there is no video stream or no
+    `r_frame_rate`, or when the value cannot be turned into a number
+    (malformed string or zero denominator).
+    """
+    try:
+        probe = ffmpeg.probe(filepath)
+    except ffmpeg.Error as e:
+        print(f"Error getting FPS for {filepath}: {e}")
+        return None
+
+    video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
+    if not video_stream or 'r_frame_rate' not in video_stream:
+        return None
+
+    fps_str = video_stream['r_frame_rate']
+    try:
+        num, den = map(int, fps_str.split('/'))
+        return num / den
+    except (ValueError, ZeroDivisionError):
+        # Unusable rate: report "unknown" so the caller can apply its fallback.
+        print(f"Unusable frame rate {fps_str!r} for {filepath}")
+        return None
+
+def encode_video(filepath, output_path, target_bitrate):
+    """
+    Encode video using ffmpeg with a target bitrate (in kbps).
+    Using NVIDIA AV1 hardware encoder (av1_nvenc) for RTX 40-series.
+
+    The output is written to `output_path` as MP4 with a max bitrate and
+    buffer size of 1.5x `target_bitrate` and a keyframe roughly every second.
+    The directory of `output_path` is created if it does not exist yet.
+    ffmpeg errors are printed, not raised; callers should check whether
+    `output_path` exists afterwards.
+    """
+    # Make sure the output directory exists before ffmpeg tries to write
+    # there; a missing directory would make the encode fail.
+    output_dir = os.path.dirname(output_path)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+    try:
+        fps = get_fps(filepath)
+        if not fps or fps <= 0:
+            print(f"Could not determine FPS for {filepath}, using default keyframe interval.")
+            fps = 30  # Default fallback if FPS can't be determined
+
+        # Keyframe interval of about 1 second; never below 1 frame, which
+        # int() would otherwise produce for frame rates under 1 fps.
+        keyframe_interval = max(1, int(fps))
+
+        # Calculate 1.5x for max bitrate
+        max_bitrate = int(1.5 * target_bitrate)
+
+        print(f"  Encoding {filepath} to AV1 at {target_bitrate} kbps...")
+        (
+            ffmpeg
+            .input(filepath)
+            .output(
+                output_path,
+                vcodec='av1_nvenc',
+                format='mp4',
+                b=f"{target_bitrate}k",
+                maxrate=f"{max_bitrate}k",
+                bufsize=f"{max_bitrate}k",
+                preset='p5',
+                g=keyframe_interval
+            )
+            .run(
+                overwrite_output=True,
+            )
+        )
+        filepath_print = parse_text_for_print(filepath)
+        print(f"  Finished encoding {filepath_print} to AV1 at {target_bitrate} kbps "
+              f"(maxrate={max_bitrate} kbps).")
+    except ffmpeg.Error as e:
+        filepath_print = parse_text_for_print(filepath)
+        print(f"  Error encoding {filepath_print} to AV1: {e}")
+
+def check_and_replace_if_smaller(original_path, temp_output_path):
+    """
+    Compare file sizes and replace the original if the new one is smaller.
+    Otherwise, delete the temporary file.
+
+    Rules, in order:
+    - temp file missing: nothing is changed.
+    - temp file empty (0 bytes): it is deleted, the original is kept.
+    - original smaller than 25 MB: it is always replaced by the temp file.
+    - temp file not smaller, or smaller by less than 1 MB: temp is deleted.
+    - otherwise the original is replaced and the saving is printed.
+
+    Returns True if the original was replaced, False otherwise.
+    """
+    if not os.path.exists(temp_output_path):
+        print(f"[ERROR] Temp file {temp_output_path} not found. Skipping replacement...")
+        return False
+
+    original_size = os.path.getsize(original_path)
+    processed_size = os.path.getsize(temp_output_path)
+
+    # An empty output cannot be a valid video; never let it overwrite the
+    # original (which the "< 25 MB" rule below would otherwise do).
+    if processed_size == 0:
+        print(f"[ERROR] Temp file {temp_output_path} is empty. Skipping replacement...")
+        os.remove(temp_output_path)
+        return False
+
+    bytes_per_mb = 1024 * 1024
+    size_original_mb = original_size / bytes_per_mb
+    size_processed_mb = processed_size / bytes_per_mb
+    size_diff_mb = size_original_mb - size_processed_mb
+
+    if size_original_mb < 25:
+        shutil.move(temp_output_path, original_path)
+        return True
+
+    if processed_size >= original_size or size_diff_mb < 1:
+        os.remove(temp_output_path)
+        return False
+
+    # original_size is at least 25 MB here, so the division is safe.
+    size_diff_perc = (1 - processed_size / original_size) * 100
+    print(100*"=")
+    print(f"  Re-encoded is smaller by {size_diff_perc:.2f}% ({size_diff_mb:.2f} MB). Replacing original.")
+    print(f"  Original: {size_original_mb:.2f} MB \n  Re-encoded: {size_processed_mb:.2f} MB.")
+    print(100*"=")
+    shutil.move(temp_output_path, original_path)
+    return True
+
+def reencode_videos_av1(input_folder):
+    """
+    Main loop:
+    1. Gather the video files under `input_folder` (see get_files).
+    2. For each file, read its bitrate/resolution and pick the target bitrate;
+       files without a readable bitrate are skipped and left in place.
+    3. Files that are already AV1 are moved to the 'encoded' folder as-is.
+    4. Everything else is re-encoded into the local 'temp' folder, the result
+       is compared with the original (see check_and_replace_if_smaller) and
+       the file is moved to the 'encoded' folder.
+
+    A file whose encode produced no output is left where it is, so it is not
+    mistaken for a processed file and is retried on the next run.
+    """
+    files = get_files(input_folder)
+
+    # Encodes are written to ./temp (relative to the working directory).
+    os.makedirs('temp', exist_ok=True)
+
+    for input_path in tqdm(files, desc="Processing videos", unit="file"):
+        short_name = parse_text_for_print(input_path)
+        file_size_in_mb = os.path.getsize(input_path) / (1024 * 1024)
+        print(f"\nProcessing {short_name} ({file_size_in_mb:.2f} MB)...")
+
+        # 2) Get current bitrate & resolution
+        current_bitrate, (width, height) = get_video_info(input_path)
+        if not current_bitrate:
+            print("Video's bitrate is not available. Skipping")
+            continue
+
+        target_bitrate = get_target_bitrate(width, height)
+
+        # Never encode above the source bitrate
+        if current_bitrate <= target_bitrate:
+            target_bitrate = current_bitrate
+
+        if is_av1(input_path):
+            print("Video is already encoded in AV1. Skipping")
+            _move_to_encoded(input_folder, input_path)
+            continue
+
+        # 3) Re-encode
+        output_path = os.path.join('temp', os.path.basename(input_path))
+        # Drop any leftover from an earlier run so a failed encode can never
+        # be confused with a finished one.
+        if os.path.exists(output_path):
+            os.remove(output_path)
+        encode_video(input_path, output_path, target_bitrate)
+
+        if not os.path.exists(output_path):
+            print("  Encoding produced no output. Leaving the original in place.")
+            continue
+
+        # 4) Compare file sizes and replace if smaller
+        check_and_replace_if_smaller(input_path, output_path)
+
+        _move_to_encoded(input_folder, input_path)
+
+def _move_to_encoded(input_folder, input_path):
+    """Move `input_path` into the 'encoded' folder directly inside `input_folder`."""
+    new_file_path = os.path.join(input_folder, "encoded", os.path.basename(input_path))
+    os.makedirs(os.path.dirname(new_file_path), exist_ok=True)
+    shutil.move(input_path, new_file_path)
+
+# ---------------------- Main Script Entry ---------------------- #
+if __name__ == "__main__":
+    import sys
+
+    # The folder comes from the first command-line argument; without one,
+    # ask for it interactively.
+    cli_args = sys.argv[1:]
+    input_folder = cli_args[0] if cli_args else input("Enter the input folder path: ")
+
+    if os.path.isdir(input_folder):
+        print("Re-encoding videos to AV1 (only if bitrate is above our resolution-based presets)...")
+        reencode_videos_av1(input_folder)
+        print("All done!")
+    else:
+        print(f"Input folder '{input_folder}' does not exist.")
+        sys.exit(1)