cleanup and organized

main
oscar 1 month ago
parent 2a7cd821c1
commit fd62cb5a30

@@ -38,14 +38,15 @@ def mark_missing_videos(cursor, conn):
             pbar.update(1)
             video_id, filepath = vid.values()
             if not filepath:
-                continue
-            filename = os.path.basename(filepath)
+                filename = f'{video_id}.mp4'
+            else:
+                filename = os.path.basename(filepath)
             if not find_video_path(filename):
                 print(f"🚫 Missing: {filename}")
                 cursor.execute("UPDATE videos SET status = 'missing' WHERE video_id = %s", (video_id,))
                 conn.commit()

-def fill_missing_filepaths(cursor, conn):
+def find_missing_videos(cursor, conn):
     cursor.execute("SELECT id, filepath, status, video_id FROM videos")
     videos = cursor.fetchall()
@@ -242,7 +243,7 @@ if __name__ == '__main__':
     if True:
         all_videos = get_all_video_files()
-        fill_missing_filepaths(cursor, conn)
+        find_missing_videos(cursor, conn)
         mark_missing_videos(cursor, conn)
         generate_thumbnails_for_videos(cursor, conn)

@@ -0,0 +1,114 @@
+import subprocess
+import json
+import os
+import tempfile
+
+# --- helpers --------------------------------------------------------------- #
+
+def ffprobe_json(fp: str) -> dict:
+    """Return the full ffprobe-JSON for a media file."""
+    cmd = [
+        "ffprobe", "-v", "quiet", "-print_format", "json",
+        "-show_streams", "-show_format", fp
+    ]
+    return json.loads(subprocess.check_output(cmd, text=True))
+
+def get_signature(fp: str) -> tuple:
+    """
+    A signature is everything that has to match for a bit-perfect concat:
+      video: codec, width, height, fps (as a float), pix_fmt, color_range
+      audio: codec, sample_rate, channels, channel_layout
+    """
+    info = ffprobe_json(fp)
+    v_stream = next(s for s in info["streams"] if s["codec_type"] == "video")
+    a_stream = next((s for s in info["streams"] if s["codec_type"] == "audio"), None)
+
+    def fps(stream):
+        fr = stream.get("r_frame_rate", "0/0")
+        num, den = map(int, fr.split("/"))
+        return round(num / den, 3) if den else 0.0
+
+    sig = (
+        v_stream["codec_name"],
+        int(v_stream["width"]), int(v_stream["height"]),
+        fps(v_stream),
+        v_stream.get("pix_fmt"),
+        v_stream.get("color_range"),
+        a_stream["codec_name"] if a_stream else None,
+        int(a_stream["sample_rate"]) if a_stream else None,
+        a_stream.get("channels") if a_stream else None,
+        a_stream.get("channel_layout") if a_stream else None,
+    )
+    return sig
+
+def all_signatures_equal(videos):
+    ref = get_signature(videos[0]["filepath"])
+    return all(get_signature(v["filepath"]) == ref for v in videos[1:])
+
+def concat_copy(videos, out_path):
+    """Lossless concat with the *concat demuxer* (-c copy)."""
+    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
+        for v in videos:
+            # The concat demuxer expects embedded single quotes escaped as '\''
+            path = os.path.abspath(v["filepath"]).replace("'", "'\\''")
+            f.write(f"file '{path}'\n")
+        list_file = f.name
+    cmd = [
+        "ffmpeg", "-y",
+        "-f", "concat", "-safe", "0",
+        "-i", list_file,
+        "-c", "copy",
+        out_path,
+    ]
+    print("Running FFmpeg concat...")
+    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    os.unlink(list_file)
+    # Look for specific error patterns in FFmpeg's stderr
+    ffmpeg_errors = [
+        "corrupt input packet",
+        "Invalid OBU",
+        "Failed to parse temporal unit",
+        "Packet corrupt",
+        "partial file",
+        "Non-monotonic DTS",
+    ]
+    if result.returncode != 0 or any(err in result.stderr for err in ffmpeg_errors):
+        print("❌ FFmpeg concat failed or produced corrupted output.")
+        # Remove the broken file if it exists
+        if os.path.exists(out_path):
+            os.remove(out_path)
+            print(f"🗑️ Removed corrupt output: {out_path}")
+        return False
+    print("✅ FFmpeg concat completed successfully.")
+    return True
+
+def copy_concatenate_videos(videos_list):
+    if not (len(videos_list) > 1 and all_signatures_equal(videos_list)):
+        print("Streams are not compatible for lossless concat.")
+        return False
+    print("All streams are compatible, attempting lossless concat …")
+    main_video = videos_list[0]
+    video_path = main_video["filepath"]
+    output_path = os.path.join("temp", os.path.basename(video_path))
+    os.makedirs("temp", exist_ok=True)
+    os.makedirs("concated", exist_ok=True)
+    success = concat_copy(videos_list, output_path)
+    if not success:
+        print("Falling back to re-encoding due to concat failure.")
+        return False
+    # Remove the originals
+    for v in videos_list:
+        os.remove(v["filepath"])
+    # Move the temp output into the concated folder
+    os.rename(output_path, os.path.join("concated", os.path.basename(video_path)))
+    return main_video
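For context, a minimal usage sketch of the new module (file names here are hypothetical; copy_concatenate_videos returns the first video's dict on success and False otherwise):

    parts = [
        {"filepath": "videos/abc123_part1.mp4"},
        {"filepath": "videos/abc123_part2.mp4"},
    ]
    merged = copy_concatenate_videos(parts)
    if merged:
        print("Merged into", os.path.join("concated", os.path.basename(merged["filepath"])))
    else:
        print("Not concat-compatible; re-encode this group instead.")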

@@ -32,9 +32,7 @@ def organize_videos():
     # group the videos for concatenation
     for video_list in sorted_processed_videos:
-        first_video = video_list[0]
-        video_id = os.path.splitext(os.path.basename(first_video['filepath']))[0]
+        video_id = video_list[0]['video_id']
         videos_sum_size = sum([video['size'] for video in video_list])
@@ -54,7 +52,7 @@ def organize_videos():
         print(f"Videos are fucked.")
         main_video = video_list[0]
-        video_name = main_video['videoID']
+        video_name = main_video['video_id']
         fucked_dir = os.path.join("concate_fucked", video_name)
         os.makedirs(fucked_dir, exist_ok=True)

@@ -1,5 +1,5 @@
 from datetime import datetime, timedelta, timezone
-from VideoManager import get_duration
+from video_manager import get_duration
 import os, json, subprocess, shutil
@@ -34,14 +34,13 @@ def update_video_data(dataPath, data):
     if existing_data == data:
         return  # No update needed if data hasn't changed.
-    data["updatedAt"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    data["updated_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     with open(dataPath, "w") as f:
         json.dump(data, f)  # Write to file if new or if data has changed.

-def is_recent(updated_at_str, minutes=30):
-    updated_at = format_datetime(updated_at_str)
+def is_recent(updated_at, minutes=30):
     updated_at = updated_at.replace(tzinfo=timezone.utc)
     now = datetime.now(timezone.utc)
     return now - updated_at < timedelta(minutes=minutes)
@@ -104,7 +103,7 @@ def group_videos(video_list, sort_by="count", order="desc"):
     # Ensure videos for each user and site are sorted by creation date
     for key in video_data:
-        video_data[key].sort(key=lambda x: format_datetime(x["createdAt"]))
+        video_data[key].sort(key=lambda x: x["created_at"])

     # Further sort groups if required based on size or count
     if sort_by == "size":
@@ -124,8 +123,8 @@ def process_videos(video_data):
         video_path = video["filepath"]
         data_path = video["jsonpath"]
-        if 'size' not in video:
-            filesize = get_file_size_in_mb(video_path)
+        filesize = get_file_size_in_mb(video_path)
+        if 'size' not in video or video['size'] != filesize:
             video['size'] = filesize
             is_updated = True
@@ -135,8 +134,8 @@ def process_videos(video_data):
         # Move corrupted videos to the failed folder
         if video['duration'] == 0:
-            print(f"{video['videoID']} is corrupted, moving to failed folder")
-            failed_video_path = os.path.join(failed_directory, video["videoID"] + ".mp4")
+            print(f"{video['video_id']} is corrupted, moving to failed folder")
+            failed_video_path = os.path.join(failed_directory, video["video_id"] + ".mp4")
             failed_data_path = failed_video_path.replace(".mp4", ".json")
             shutil.move(video_path, failed_video_path)
@@ -166,7 +165,7 @@ def group_for_concatenation(videos, time_limit=30):
     reference_params = None  # We'll store the 'ffprobe' params for the first video in each group
     for video in videos:
-        video_start = format_datetime(video['createdAt'])
+        video_start = video['created_at']
         video_end = video_start + timedelta(seconds=video['duration'])

         # Probe the video to get parameters
@@ -218,7 +217,7 @@ def group_for_concatenation(videos, time_limit=30):
     if concatenated_video_groups:
         last_group = concatenated_video_groups[-1]
         last_video = last_group[-1]
-        last_updated_at = datetime.strptime(last_video['createdAt'], "%Y-%m-%d %H:%M:%S")
+        last_updated_at = datetime.strptime(last_video['created_at'], "%Y-%m-%d %H:%M:%S")
         if datetime.now() - last_updated_at <= timedelta(minutes=time_limit):
             print(f"Last group is not ready for upload. Removing from final groups.")
             concatenated_video_groups.pop()
@@ -287,10 +286,10 @@ def get_video_params(video_path):
 def generate_list_file(videos):
     directory = os.path.dirname(videos[0]["filepath"])
-    list_filename = os.path.join(directory, f"{videos[0]['videoID']}.txt")
+    list_filename = os.path.join(directory, f"{videos[0]['video_id']}.txt")
     with open(list_filename, "w") as list_file:
         for video in videos:
-            list_file.write(f"file '{video['videoID']}.mp4'\n")
+            list_file.write(f"file '{video['video_id']}.mp4'\n")
     return list_filename
@@ -398,4 +397,54 @@ def calculate_file_hash(file_path):
     import hashlib
     with open(file_path, 'rb') as f:
         data = f.read()
     return hashlib.sha256(data).hexdigest()
+
+def group_for_concatenation_simple(videos, time_limit=60):
+    """
+    Groups videos into lists where:
+      - total group size <= 9 GB (9216 MB),
+      - time gap between consecutive videos <= time_limit minutes.
+    Videos are assumed to already share the same resolution/fps/codecs for
+    a no-reencode concat; that compatibility is not re-checked here.
+    """
+    concatenated_video_groups = []
+    current_group = []
+    current_size_mb = 0
+    last_video_end = None
+    for video in videos:
+        video_start = video['created_at']
+        video_end = video_start + timedelta(seconds=video['duration'])
+        if current_group:
+            # Check whether adding this video breaks the size or time-gap limits
+            time_difference = (video_start - last_video_end).total_seconds() / 60
+            size_exceeded = (current_size_mb + video['size'] > 9216)
+            time_exceeded = (time_difference > time_limit)
+            # If we exceed the size cap or the time gap => start a new group
+            if size_exceeded or time_exceeded:
+                concatenated_video_groups.append(current_group)
+                current_group = []
+                current_size_mb = 0
+        # Add the current video to the group
+        current_group.append(video)
+        current_size_mb += video['size']
+        last_video_end = video_end
+    # Add the last group if not empty
+    if current_group:
+        concatenated_video_groups.append(current_group)
+    # Optional: ensure the last group is "ready" for upload based on time difference
+    if concatenated_video_groups:
+        last_group = concatenated_video_groups[-1]
+        last_video = last_group[-1]
+        last_updated_at = last_video['created_at']
+        if datetime.now() - last_updated_at <= timedelta(minutes=time_limit):
+            print(f"Last group is not ready for upload. Removing from final groups.")
+            concatenated_video_groups.pop()
+    # Single videos can't be concatenated; drop groups of one
+    concatenated_video_groups = [group for group in concatenated_video_groups if len(group) > 1]
+    return concatenated_video_groups
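To make the thresholds concrete, a walk-through with made-up numbers (sizes in MB, time_limit=60, durations negligible, and the last video well over an hour old so the readiness check passes):

    # A: created 10:00, 5000 MB -> starts group 1
    # B: created 10:30, 5000 MB -> 5000 + 5000 > 9216, so B starts group 2
    # C: created 12:00,  100 MB -> gap from B is ~90 min > 60, so C starts group 3
    # Every group holds a single video, so the final len(group) > 1 filter drops
    # them all and the function returns [] — only runs of back-to-back clips
    # that fit under the 9 GB cap survive as concat candidates.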

@@ -210,6 +210,10 @@ def smart_choice(cursor, small_mb=250):
         created_at DESC;
     """, (small_mb,))
     return cursor.fetchall()
+
+def select_user_videos(username, cursor):
+    cursor.execute("SELECT * FROM videos WHERE username = %s AND status != 'missing' AND codec IS NULL ORDER BY size ASC", (username,))
+    return cursor.fetchall()

 def reencode_videos_av1():
     # get videos
@@ -218,7 +222,11 @@ def reencode_videos_av1():
     # videos = cursor.fetchall()
     while True:
-        videos = smart_choice(cursor)
+        username = input("Enter username: ")
+        if username:
+            videos = select_user_videos(username, cursor)
+        else:
+            videos = smart_choice(cursor)
         with tqdm(videos, desc="Processing videos", unit="file") as pbar:
             for video in videos:
                 pbar.update(1)
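For reference, the new per-user path can also be exercised directly; the %s placeholder means the username is passed as a bound parameter (psycopg2-style), not interpolated into the SQL string. The connection setup below is a hypothetical sketch:

    import psycopg2
    conn = psycopg2.connect("dbname=videos")  # hypothetical DSN
    cursor = conn.cursor()
    videos = select_user_videos("alice", cursor)  # smallest files first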
