from datetime import datetime, timedelta, timezone from video_manager import get_duration import os, json, subprocess, shutil def is_file_empty(filepath): return os.stat(filepath).st_size == 0 def format_datetime(datetime_str): """Format the datetime string to a more readable format.""" return datetime.strptime(datetime_str, "%Y-%m-%d %H:%M:%S") def get_file_size_in_mb(file_path): return os.path.getsize(file_path) / (1024 ** 2) def get_file_size_gb(file_path): return os.path.getsize(file_path) / 1024 / 1024 / 1024 def get_data(data_path): try: with open(data_path, 'r') as file: data = json.load(file) return data except Exception as e: print(f"Error loading {data_path}: {e}") return None def update_video_data(dataPath, data): """Update or create a JSON file for the video metadata.""" if os.path.exists(dataPath): with open(dataPath, "r") as f: existing_data = json.load(f) if existing_data == data: return # No update needed if data hasn't changed. data["updated_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S") with open(dataPath, "w") as f: json.dump(data, f) # Write to file if new or if data has changed. def is_recent(updated_at, minutes=30): updated_at = updated_at.replace(tzinfo=timezone.utc) now = datetime.now(timezone.utc) return now - updated_at < timedelta(minutes=minutes) def is_file_size_bigger_than(file_size_in_mb, max_size_gb): """Check if the file size is bigger than the specified max size in GB.""" max_size_megabytes = max_size_gb * 1024 # Convert GB to MB return file_size_in_mb > max_size_megabytes def cleanup_data_files(folder_path): videos = [video for video in os.listdir(folder_path) if video.endswith(".json")] for filename in videos: json_path = os.path.join(folder_path, filename) video_path = json_path.replace(".json", ".mp4") if not os.path.exists(video_path): os.remove(json_path) def get_video_data(videoPath): with open(videoPath, "r") as f: data = json.load(f) return data def get_videos(folder_path): """Retrieve video metadata from the JSON files in a specified folder.""" video_list = [] # List all .mp4 files and their corresponding .json metadata files videos = [f for f in os.listdir(folder_path) if f.endswith(".mp4")] for video_filename in videos: video_path = os.path.join(folder_path, video_filename) json_path = video_path.replace(".mp4", ".json") if not os.path.exists(json_path): continue data = get_video_data(json_path) data['size'] = get_file_size_in_mb(video_path) # Include size in MB for further processing data['filepath'] = video_path video_list.append(data) return video_list def group_videos(video_list, sort_by="count", order="desc"): """Group video data by username and site, and sort the groups by video creation time.""" video_data = {} is_desc = order == "desc" for video in video_list: key = (video["username"], video["site"]) if key not in video_data: video_data[key] = [] video_data[key].append(video) # Ensure videos for each user and site are sorted by creation date for key in video_data: video_data[key].sort(key=lambda x: (x["created_at"])) # Further sort groups if required based on size or count if sort_by == "size": video_data = dict(sorted(video_data.items(), key=lambda x: sum(item['size'] for item in x[1]), reverse=is_desc)) elif sort_by == "count": video_data = dict(sorted(video_data.items(), key=lambda x: len(x[1]), reverse=is_desc)) return video_data def process_videos(video_data): processed_videos = [] failed_directory = "failed" for video in video_data: is_updated = False video_path = video["filepath"] data_path = video["jsonpath"] filesize = get_file_size_in_mb(video_path) if 'size' not in video or video['size'] != filesize: video['size'] = filesize is_updated = True if is_updated and 'duration' not in video: video['duration'] = get_duration(video_path) is_updated = True # Move corrupted videos to the failed folder if video['duration'] == 0: print(f"{video['video_id']} is corrupted, moving to failed folder") failed_video_path = os.path.join(failed_directory, video["video_id"] + ".mp4") failed_data_path = failed_video_path.replace(".mp4", ".json") shutil.move(video_path, failed_video_path) shutil.move(data_path, failed_data_path) continue # Skip further processing for this video if is_updated: update_video_data(data_path, video) processed_videos.append(video) return processed_videos def group_for_concatenation(videos, time_limit=30): """ Groups videos into lists where: - total group size <= 9GB (9216 MB), - time gap between consecutive videos <= time_limit minutes, - AND all have the same resolution/fps/codecs for no-reencode concat. """ concatenated_video_groups = [] current_group = [] current_size_mb = 0 last_video_end = None reference_params = None # We'll store the 'ffprobe' params for the first video in each group for video in videos: video_start = (video['created_at']) video_end = video_start + timedelta(seconds=video['duration']) # Probe the video to get parameters video_path = video['filepath'] params = get_video_params(video_path) if params is None: # If ffprobe fails, skip or handle the error print(f"Skipping {video_path}, failed to get ffprobe info.") continue if current_group: # Check if adding this video breaks the size limit time_difference = (video_start - last_video_end).total_seconds() / 60 size_exceeded = (current_size_mb + video['size'] > 9216) time_exceeded = (time_difference > time_limit) # Check if the video parameters match the group's reference param_mismatch = False if reference_params: # Compare relevant fields for field in ['video_codec','width','height','pix_fmt','fps', 'audio_codec','audio_sample_rate','audio_channels','audio_channel_layout']: if params[field] != reference_params[field]: param_mismatch = True break # If we exceed size, exceed time gap, or mismatch in parameters => start new group if size_exceeded or time_exceeded or param_mismatch: concatenated_video_groups.append(current_group) current_group = [] current_size_mb = 0 reference_params = None # reset for new group # If we're starting a new group, set reference parameters if not current_group: reference_params = params # Add the current video to the group current_group.append(video) current_size_mb += video['size'] last_video_end = video_end # Add the last group if not empty if current_group: concatenated_video_groups.append(current_group) # Optional: Ensure the last group is "ready" for upload based on time difference # (Your original logic that if last video was updated < time_limit minutes ago, remove the group) if concatenated_video_groups: last_group = concatenated_video_groups[-1] last_video = last_group[-1] last_updated_at = datetime.strptime(last_video['created_at'], "%Y-%m-%d %H:%M:%S") if datetime.now() - last_updated_at <= timedelta(minutes=time_limit): print(f"Last group is not ready for upload. Removing from final groups.") concatenated_video_groups.pop() concatenated_video_groups = [group for group in concatenated_video_groups if len(group) > 1] return concatenated_video_groups def get_video_params(video_path): """ Run ffprobe on a given video path to extract: - codec_name (video + audio) - width, height - pix_fmt - r_frame_rate (frame rate) - sample_rate, channel_layout (audio) Returns a dict with these parameters or None if there's an error. """ cmd = [ 'ffprobe', '-v', 'error', '-print_format', 'json', '-show_streams', '-show_format', video_path ] try: result = subprocess.run(cmd, capture_output=True, text=True, check=True) info = json.loads(result.stdout) # We'll parse out the first video & audio streams we find. video_stream = next((s for s in info['streams'] if s['codec_type'] == 'video'), None) audio_stream = next((s for s in info['streams'] if s['codec_type'] == 'audio'), None) if not video_stream: raise ValueError(f"No video stream found in {video_path}") # Frame rate can be something like "30000/1001" - convert to float r_frame_rate = video_stream.get('r_frame_rate', '0/0') try: num, den = r_frame_rate.split('/') fps = float(num) / float(den) if float(den) != 0 else 0.0 except: fps = 0.0 # Gather the key parameters params = { 'video_codec': video_stream.get('codec_name', 'unknown'), 'width': video_stream.get('width', 0), 'height': video_stream.get('height', 0), 'pix_fmt': video_stream.get('pix_fmt', 'unknown'), 'fps': fps, 'audio_codec': audio_stream.get('codec_name', 'none') if audio_stream else 'none', 'audio_sample_rate': audio_stream.get('sample_rate', '0') if audio_stream else '0', 'audio_channels': audio_stream.get('channels', 0) if audio_stream else 0, 'audio_channel_layout': audio_stream.get('channel_layout', 'none') if audio_stream else 'none' } return params except subprocess.CalledProcessError as e: print(f"Failed to run ffprobe on {video_path}: {e}") return None def generate_list_file(videos): directory = os.path.dirname(videos[0]["filepath"]) list_filename = os.path.join(directory, f"{videos[0]['video_id']}.txt") with open(list_filename, "w") as list_file: for video in videos: list_file.write(f"file '{video['video_id']}.mp4'\n") return list_filename def concatenate_videos(grouped_videos, directory): """Concatenate pre-grouped videos, updating metadata and managing file operations.""" processed_videos = [] for group in grouped_videos: if len(group) == 1: processed_videos.append(group[0]) continue # Set up paths based on the first video in the group first_video = group[0] video_path = first_video["filepath"] data_path = video_path.replace(".mp4", ".json") temp_path = video_path.replace(".mp4", "_temp.mp4") # Generate a list file for ffmpeg concatenation list_filename = generate_list_file(directory, group) # Run ffmpeg to concatenate videos subprocess.run(["ffmpeg", "-f", "concat", "-safe", "0", "-i", list_filename, "-c", "copy", temp_path]) # Remove individual video files and their metadata [os.remove(v["filepath"]) for v in group] [os.remove(v["filepath"].replace(".mp4", ".json")) for v in group] os.remove(list_filename) os.rename(temp_path, video_path) # Update the metadata for the concatenated video first_video["filepath"] = video_path first_video["size"] = get_file_size_in_mb(video_path) first_video["duration"] = get_duration(video_path) update_video_data(data_path, first_video) # Ensure this function reflects the changes of concatenation processed_videos.append(first_video) return processed_videos def get_all_videos(directory): # find all .mp4 files in the directory and its subdirectories videos = [] for root, dirs, files in os.walk(directory): for file in files: if file.endswith(".mp4"): videos.append(os.path.join(root, file)) return videos def get_all_data(directory): # finds all json files in the directory and its subdirectories data = [] for root, dirs, files in os.walk(directory): for file in files: if file.endswith(".json"): data.append(os.path.join(root, file)) return data def match_data_to_video_fast(videos, data): data_dict = {os.path.splitext(os.path.basename(d))[0]: d for d in data} matched, unmatched = [], [] for v in videos: video_id = os.path.splitext(os.path.basename(v))[0] if video_id in data_dict: matched.append((v, data_dict[video_id])) else: unmatched.append(v) return parse_video_data(matched), unmatched def parse_video_data(matched_videos): """Retrieve video metadata from the JSON files in a specified folder.""" import tqdm video_list = [] with tqdm.tqdm(total=len(matched_videos), desc="Parsing video data") as pbar: for video in matched_videos: pbar.update(1) video_path, json_path = video data = get_video_data(json_path) data['filepath'] = video_path data['jsonpath'] = json_path video_list.append(data) return video_list def get_videos_matched(video_dirs, data_dirs): # get all videos videos = [] for d in video_dirs: videos += get_all_videos(d) # get all data data = [] for d in data_dirs: data += get_all_data(d) # match the data to the videos parsed_videos, unmatched = match_data_to_video_fast(videos, data) return parsed_videos, unmatched def calculate_file_hash(file_path): import hashlib with open(file_path, 'rb') as f: data = f.read() return hashlib.sha256(data).hexdigest() def group_for_concatenation_simple(videos, time_limit=120): """ Groups videos into lists where: - total group size <= 9GB (9216 MB), - time gap between consecutive videos <= time_limit minutes, - AND all have the same resolution/fps/codecs for no-reencode concat. """ concatenated_video_groups = [] current_group = [] current_size_mb = 0 last_video_end = None for video in videos: video_start = video['created_at'] video_end = video_start + timedelta(seconds=video['duration']) if current_group: # Check if adding this video breaks the size limit time_difference = (video_start - last_video_end).total_seconds() / 60 time_exceeded = (time_difference > time_limit) # size_exceeded = (current_size_mb + video['size'] > 9216) # If we exceed size, exceed time gap, or mismatch in parameters => start new group if time_exceeded: #or size_exceeded: concatenated_video_groups.append(current_group) current_group = [] current_size_mb = 0 # Add the current video to the group current_group.append(video) current_size_mb += video['size'] last_video_end = video_end # Add the last group if not empty if current_group: concatenated_video_groups.append(current_group) # Optional: Ensure the last group is "ready" for upload based on time difference if concatenated_video_groups: last_group = concatenated_video_groups[-1] last_video = last_group[-1] last_updated_at = last_video['created_at'] if datetime.now() - last_updated_at <= timedelta(minutes=time_limit): print(f"Last group is not ready for upload. Removing from final groups.") concatenated_video_groups.pop() concatenated_video_groups = [group for group in concatenated_video_groups if len(group) > 1] return concatenated_video_groups