#!/usr/bin/env python3
"""Build a final voice-over video on Ubuntu with ffmpeg."""

from __future__ import annotations

import argparse
import shlex
import shutil
import subprocess
from pathlib import Path

AUDIO_EXTS = {".mp3", ".wav", ".m4a", ".aac", ".flac", ".ogg"}


def run(cmd: list[str]) -> None:
    """Echo *cmd* to stdout, then execute it.

    Args:
        cmd: Program and arguments, passed to ``subprocess.run`` as a list
            (no shell involved).

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    # Quote each argument so the echoed line can be pasted into a shell even
    # when it contains spaces or filter-graph characters such as ';' and '['.
    print("+ " + " ".join(shlex.quote(part) for part in cmd))
    subprocess.run(cmd, check=True)


def require_tool(name: str) -> str:
    """Return the absolute path of executable *name* found on ``PATH``.

    Raises:
        SystemExit: If the executable cannot be found.
    """
    path = shutil.which(name)
    if not path:
        raise SystemExit(f"{name} is required. Install it with: sudo apt install -y ffmpeg")
    return path


def media_duration(path: Path) -> float:
    """Return the container duration of *path* in seconds, as reported by ffprobe.

    Raises:
        subprocess.CalledProcessError: If ffprobe exits non-zero.
        ValueError: If ffprobe prints something that is not a number.
    """
    raw = subprocess.check_output(
        [
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "default=nw=1:nk=1",
            str(path),
        ],
        text=True,
    ).strip()
    try:
        return float(raw)
    except ValueError:
        # Keep the exception type, but say which file and what ffprobe printed.
        raise ValueError(f"ffprobe reported no usable duration for {path}: {raw!r}") from None


def audio_files(audio_dir: Path) -> list[Path]:
    """Return the audio files directly inside *audio_dir*, sorted by path.

    Only regular files whose lower-cased suffix is in ``AUDIO_EXTS`` are kept.

    Raises:
        FileNotFoundError: If no matching file exists in the directory.
    """
    files = [
        path
        for path in sorted(audio_dir.iterdir())
        if path.is_file() and path.suffix.lower() in AUDIO_EXTS
    ]
    if not files:
        raise FileNotFoundError(f"No audio files found in {audio_dir}")
    return files


def concat_audio_dir(audio_dir: Path, work_dir: Path, silence: float) -> Path:
    """Join every audio file in *audio_dir* into a single WAV file.

    Each source is first converted to 48 kHz stereo 16-bit PCM so the concat
    demuxer receives uniform inputs. When *silence* is positive, a silent clip
    of that length is inserted between consecutive sources.

    Args:
        audio_dir: Directory scanned by ``audio_files``.
        work_dir: Directory that receives all intermediate files and the result.
        silence: Gap in seconds between clips; values <= 0 mean no gap.

    Returns:
        Path of the combined WAV file inside *work_dir*.

    Raises:
        FileNotFoundError: If *audio_dir* contains no audio files.
        subprocess.CalledProcessError: If any ffmpeg invocation fails.
    """
    # Discover the inputs first so an empty directory fails before ffmpeg runs.
    sources = audio_files(audio_dir)
    work_dir.mkdir(parents=True, exist_ok=True)

    silence_path = work_dir / "silence.wav"
    # The gap clip is only referenced between two clips, so skip rendering it
    # when there is nothing to separate or the requested gap is not positive.
    use_gap = silence > 0 and len(sources) > 1
    if use_gap:
        run(
            [
                "ffmpeg",
                "-hide_banner",
                "-loglevel",
                "error",
                "-y",
                "-f",
                "lavfi",
                "-t",
                f"{silence:.3f}",
                "-i",
                "anullsrc=channel_layout=stereo:sample_rate=48000",
                "-c:a",
                "pcm_s16le",
                str(silence_path),
            ]
        )

    normalized: list[Path] = []
    for index, src in enumerate(sources, start=1):
        dst = work_dir / f"audio_{index:02d}.wav"
        run(
            [
                "ffmpeg",
                "-hide_banner",
                "-loglevel",
                "error",
                "-y",
                "-i",
                str(src),
                "-vn",
                "-ar",
                "48000",
                "-ac",
                "2",
                "-c:a",
                "pcm_s16le",
                str(dst),
            ]
        )
        normalized.append(dst)

    concat_items: list[Path] = []
    last_index = len(normalized) - 1
    for index, item in enumerate(normalized):
        concat_items.append(item)
        if use_gap and index != last_index:
            concat_items.append(silence_path)

    list_path = work_dir / "audio_concat.txt"
    with list_path.open("w", encoding="utf-8") as handle:
        for item in concat_items:
            # Inside a single-quoted concat entry, a literal quote is written
            # by closing the quote, adding an escaped quote, and reopening.
            escaped = item.resolve().as_posix().replace("'", "'\\''")
            handle.write(f"file '{escaped}'\n")

    out_audio = work_dir / "combined_voice.wav"
    run(
        [
            "ffmpeg",
            "-hide_banner",
            "-loglevel",
            "error",
            "-y",
            "-f",
            "concat",
            "-safe",
            "0",
            "-i",
            str(list_path),
            "-c:a",
            "pcm_s16le",
            str(out_audio),
        ]
    )
    return out_audio


def resolve_speed(override: float | None, video_duration: float, audio_duration: float) -> float:
    """Return the factor by which the video is sped up.

    Args:
        override: Explicit speed from ``--video-speed``, or ``None`` to compute
            the speed automatically.
        video_duration: Source video length in seconds.
        audio_duration: Voice-over length in seconds.

    Returns:
        *override* when given, otherwise ``video_duration / audio_duration``.

    Raises:
        ValueError: If *override* is zero, negative, NaN or infinite.
    """
    if override is None:
        return video_duration / audio_duration
    # The chained comparison is False for NaN, so NaN is rejected as well.
    if not 0 < override < float("inf"):
        raise ValueError("--video-speed must be a finite number greater than 0.")
    return override


def build_filter_graph(speed: float, width: int, height: int, fps: int) -> str:
    """Return the ``-filter_complex`` graph used for the final encode.

    The video chain retimes input 0 by *speed*, sets the frame rate, scales the
    picture to fit inside *width* x *height*, pads it with black to exactly
    that size and outputs ``[v]``. The audio chain resamples input 1 to 48 kHz,
    pads it with trailing silence and outputs ``[a]``.
    """
    return (
        f"[0:v]setpts=PTS/{speed:.8f},fps={fps},"
        f"scale={width}:{height}:force_original_aspect_ratio=decrease,"
        f"pad={width}:{height}:(ow-iw)/2:(oh-ih)/2:black,"
        "setsar=1,format=yuv420p[v];"
        "[1:a]aresample=48000,apad[a]"
    )


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Combine one video with voice-over audio.")
    parser.add_argument("--video", type=Path, required=True, help="Source video path.")
    parser.add_argument("--audio", type=Path, default=None, help="Single voice-over audio file.")
    parser.add_argument("--audio-dir", type=Path, default=None, help="Directory of ordered audio files.")
    parser.add_argument("--output", type=Path, default=Path("05_outputs/final_voiceover.mp4"))
    parser.add_argument("--work-dir", type=Path, default=Path("04_intermediate/ubuntu_voiceover"))
    parser.add_argument("--silence", type=float, default=0.35, help="Gap seconds between audio files.")
    parser.add_argument("--width", type=int, default=1920)
    parser.add_argument("--height", type=int, default=1080)
    parser.add_argument("--fps", type=int, default=30)
    parser.add_argument("--crf", type=int, default=20)
    parser.add_argument("--preset", default="medium")
    parser.add_argument("--video-speed", type=float, default=None, help="Override automatic speed.")
    return parser.parse_args(argv)


def main(argv: list[str] | None = None) -> int:
    """Encode the source video with the voice-over as its audio track.

    Unless ``--video-speed`` is given, the video is retimed so that its length
    matches the voice-over; the output is cut at the voice-over's duration.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read ``sys.argv``.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: If ffmpeg/ffprobe is missing, or if not exactly one of
            ``--audio`` / ``--audio-dir`` was supplied.
        FileNotFoundError: If the video or the audio input does not exist.
        RuntimeError: If either measured duration is not positive.
        ValueError: If ``--video-speed`` is not a finite positive number.
        subprocess.CalledProcessError: If ffmpeg or ffprobe fails.
    """
    args = parse_args(argv)
    require_tool("ffmpeg")
    require_tool("ffprobe")

    if not args.video.exists():
        raise FileNotFoundError(args.video)
    # Compare against None explicitly: exactly one of the two must be set.
    if (args.audio is None) == (args.audio_dir is None):
        raise SystemExit("Use exactly one of --audio or --audio-dir.")

    args.work_dir.mkdir(parents=True, exist_ok=True)
    args.output.parent.mkdir(parents=True, exist_ok=True)

    if args.audio is not None:
        audio_path = args.audio
    else:
        audio_path = concat_audio_dir(args.audio_dir, args.work_dir, args.silence)
    if not audio_path or not audio_path.exists():
        raise FileNotFoundError(audio_path)

    video_duration = media_duration(args.video)
    audio_duration = media_duration(audio_path)
    if video_duration <= 0 or audio_duration <= 0:
        raise RuntimeError("Invalid media duration.")

    speed = resolve_speed(args.video_speed, video_duration, audio_duration)

    print(f"video_duration={video_duration:.3f}s")
    print(f"audio_duration={audio_duration:.3f}s")
    print(f"video_speed={speed:.6f}x")

    vf = build_filter_graph(speed, args.width, args.height, args.fps)
    run(
        [
            "ffmpeg",
            "-hide_banner",
            "-y",
            "-i",
            str(args.video),
            "-i",
            str(audio_path),
            "-filter_complex",
            vf,
            "-map",
            "[v]",
            "-map",
            "[a]",
            # apad makes the audio endless, so the output length is set here.
            "-t",
            f"{audio_duration:.3f}",
            "-c:v",
            "libx264",
            "-preset",
            args.preset,
            "-crf",
            str(args.crf),
            "-c:a",
            "aac",
            "-b:a",
            "192k",
            "-ar",
            "48000",
            "-ac",
            "2",
            "-movflags",
            "+faststart",
            str(args.output),
        ]
    )

    final_duration = media_duration(args.output)
    print(f"output={args.output}")
    print(f"final_duration={final_duration:.3f}s")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())