#!/usr/bin/env python3 """ ingest.py: YouTube-to-vault normalizer for the ingest-youtube skill. Takes a YouTube URL (and optional vault root), shells out to yt-dlp for metadata + subtitles, cleans VTT timing markers into prose, and writes External Inputs/YouTube//-.md. Stdout: human-readable summary. Exit non-zero on any failure (no silent partial writes). Usage: python3 ingest.py [--vault ] [--lang ] Defaults: --vault: $VAULT_ROOT or current dir --lang: en,es (try English first, then Spanish; matches a common EN+ES bilingual default for users with multilingual content) --whisper: accepted as a future fallback flag; this version writes a stub if subtitles are unavailable """ from __future__ import annotations import argparse import html import json import os import re import shutil import subprocess import sys import tempfile from datetime import datetime, timezone from pathlib import Path from urllib.parse import parse_qs, urlparse, urlunparse VTT_TIMING_RE = re.compile(r"\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}.*") VTT_HEADER_RE = re.compile(r"^(WEBVTT|Kind:|Language:|NOTE\s|X-TIMESTAMP-MAP)", re.MULTILINE) SLUG_RE = re.compile(r"[^a-z0-9]+") YOUTUBE_VIDEO_ID_RE = re.compile(r"^[A-Za-z0-9_-]{11}$") SUBPROCESS_TIMEOUT_SECONDS = 60 SEED_KEYWORDS = ( "decision", "framework", "model", "principle", "the lesson is", "playbook", "anti-pattern", "case study", "what i learned", "the trick is", "the insight is", ) def validate_youtube_url(raw_url: str) -> str: if not raw_url or raw_url.startswith("-") or any(ord(ch) < 32 or ord(ch) == 127 for ch in raw_url): raise ValueError("URL must be a valid http(s) YouTube video URL") parsed = urlparse(raw_url) if parsed.scheme not in {"http", "https"}: raise ValueError("URL must use http or https") host = parsed.hostname.lower() if parsed.hostname else "" video_id = "" if host in {"youtube.com", "www.youtube.com", "m.youtube.com", "music.youtube.com"}: parts = [part for part in parsed.path.split("/") if part] if parsed.path == "/watch": video_id = parse_qs(parsed.query).get("v", [""])[0] elif len(parts) >= 2 and parts[0] in {"shorts", "embed", "v"}: video_id = parts[1] elif host == "youtu.be": video_id = parsed.path.lstrip("/").split("/", 1)[0] if not YOUTUBE_VIDEO_ID_RE.fullmatch(video_id): raise ValueError("URL must point to a single YouTube video") return urlunparse(("https", "www.youtube.com", "/watch", "", f"v={video_id}", "")) def run_ytdlp(args: list[str]) -> subprocess.CompletedProcess[str]: return subprocess.run( args, capture_output=True, text=True, check=False, timeout=SUBPROCESS_TIMEOUT_SECONDS, ) def yaml_scalar(value: object) -> str: if isinstance(value, bool): return "true" if value else "false" if isinstance(value, (int, float)): return str(value) if value is None: return '""' return json.dumps(str(value), ensure_ascii=False) def markdown_text(value: object) -> str: text = html.escape(str(value), quote=False) return re.sub(r"([\\`*_{}\[\]()#+.!|-])", r"\\\1", text) def slugify(text: str, max_len: int = 60) -> str: s = SLUG_RE.sub("-", text.lower()).strip("-") return s[:max_len].rstrip("-") or "untitled" def require_bin(name: str) -> str: path = shutil.which(name) if not path: sys.stderr.write( f"Error: {name} not installed. Install with `brew install {name}` " f"(macOS) or `pip3 install --user {name}`.\n" ) sys.exit(2) return path def fetch_metadata(url: str, ytdlp: str) -> dict: proc = run_ytdlp([ytdlp, "--ignore-config", "--skip-download", "--print-json", "--no-warnings", "--", url]) if proc.returncode != 0: sys.stderr.write(f"yt-dlp metadata fetch failed:\n{proc.stderr}\n") sys.exit(3) return json.loads(proc.stdout) def list_subs(url: str, ytdlp: str) -> str: proc = run_ytdlp([ytdlp, "--ignore-config", "--list-subs", "--skip-download", "--no-warnings", "--", url]) return proc.stdout def parse_available_subs(listing: str) -> tuple[set[str], set[str]]: """Return (manual_langs, auto_langs) from --list-subs output.""" manual: set[str] = set() auto: set[str] = set() section = None for line in listing.splitlines(): low = line.strip().lower() if "available subtitles" in low: section = "manual" continue if "available automatic captions" in low: section = "auto" continue if not line.strip() or line.startswith("Language"): continue if section in ("manual", "auto"): code = line.split()[0] if line.split() else "" if re.fullmatch(r"[a-z]{2,3}(-[a-zA-Z0-9]+)?", code): (manual if section == "manual" else auto).add(code) return manual, auto def pick_lang(prefs: list[str], manual: set[str], auto: set[str]) -> tuple[str, str] | None: """Return (lang_code, source) where source is 'manual' or 'auto', or None.""" for code in prefs: if code in manual: return code, "manual" for code in prefs: if code in auto: return code, "auto" if manual: return next(iter(sorted(manual))), "manual" if auto: return next(iter(sorted(auto))), "auto" return None def download_subs(url: str, lang: str, source: str, ytdlp: str, workdir: Path) -> Path: flag = "--write-sub" if source == "manual" else "--write-auto-sub" out_template = str(workdir / "%(id)s.%(ext)s") proc = run_ytdlp([ ytdlp, "--ignore-config", flag, "--sub-lang", lang, "--skip-download", "--sub-format", "vtt", "-o", out_template, "--no-warnings", "--", url, ]) if proc.returncode != 0: sys.stderr.write(f"yt-dlp subtitle download failed:\n{proc.stderr}\n") sys.exit(4) matches = list(workdir.glob("*.vtt")) if not matches: sys.stderr.write("yt-dlp reported success but no .vtt file landed\n") sys.exit(5) return matches[0] def clean_vtt(vtt_path: Path) -> str: raw = vtt_path.read_text(encoding="utf-8", errors="replace") lines = [] seen_phrases: set[str] = set() for line in raw.splitlines(): line = line.rstrip() if not line: continue if VTT_TIMING_RE.match(line) or VTT_HEADER_RE.match(line): continue if line.isdigit(): continue cleaned = re.sub(r"<[^>]+>", "", line).strip() if not cleaned: continue if cleaned in seen_phrases: continue seen_phrases.add(cleaned) lines.append(cleaned) text = " ".join(lines) text = re.sub(r"\s+", " ", text).strip() sentences = re.split(r"(?<=[.!?])\s+(?=[A-ZÁÉÍÓÚÑ¿¡])", text) return "\n\n".join(s.strip() for s in sentences if s.strip()) def detect_seeds(transcript: str) -> list[str]: low = transcript.lower() return [kw for kw in SEED_KEYWORDS if kw in low] def write_vault_file( vault_root: Path, channel_slug: str, upload_date: str, video_slug: str, frontmatter: dict, body: str, ) -> Path: target_dir = vault_root / "External Inputs" / "YouTube" / channel_slug target_dir.mkdir(parents=True, exist_ok=True) target = target_dir / f"{upload_date}-{video_slug}.md" yaml_lines = ["---"] for k, v in frontmatter.items(): yaml_lines.append(f"{k}: {yaml_scalar(v)}") yaml_lines.append("---") target.write_text("\n".join(yaml_lines) + "\n\n" + body + "\n", encoding="utf-8") return target def write_seed_stub( vault_root: Path, upload_date: str, channel_slug: str, video_id: str, seeds: list[str], video_url: str, video_title: str, ) -> Path: captures_dir = vault_root / "Meta" / "Captures" captures_dir.mkdir(parents=True, exist_ok=True) fname = f"{upload_date}-youtube-{channel_slug}-{video_id}.md" target = captures_dir / fname body = ( "---\n" "type: capture\n" "source: youtube\n" f"video_url: {yaml_scalar(video_url)}\n" f"detected_at: {yaml_scalar(datetime.now(timezone.utc).isoformat())}\n" f"keywords: {yaml_scalar(', '.join(seeds))}\n" "status: open\n" "---\n\n" f"# Capture seed: {markdown_text(video_title)}\n\n" f"Trigger keywords detected in transcript: {markdown_text(', '.join(seeds))}.\n\n" f"Source: {markdown_text(video_url)}\n\n" "## Notes\n\n(fill in)\n" ) target.write_text(body, encoding="utf-8") return target def main() -> int: parser = argparse.ArgumentParser(description="Ingest a YouTube video transcript into the vault") parser.add_argument("url", help="YouTube video URL") parser.add_argument("--vault", default=None, help="Vault root path (default: $VAULT_ROOT or .)") parser.add_argument("--lang", default="en,es", help="Comma-separated language preference") parser.add_argument("--whisper", action="store_true", help="Enable Whisper fallback if no subs") args = parser.parse_args() vault_root = Path(args.vault or os.environ.get("VAULT_ROOT") or ".").resolve() if not vault_root.is_dir(): sys.stderr.write(f"Vault root not a directory: {vault_root}\n") return 1 try: youtube_url = validate_youtube_url(args.url) except ValueError as exc: sys.stderr.write(f"Invalid YouTube URL: {exc}\n") return 2 ytdlp = require_bin("yt-dlp") prefs = [c.strip() for c in args.lang.split(",") if c.strip()] meta = fetch_metadata(youtube_url, ytdlp) video_id = meta.get("id", "unknown") title = meta.get("title", "Untitled") channel = meta.get("channel") or meta.get("uploader") or "unknown-channel" channel_slug = slugify(channel) video_slug = slugify(title) upload_date_raw = meta.get("upload_date", "") upload_date = ( f"{upload_date_raw[:4]}-{upload_date_raw[4:6]}-{upload_date_raw[6:8]}" if len(upload_date_raw) == 8 else datetime.now().strftime("%Y-%m-%d") ) listing = list_subs(youtube_url, ytdlp) manual, auto = parse_available_subs(listing) pick = pick_lang(prefs, manual, auto) sub_source = "none" transcript = "" lang_code = "und" if pick: lang_code, sub_source = pick with tempfile.TemporaryDirectory() as td: vtt = download_subs(youtube_url, lang_code, sub_source, ytdlp, Path(td)) transcript = clean_vtt(vtt) elif args.whisper: sys.stderr.write("Whisper fallback requested but not yet implemented in v0.1.\n") sys.stderr.write("Install whisper-cpp + ggml model and re-run, or pre-add subs to the video.\n") sub_source = "none" else: sys.stderr.write("No subtitles available and --whisper not set. Writing stub.\n") sub_source = "none" word_count = len(transcript.split()) if transcript else 0 seeds = detect_seeds(transcript) if transcript else [] body = transcript or ( f"# {markdown_text(title)}\n\n" f"No subtitles or auto-captions available for this video.\n\n" "To capture this transcript, add captions to the source video or transcribe the audio " "with your local Whisper workflow and re-run ingest.\n\n" f"Source: {markdown_text(youtube_url)}\n" ) fm = { "type": "external-input", "source": "youtube", "video_id": video_id, "url": youtube_url, "channel": channel, "channel_url": meta.get("channel_url", ""), "title": title, "upload_date": upload_date, "duration_seconds": meta.get("duration", 0), "language": lang_code, "subtitle_source": sub_source, "word_count": word_count, "ingested_at": datetime.now(timezone.utc).isoformat(), } target = write_vault_file(vault_root, channel_slug, upload_date, video_slug, fm, body) seed_paths: list[Path] = [] if seeds: seed_paths.append( write_seed_stub(vault_root, upload_date, channel_slug, video_id, seeds, youtube_url, title) ) seed_str = f" Seeds at: {', '.join(str(p) for p in seed_paths)}." if seed_paths else "" print( f"Wrote {word_count} words to {target}. " f"Language: {lang_code}. Subtitle source: {sub_source}.{seed_str}" ) return 0 if __name__ == "__main__": sys.exit(main())