#!/usr/bin/env python3
"""Convert a YouTube .vtt (manual or auto-captions) into clean [HH:MM:SS] transcript lines.

Usage: vtt_to_transcript.py <in.vtt> <out.txt>

Handles the rolling-duplicate problem in auto-captions: each cue repeats the
tail of the previous cue, so we keep only newly-added words per cue and emit
one line per cue start time. Strips inline <00:00:00.000> word-timing tags
and HTML tags.
"""
import html
import re
import sys

# Cue timing line such as "00:00:01.000 --> 00:00:03.500".
# Groups 1-3 are the start hours/minutes/seconds, groups 4-6 the end ones.
# Hours are required (two digits), so the short "MM:SS.mmm" form is not matched.
TS = re.compile(r'(\d{2}):(\d{2}):(\d{2})\.\d{3}\s*-->\s*(\d{2}):(\d{2}):(\d{2})')

# Any angle-bracket tag: inline word timings (<00:00:00.400>) and markup (<c>, </c>).
INLINE = re.compile(r'<[^>]+>')


def hhmmss(h, m, s):
    """Format hour/minute/second values (ints or digit strings) as ``[HH:MM:SS]``."""
    return f"[{int(h):02d}:{int(m):02d}:{int(s):02d}]"


def clean(text):
    """Return *text* without tags, with HTML entities decoded and whitespace collapsed."""
    text = INLINE.sub('', text)
    # Entities are decoded after tag removal, so an escaped "&lt;b&gt;" survives
    # as literal text instead of being stripped as a tag.
    text = html.unescape(text)
    return re.sub(r'\s+', ' ', text).strip()


# NOTE(review): in the source as received, everything in main() between the
# start of its `while` loop and the tail of its final print() was unreadable.
# The helpers below rebuild that logic from the behaviour described in the
# module docstring -- confirm against the original implementation.

def _parse_cues(lines):
    """Return ``(start_label, cleaned_text)`` for every cue in *lines*, in order."""
    cues = []  # (start_label, text)
    label = None  # start label of the cue being read; None while outside a cue
    payload = []
    for line in lines:
        stamp = TS.match(line.strip())
        # A timing line opens a new cue; a truly empty line closes the current
        # one. Whitespace-only lines (auto-captions emit " ") stay in the cue.
        if stamp or line == '':
            if label is not None:
                cues.append((label, clean(' '.join(payload))))
            label = hhmmss(*stamp.groups()[:3]) if stamp else None
            payload = []
        elif label is not None:
            payload.append(line)
        # Anything else (header, cue identifiers, NOTE blocks) is ignored.
    if label is not None:
        cues.append((label, clean(' '.join(payload))))
    return cues


def _new_words(previous, current):
    """Return the words of *current* left after dropping its longest prefix that equals a suffix of *previous*."""
    for size in range(min(len(previous), len(current)), 0, -1):
        if previous[-size:] == current[:size]:
            return current[size:]
    return current


def _transcript_lines(cues):
    """Return one ``[HH:MM:SS] text`` line per cue that contributes new words."""
    out = []
    previous = []
    for label, text in cues:
        words = text.split()
        if not words:
            continue
        fresh = _new_words(previous, words)
        previous = words
        # Cues that only repeat earlier words add nothing and are dropped.
        if fresh:
            out.append(f"{label} {' '.join(fresh)}")
    return out


def main():
    """Read the .vtt named by ``sys.argv[1]`` and write the transcript to ``sys.argv[2]``.

    Exits with a usage message unless exactly two arguments are given.
    Undecodable bytes in the input are replaced rather than raising.
    """
    if len(sys.argv) != 3:
        # NOTE(review): the placeholder names were missing from the source as
        # received; confirm the intended wording.
        sys.exit("usage: vtt_to_transcript.py <in.vtt> <out.txt>")
    with open(sys.argv[1], encoding='utf-8', errors='replace') as src:
        raw = src.read().splitlines()
    lines = _transcript_lines(_parse_cues(raw))
    with open(sys.argv[2], 'w', encoding='utf-8') as dst:
        dst.writelines(f"{line}\n" for line in lines)
    # NOTE(review): only the trailing "{sys.argv[2]}" of this message was
    # readable in the source; the leading wording is a reconstruction.
    print(f"wrote {len(lines)} lines -> {sys.argv[2]}")


if __name__ == "__main__":
    main()