playbook/antigravity-awesome-skills/skills/youtube-notetaker/scripts/vtt_to_transcript.py

60 lines
2.1 KiB
Python
Executable File

#!/usr/bin/env python3
"""Convert a YouTube .vtt (manual or auto-captions) into clean [HH:MM:SS] transcript lines.
Usage: vtt_to_transcript.py <input.vtt> <output.txt>
Handles the rolling-duplicate problem in auto-captions: each cue repeats the tail of the
previous cue, so we keep only newly-added words per cue and emit one line per cue start
time. Strips inline <00:00:00.000> word-timing tags and HTML tags.
"""
import sys, re, html
# Cue timing line such as "00:00:01.000 --> 00:00:03.500". Groups 1-3 capture
# the start HH, MM, SS and groups 4-6 the end HH, MM, SS. The start time's
# milliseconds must be present but are not captured; anything after the end
# seconds (milliseconds, cue settings) is left unmatched.
TS = re.compile(
    r'(\d{2}):(\d{2}):(\d{2})\.\d{3}\s*-->\s*(\d{2}):(\d{2}):(\d{2})'
)

# Any angle-bracketed span: inline word-timing stamps like <00:00:00.000> as
# well as markup tags such as <c> ... </c>.
INLINE = re.compile(r'<[^>]+>')
def hhmmss(h, m, s):
    """Return a bracketed ``[HH:MM:SS]`` label for the given time parts.

    Each argument is converted with ``int()``, so digit strings (as captured
    by a regex) and integers are both accepted. Every field is zero-padded to
    at least two digits.
    """
    hours = int(h)
    minutes = int(m)
    seconds = int(s)
    return f"[{hours:02d}:{minutes:02d}:{seconds:02d}]"
def clean(text):
    """Return *text* as plain caption text on a single line.

    Steps, in order:
      1. remove every match of the module-level ``INLINE`` pattern
         (angle-bracketed tags and inline timing stamps);
      2. decode HTML character references with ``html.unescape``;
      3. collapse each run of whitespace to one space and strip both ends.
    """
    without_tags = INLINE.sub('', text)
    decoded = html.unescape(without_tags)
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()
def _parse_cues(lines):
    """Group the lines of a .vtt file into ``(start_label, payload_parts)`` cues.

    A cue starts at a line matching ``TS`` and its payload runs until the next
    blank line or the next timing line. Everything outside a cue (file header,
    NOTE/STYLE blocks, cue identifiers, blank lines) is ignored. Each payload
    line is passed through ``clean`` exactly once.

    Returns a list of ``(label, parts)`` tuples where ``label`` is the
    ``[HH:MM:SS]`` start time and ``parts`` is a list of cleaned strings.
    """
    cues = []
    total = len(lines)
    i = 0
    while i < total:
        match = TS.search(lines[i])
        if match is None:
            i += 1
            continue
        label = hhmmss(*match.groups()[:3])
        payload = []
        i += 1
        while i < total and lines[i].strip() and not TS.search(lines[i]):
            stripped = lines[i].strip()
            # A digit-only line is a numeric cue identifier only when it sits
            # directly above the next timing line (no blank separator).
            # Otherwise it is real caption text such as "2024" and is kept.
            glued_identifier = (
                stripped.isdigit()
                and i + 1 < total
                and TS.search(lines[i + 1]) is not None
            )
            if not glued_identifier:
                payload.append(clean(lines[i]))
            i += 1
        cues.append((label, payload))
    return cues


def _dedupe_rolling(cues, window=40):
    """Turn parsed cues into transcript lines without rolling repeats.

    For each cue, the longest run of words at its head that equals the tail
    of the recently emitted words is dropped; only the remaining new words
    are emitted, prefixed with the cue's label. Cues that add nothing new
    produce no line. Only the last *window* emitted words are remembered.

    Returns a list of ``"[HH:MM:SS] words ..."`` strings.
    """
    out = []
    seen = []
    for label, parts in cues:
        # Parts were cleaned during parsing. Cleaning the joined text again
        # would strip literal "< ... >" produced by unescaping &lt;/&gt; and
        # would unescape entities twice, so only split into words here.
        words = ' '.join(parts).split()
        if not words:
            continue
        overlap = 0
        for k in range(min(len(words), len(seen)), 0, -1):
            if seen[-k:] == words[:k]:
                overlap = k
                break
        fresh = words[overlap:]
        if fresh:
            out.append(f"{label} {' '.join(fresh)}")
            seen = (seen + fresh)[-window:]
    return out


def main():
    """Command-line entry point: ``vtt_to_transcript.py <in.vtt> <out.txt>``.

    Reads the .vtt file named by ``sys.argv[1]`` (UTF-8, undecodable bytes
    replaced), writes one ``[HH:MM:SS] text`` line per cue that contributes
    new words to ``sys.argv[2]``, and prints a one-line summary.

    Exits via ``sys.exit`` with a usage message when the argument count is
    not exactly two.
    """
    if len(sys.argv) != 3:
        sys.exit("usage: vtt_to_transcript.py <in.vtt> <out.txt>")
    src, dst = sys.argv[1], sys.argv[2]
    # Context manager so the input handle is closed deterministically.
    with open(src, encoding='utf-8', errors='replace') as f:
        lines = f.read().splitlines()
    out = _dedupe_rolling(_parse_cues(lines))
    with open(dst, 'w', encoding='utf-8') as f:
        f.write('\n'.join(out) + '\n')
    print(f"wrote {len(out)} transcript lines -> {dst}")
# Run the converter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()