60 lines
2.1 KiB
Python
Executable File
60 lines
2.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Convert a YouTube .vtt (manual or auto-captions) into clean [HH:MM:SS] transcript lines.
|
|
|
|
Usage: vtt_to_transcript.py <input.vtt> <output.txt>
|
|
|
|
Handles the rolling-duplicate problem in auto-captions: each cue repeats the tail of the
|
|
previous cue, so we keep only newly-added words per cue and emit one line per cue start
|
|
time. Strips inline <00:00:00.000> word-timing tags and HTML tags.
|
|
"""
|
|
import sys, re, html
|
|
|
|
TS=re.compile(r'(\d{2}):(\d{2}):(\d{2})\.\d{3}\s*-->\s*(\d{2}):(\d{2}):(\d{2})')
|
|
INLINE=re.compile(r'<[^>]+>')
|
|
|
|
def hhmmss(h,m,s): return f"[{int(h):02d}:{int(m):02d}:{int(s):02d}]"
|
|
|
|
def clean(text):
|
|
text=INLINE.sub('',text)
|
|
text=html.unescape(text)
|
|
return re.sub(r'\s+',' ',text).strip()
|
|
|
|
def main():
|
|
if len(sys.argv)!=3: sys.exit("usage: vtt_to_transcript.py <in.vtt> <out.txt>")
|
|
raw=open(sys.argv[1],encoding='utf-8',errors='replace').read().splitlines()
|
|
cues=[] # (start_label, text)
|
|
i=0; cur=None
|
|
while i<len(raw):
|
|
m=TS.search(raw[i])
|
|
if m:
|
|
if cur: cues.append(cur)
|
|
cur=[hhmmss(*m.groups()[:3]),[]]
|
|
i+=1
|
|
while i<len(raw) and not TS.search(raw[i]) and raw[i].strip()!='':
|
|
if raw[i].strip() and not raw[i].strip().isdigit():
|
|
cur[1].append(clean(raw[i]))
|
|
i+=1
|
|
else:
|
|
i+=1
|
|
if cur: cues.append(cur)
|
|
|
|
# De-duplicate rolling captions: keep only the suffix not already seen.
|
|
out=[]; seen_words=[]
|
|
for label,parts in cues:
|
|
text=clean(' '.join(parts))
|
|
if not text: continue
|
|
words=text.split()
|
|
# find longest overlap of seen tail with this cue's head
|
|
overlap=0; maxk=min(len(words),len(seen_words))
|
|
for k in range(maxk,0,-1):
|
|
if seen_words[-k:]==words[:k]: overlap=k; break
|
|
new=words[overlap:]
|
|
if new:
|
|
out.append(f"{label} {' '.join(new)}")
|
|
seen_words=(seen_words+new)[-40:] # bounded window
|
|
with open(sys.argv[2],'w',encoding='utf-8') as f:
|
|
f.write('\n'.join(out)+'\n')
|
|
print(f"wrote {len(out)} transcript lines -> {sys.argv[2]}")
|
|
|
|
if __name__=="__main__": main()
|