# Provenance: Hugging Face Space by AhmedAshrafMarzouk ("Create app.py", commit 8011021)
# app.py
# Gradio: paste YouTube URLs (one per line) -> download ARABIC captions only
# Preference: manual Arabic > auto Arabic. If Arabic isn't available, it reports an error for that URL.
# Output: captions_sentences_ar.zip with one sentence per line in each .txt
import os
import re
import zipfile
import tempfile
from urllib.parse import urlparse, parse_qs
import gradio as gr
from youtube_transcript_api import (
YouTubeTranscriptApi,
TranscriptsDisabled,
NoTranscriptFound,
VideoUnavailable,
)
# ----------------- URL + filename helpers -----------------
def extract_video_id(url: str) -> str | None:
url = (url or "").strip()
if not url:
return None
m = re.search(r"(?:youtu\.be/)([A-Za-z0-9_-]{6,})", url)
if m:
return m.group(1)
try:
u = urlparse(url)
if "youtube.com" in (u.netloc or ""):
qs = parse_qs(u.query)
if "v" in qs and qs["v"]:
return qs["v"][0]
for pat in (r"(?:/shorts/)([A-Za-z0-9_-]{6,})", r"(?:/embed/)([A-Za-z0-9_-]{6,})"):
m = re.search(pat, u.path or "")
if m:
return m.group(1)
except Exception:
pass
m = re.search(r"([A-Za-z0-9_-]{11})", url)
return m.group(1) if m else None
def safe_filename(s: str, max_len: int = 120) -> str:
    """Turn an arbitrary string (usually a URL) into a filesystem-safe name.

    Falls back to "video" when nothing usable remains after sanitizing.
    """
    name = (s or "").strip()
    name = re.sub(r"https?://", "", name)           # drop the protocol prefix
    name = re.sub(r"[^A-Za-z0-9._-]+", "_", name)   # unsafe chars -> underscore
    name = re.sub(r"_+", "_", name).strip("_")      # collapse runs, trim edges
    return (name[:max_len] or "video").rstrip("_")
# ----------------- Arabic transcript fetching (manual > auto) -----------------
def _caption_items_to_text(items) -> str:
    """Join transcript entries into one newline-separated string.

    Tolerates both dict entries ({'text': ...}, older youtube_transcript_api
    releases) and object entries with a .text attribute (v1.x
    FetchedTranscriptSnippet) — TODO confirm against the pinned library version.
    """
    return "\n".join(
        item["text"] if isinstance(item, dict) else item.text for item in items
    )


def fetch_arabic_caption_text(video_id: str) -> tuple[str, dict]:
    """
    Return (raw_caption_text, meta) for Arabic captions only.

    Preference order:
      1) Manually created Arabic transcript ('ar')
      2) Auto-generated Arabic transcript ('ar')

    Raises:
        NoTranscriptFound / TranscriptsDisabled (propagated from
        youtube_transcript_api) when no Arabic transcript is available.
    """
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    # 1) Manual Arabic — best effort: ANY failure here falls through to auto.
    try:
        t = transcript_list.find_manually_created_transcript(["ar"])
        return _caption_items_to_text(t.fetch()), {
            "lang": t.language_code,
            "is_generated": False,
        }
    except Exception:
        pass
    # 2) Auto Arabic. Let the library's own NoTranscriptFound propagate.
    # BUG FIX: the previous code re-raised NoTranscriptFound(video_id), but that
    # constructor requires (video_id, requested_language_codes, transcript_data),
    # so it actually raised TypeError — which the caller's
    # except (TranscriptsDisabled, NoTranscriptFound) clause never caught.
    t = transcript_list.find_generated_transcript(["ar"])
    return _caption_items_to_text(t.fetch()), {
        "lang": t.language_code,
        "is_generated": True,
    }
# ----------------- Cleaning: one sentence per line (Arabic-friendly heuristic) -----------------
def captions_to_ar_sentences(raw_text: str) -> list[str]:
    """
    Convert raw (often fragmentary) YouTube Arabic captions into a list of
    sentences, one per element.

    Steps:
      - remove timestamp ranges ("00:01 --> 00:03") and bare timestamps
      - remove common caption artifacts (applause/music/... markers)
      - normalize whitespace
      - split on Arabic/Latin sentence punctuation: . ! ? … ؟ ؛

    Returns [] for empty or whitespace-only input.
    """
    text = raw_text or ""
    # BUG FIX: remove timestamp RANGES first. Previously bare timestamps were
    # stripped first, which orphaned the "-->" arrow and made the range pattern
    # dead code.
    text = re.sub(
        r"\d{1,2}:\d{2}(?::\d{2})?\s*-->\s*\d{1,2}:\d{2}(?::\d{2})?", " ", text
    )
    # Remove bare timestamps like [00:01], (00:01), 00:01:02
    text = re.sub(r"\[?\(?\d{1,2}:\d{2}(?::\d{2})?\)?\]?", " ", text)
    # Remove caption artifacts (English + Arabic markers).
    # BUG FIX: \b word boundaries keep words like "musical" from being mangled.
    text = re.sub(
        r"\s*\(?\b(?:applause|music|laughter|cheering|inaudible|تصفيق|موسيقى|ضحك)\b\)?\s*",
        " ",
        text,
        flags=re.I,
    )
    # Join lines + normalize spaces
    text = text.replace("\n", " ")
    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return []
    # Mark sentence boundaries after end punctuation (Arabic + Latin):
    # . ! ? … ؟ ؛
    text = re.sub(r"([.!?…؟؛])\s+", r"\1<SPLIT>", text)
    parts = [p.strip() for p in text.split("<SPLIT>") if p.strip()]
    # Final cleanup: collapse spaces and pull punctuation back onto the word.
    cleaned = []
    for sentence in parts:
        sentence = re.sub(r"\s+", " ", sentence).strip()
        sentence = re.sub(r"\s+([،,.!?…؟؛:;])", r"\1", sentence)
        if sentence:
            cleaned.append(sentence)
    return cleaned
# ----------------- Gradio worker -----------------
def build_zip_arabic(urls_text: str, include_header: bool) -> tuple[str | None, str]:
urls = [u.strip() for u in (urls_text or "").splitlines() if u.strip()]
if not urls:
return None, "Paste at least one YouTube URL (one per line)."
tmpdir = tempfile.mkdtemp(prefix="yt_captions_ar_")
zip_path = os.path.join(tmpdir, "captions_sentences_ar.zip")
ok = []
bad = []
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
for idx, url in enumerate(urls, start=1):
vid = extract_video_id(url)
if not vid:
bad.append(f"{idx}. Could not extract video id: {url}")
continue
try:
raw_text, meta = fetch_arabic_caption_text(vid)
sentences = captions_to_ar_sentences(raw_text)
if not sentences:
bad.append(f"{idx}. Arabic transcript was empty after cleaning: {url}")
continue
base = safe_filename(url)
fname = f"{base}__{vid}__ar.txt"
lines = []
if include_header:
lines += [
f"URL: {url}",
f"VideoID: {vid}",
"Language: ar",
f"Generated: {meta['is_generated']}",
"",
]
lines += sentences
zf.writestr(fname, "\n".join(lines) + "\n")
ok.append(
f"{idx}. โœ… {vid} โ€” {len(sentences)} lines (Arabic, {'auto' if meta['is_generated'] else 'manual'})"
)
except (TranscriptsDisabled, NoTranscriptFound):
bad.append(f"{idx}. No Arabic captions found (manual or auto): {url}")
except VideoUnavailable:
bad.append(f"{idx}. Video unavailable: {url}")
except Exception as e:
bad.append(f"{idx}. Error for {url}: {type(e).__name__}: {e}")
log = []
if ok:
log.append("Downloaded (Arabic only):")
log.extend(ok)
if bad:
log.append("")
log.append("Problems:")
log.extend(bad)
return zip_path, "\n".join(log).strip() or "Done."
# ----------------- UI -----------------
# Wires the textbox + checkbox into build_zip_arabic and shows the zip + log.
# FIX: restore the correct Unicode characters (→, emphasis dashes) in the
# user-facing strings, which appeared mojibake-encoded.
with gr.Blocks(title="YouTube Arabic Captions → Sentences", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
# YouTube Arabic Captions → One Sentence Per Line
Paste YouTube URLs (one per line). The app downloads **Arabic captions only**
(prefers **manual** Arabic, falls back to **auto** Arabic), cleans them, and returns a zip.
"""
    )
    urls_in = gr.Textbox(
        label="YouTube URLs (one per line)",
        placeholder="https://www.youtube.com/watch?v=...\nhttps://youtu.be/...\n...",
        lines=8,
    )
    include_header_in = gr.Checkbox(
        label="Include metadata header in each file",
        value=False,
    )
    run_btn = gr.Button("Download Arabic Captions", variant="primary")
    out_file = gr.File(label="captions_sentences_ar.zip")
    out_log = gr.Textbox(label="Log", lines=10)
    run_btn.click(
        fn=build_zip_arabic,
        inputs=[urls_in, include_header_in],
        outputs=[out_file, out_log],
    )

if __name__ == "__main__":
    demo.launch()