# app.py
# Gradio: paste YouTube URLs (one per line) -> download ARABIC captions only
# Preference: manual Arabic > auto Arabic. If Arabic isn't available, it reports
# an error for that URL.
# Output: captions_sentences_ar.zip with one sentence per line in each .txt

import os
import re
import tempfile
import zipfile
from urllib.parse import parse_qs, urlparse

import gradio as gr
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
    VideoUnavailable,
)

# ----------------- URL + filename helpers -----------------


def extract_video_id(url: str) -> str | None:
    """Pull the video ID out of common YouTube URL shapes."""
    url = (url or "").strip()
    if not url:
        return None

    # youtu.be/<id> short links
    m = re.search(r"(?:youtu\.be/)([A-Za-z0-9_-]{6,})", url)
    if m:
        return m.group(1)

    try:
        u = urlparse(url)
        if "youtube.com" in (u.netloc or ""):
            # watch?v=<id>
            qs = parse_qs(u.query)
            if "v" in qs and qs["v"]:
                return qs["v"][0]
            # /shorts/<id> and /embed/<id>
            for pat in (r"(?:/shorts/)([A-Za-z0-9_-]{6,})", r"(?:/embed/)([A-Za-z0-9_-]{6,})"):
                m = re.search(pat, u.path or "")
                if m:
                    return m.group(1)
    except Exception:
        pass

    # Last resort: any 11-character ID-like token in the string
    m = re.search(r"([A-Za-z0-9_-]{11})", url)
    return m.group(1) if m else None


def safe_filename(s: str, max_len: int = 120) -> str:
    """Turn an arbitrary URL/string into a filesystem-safe base name."""
    s = (s or "").strip()
    s = re.sub(r"https?://", "", s)
    s = re.sub(r"[^A-Za-z0-9._-]+", "_", s)
    s = re.sub(r"_+", "_", s).strip("_")
    return (s[:max_len] or "video").rstrip("_")


# ----------------- Arabic transcript fetching (manual > auto) -----------------


def fetch_arabic_caption_text(video_id: str) -> tuple[str, dict]:
    """
    Returns (raw_caption_text, meta) for Arabic only.

    Preference order:
      1) Manually created Arabic transcript ('ar')
      2) Auto-generated Arabic transcript ('ar')

    If Arabic isn't available at all, raises NoTranscriptFound.
    """
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

    # 1) Manual Arabic
    try:
        t = transcript_list.find_manually_created_transcript(["ar"])
        data = t.fetch()
        return "\n".join(item["text"] for item in data), {
            "lang": t.language_code,
            "is_generated": False,
        }
    except NoTranscriptFound:
        pass  # fall through to the auto-generated track

    # 2) Auto Arabic. find_generated_transcript raises NoTranscriptFound itself
    # (with the constructor arguments that exception expects) when no auto
    # Arabic track exists, so we let it propagate rather than re-raising by
    # hand with the wrong arguments.
    t = transcript_list.find_generated_transcript(["ar"])
    data = t.fetch()
    return "\n".join(item["text"] for item in data), {
        "lang": t.language_code,
        "is_generated": True,
    }
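# A quick sanity check for the fetcher (a sketch, not part of the app flow).
# "AR_VIDEO_ID" is a hypothetical placeholder; substitute a real 11-character
# video ID that actually carries Arabic captions:
#
#   raw, meta = fetch_arabic_caption_text("AR_VIDEO_ID")
#   print(meta)        # e.g. {"lang": "ar", "is_generated": True}
#   print(raw[:200])   # caption fragments joined by newlines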
# ----------------- Cleaning: one sentence per line (Arabic-friendly heuristic) -----------------


def captions_to_ar_sentences(raw_text: str) -> list[str]:
    """
    YouTube Arabic captions are often fragmentary. We:
      - remove timestamps / bracketed noise
      - normalize whitespace
      - split sentences on Arabic/Latin punctuation: . ! ? … ؟ ؛
    """
    text = raw_text or ""

    # Remove timestamps like 00:01 --> 00:03 first (so the arrow is removed
    # along with its endpoints), then [00:01], (00:01), and bare 00:01
    text = re.sub(r"\d{1,2}:\d{2}\s*-->\s*\d{1,2}:\d{2}", " ", text)
    text = re.sub(r"\[?\(?\d{1,2}:\d{2}(?::\d{2})?\)?\]?", " ", text)

    # Remove common caption artifacts (English + Arabic markers)
    text = re.sub(
        r"\s*\(?(?:applause|music|laughter|cheering|inaudible|تصفيق|موسيقى|ضحك)\)?\s*",
        " ",
        text,
        flags=re.I,
    )

    # Join lines + normalize spaces
    text = text.replace("\n", " ")
    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return []

    # Split after sentence-ending punctuation (Arabic + Latin): . ! ? … ؟ ؛
    # All real newlines were collapsed above, so "\n" is a safe split marker.
    text = re.sub(r"([.!?…؟؛])\s+", "\\1\n", text)
    parts = [p.strip() for p in text.split("\n") if p.strip()]

    # Final cleanup: collapse spaces, pull punctuation back onto its sentence
    cleaned = []
    for s in parts:
        s = re.sub(r"\s+", " ", s).strip()
        s = re.sub(r"\s+([،,.!?…؟؛:;])", r"\1", s)
        if s:
            cleaned.append(s)
    return cleaned


# ----------------- Gradio worker -----------------


def build_zip_arabic(urls_text: str, include_header: bool) -> tuple[str | None, str]:
    urls = [u.strip() for u in (urls_text or "").splitlines() if u.strip()]
    if not urls:
        return None, "Paste at least one YouTube URL (one per line)."

    tmpdir = tempfile.mkdtemp(prefix="yt_captions_ar_")
    zip_path = os.path.join(tmpdir, "captions_sentences_ar.zip")

    ok = []
    bad = []

    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for idx, url in enumerate(urls, start=1):
            vid = extract_video_id(url)
            if not vid:
                bad.append(f"{idx}. Could not extract video id: {url}")
                continue
            try:
                raw_text, meta = fetch_arabic_caption_text(vid)
                sentences = captions_to_ar_sentences(raw_text)
                if not sentences:
                    bad.append(f"{idx}. Arabic transcript was empty after cleaning: {url}")
                    continue

                base = safe_filename(url)
                fname = f"{base}__{vid}__ar.txt"

                lines = []
                if include_header:
                    lines += [
                        f"URL: {url}",
                        f"VideoID: {vid}",
                        "Language: ar",
                        f"Generated: {meta['is_generated']}",
                        "",
                    ]
                lines += sentences

                zf.writestr(fname, "\n".join(lines) + "\n")
                ok.append(
                    f"{idx}. ✅ {vid} — {len(sentences)} lines "
                    f"(Arabic, {'auto' if meta['is_generated'] else 'manual'})"
                )
            except (TranscriptsDisabled, NoTranscriptFound):
                bad.append(f"{idx}. No Arabic captions found (manual or auto): {url}")
            except VideoUnavailable:
                bad.append(f"{idx}. Video unavailable: {url}")
            except Exception as e:
                bad.append(f"{idx}. Error for {url}: {type(e).__name__}: {e}")

    log = []
    if ok:
        log.append("Downloaded (Arabic only):")
        log.extend(ok)
    if bad:
        log.append("")
        log.append("Problems:")
        log.extend(bad)

    return zip_path, "\n".join(log).strip() or "Done."


# ----------------- UI -----------------

with gr.Blocks(title="YouTube Arabic Captions → Sentences", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
# YouTube Arabic Captions → One Sentence Per Line
Paste YouTube URLs (one per line). The app downloads **Arabic captions only**
(prefers **manual** Arabic, falls back to **auto** Arabic), cleans them, and
returns a zip.
"""
    )
    urls_in = gr.Textbox(
        label="YouTube URLs (one per line)",
        placeholder="https://www.youtube.com/watch?v=...\nhttps://youtu.be/...\n...",
        lines=8,
    )
    include_header_in = gr.Checkbox(
        label="Include metadata header in each file",
        value=False,
    )
    run_btn = gr.Button("Download Arabic Captions", variant="primary")
    out_file = gr.File(label="captions_sentences_ar.zip")
    out_log = gr.Textbox(label="Log", lines=10)

    run_btn.click(
        fn=build_zip_arabic,
        inputs=[urls_in, include_header_in],
        outputs=[out_file, out_log],
    )

if __name__ == "__main__":
    demo.launch()
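# Running it locally: a minimal sketch, assuming both dependencies install from
# PyPI under their usual package names. Note that this file uses the classic
# static YouTubeTranscriptApi.list_transcripts(...) interface; if your installed
# youtube-transcript-api only exposes the newer instance-based API, pin an
# older release.
#
#   pip install gradio youtube-transcript-api
#   python app.py
#
# demo.launch() serves the UI on http://127.0.0.1:7860 by default; pass
# share=True for a temporary public Gradio link.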