# Source: Hugging Face Space "AhmedAshrafMarzouk/voicer-youtube-txt" (status when captured: Sleeping)
# File size: 7,530 Bytes
# app.py
# Gradio: paste YouTube URLs (one per line) -> download ARABIC captions only
# Preference: manual Arabic > auto Arabic. If Arabic isn't available, it reports an error for that URL.
# Output: captions_sentences_ar.zip with one sentence per line in each .txt

import os
import re
import zipfile
import tempfile
from urllib.parse import urlparse, parse_qs

import gradio as gr
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
    VideoUnavailable,
)


# ----------------- URL + filename helpers -----------------

def extract_video_id(url: str) -> str | None:
    url = (url or "").strip()
    if not url:
        return None

    m = re.search(r"(?:youtu\.be/)([A-Za-z0-9_-]{6,})", url)
    if m:
        return m.group(1)

    try:
        u = urlparse(url)

        if "youtube.com" in (u.netloc or ""):
            qs = parse_qs(u.query)
            if "v" in qs and qs["v"]:
                return qs["v"][0]

        for pat in (r"(?:/shorts/)([A-Za-z0-9_-]{6,})", r"(?:/embed/)([A-Za-z0-9_-]{6,})"):
            m = re.search(pat, u.path or "")
            if m:
                return m.group(1)

    except Exception:
        pass

    m = re.search(r"([A-Za-z0-9_-]{11})", url)
    return m.group(1) if m else None


def safe_filename(s: str, max_len: int = 120) -> str:
    """
    Reduce arbitrary text (typically a URL) to a conservative filename stem.

    Steps: trim, drop any "http://" / "https://" occurrences, replace every
    run of characters outside A-Z a-z 0-9 . _ - with a single underscore,
    collapse repeated underscores, trim underscores at both ends, then cut
    to *max_len* characters. Falls back to "video" when nothing is left.
    """
    trimmed = "" if not s else s.strip()
    without_scheme = re.sub(r"https?://", "", trimmed)
    underscored = re.sub(r"[^A-Za-z0-9._-]+", "_", without_scheme)
    collapsed = re.sub(r"_+", "_", underscored).strip("_")

    stem = collapsed[:max_len]
    if not stem:
        return "video"
    # Truncation may have cut right after an underscore; don't end on one.
    return stem.rstrip("_")


# ----------------- Arabic transcript fetching (manual > auto) -----------------

def fetch_arabic_caption_text(video_id: str) -> tuple[str, dict]:
    """
    Returns (raw_caption_text, meta) for Arabic only.

    Preference order:
      1) Manually created Arabic transcript ('ar')
      2) Auto-generated Arabic transcript ('ar')

    meta has two keys: "lang" (the transcript's language_code) and
    "is_generated" (False for manual, True for auto).

    Raises:
      NoTranscriptFound: neither a manual nor an auto Arabic transcript exists.
      Any other exception raised while looking up or fetching a transcript
      (it takes precedence over NoTranscriptFound so the real cause is
      reported instead of a misleading "not found").
      Exceptions from listing the transcripts propagate unchanged.
    """
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

    lookups = (
        (transcript_list.find_manually_created_transcript, False),
        (transcript_list.find_generated_transcript, True),
    )

    failure = None
    for find_transcript, is_generated in lookups:
        try:
            transcript = find_transcript(["ar"])
            snippets = transcript.fetch()
        except NoTranscriptFound as err:
            # Keep an earlier, more specific failure (e.g. a fetch error on the
            # manual transcript) rather than overwriting it with "not found".
            if failure is None:
                failure = err
            continue
        except Exception as err:
            # Best effort: still try the next source, but remember why this
            # one failed so it can be reported if nothing else works.
            failure = err
            continue

        # Items are indexed as dicts here. NOTE(review): newer releases of the
        # library reportedly yield snippet objects with a `.text` attribute
        # instead - confirm against the pinned version.
        text = "\n".join(
            item["text"] if isinstance(item, dict) else item.text
            for item in snippets
        )
        return text, {
            "lang": transcript.language_code,
            "is_generated": is_generated,
        }

    # Re-raise the library's own exception instead of constructing
    # NoTranscriptFound by hand: its constructor expects more than a video id,
    # so `NoTranscriptFound(video_id)` would itself fail with a TypeError.
    raise failure


# ----------------- Cleaning: one sentence per line (Arabic-friendly heuristic) -----------------

def captions_to_ar_sentences(raw_text: str) -> list[str]:
    """
    Turn raw caption text into a list of cleaned sentences (one per item).

    YouTube Arabic captions are often fragmentary.
    We:
      - remove timestamp ranges (00:01 --> 00:03) and lone timestamps
      - remove bracketed noise markers such as [Music] or (تصفيق)
      - join lines and normalize whitespace
      - split sentences on Arabic/Latin punctuation: . ! ? … ؟ ؛
      - drop whitespace that precedes punctuation

    Returns an empty list for empty/None input or when nothing is left
    after cleaning.
    """
    text = raw_text or ""

    # Ranges must be removed BEFORE lone timestamps: otherwise both ends are
    # consumed first and the "-->" arrow is left behind in the text.
    stamp = r"\d{1,2}:\d{2}(?::\d{2})?(?:[.,]\d{1,3})?"
    text = re.sub(rf"{stamp}\s*-->\s*{stamp}", " ", text)

    # Lone timestamps like [00:01], (00:01), 00:01, 00:01:02
    text = re.sub(r"\[?\(?\d{1,2}:\d{2}(?::\d{2})?\)?\]?", " ", text)

    # Remove common caption artifacts (English + Arabic markers), but only
    # when they are enclosed in brackets. Matching the bare words would delete
    # ordinary speech ("I love music") and even chop pieces out of longer
    # words that merely contain a marker.
    text = re.sub(
        r"[\[\(]\s*(?:applause|music|laughter|cheering|inaudible|تصفيق|موسيقى|ضحك)\s*[\]\)]",
        " ",
        text,
        flags=re.I,
    )

    # Join lines + normalize spaces
    text = re.sub(r"\s+", " ", text.replace("\n", " ")).strip()
    if not text:
        return []

    # Split after sentence-ending punctuation (Arabic + Latin) that is
    # followed by whitespace. Includes: . ! ? … ؟ ؛
    parts = re.split(r"(?<=[.!?…؟؛])\s+", text)

    # Final cleanup: no whitespace before punctuation, no empty entries.
    cleaned = []
    for part in parts:
        sentence = re.sub(r"\s+([،,\.!\?…؟؛:;])", r"\1", part).strip()
        if sentence:
            cleaned.append(sentence)

    return cleaned


# ----------------- Gradio worker -----------------

def build_zip_arabic(urls_text: str, include_header: bool) -> tuple[str | None, str]:
    """
    Gradio worker: download Arabic captions for each pasted URL into one zip.

    Args:
      urls_text: YouTube URLs, one per line; blank lines are ignored.
      include_header: when True, each .txt starts with a small metadata
        header (URL, video id, language, generated flag) and a blank line.

    Returns:
      (zip_path, log_text). zip_path is None when no URL was supplied or when
      no transcript could be archived, so the UI is never handed an empty
      zip. log_text lists successes first, then problems.

    Per-URL failures never abort the batch; they are collected in the log.
    """
    urls = [u.strip() for u in (urls_text or "").splitlines() if u.strip()]
    if not urls:
        return None, "Paste at least one YouTube URL (one per line)."

    # The directory is intentionally not removed on success: the returned
    # path must stay readable after this function returns.
    tmpdir = tempfile.mkdtemp(prefix="yt_captions_ar_")
    zip_path = os.path.join(tmpdir, "captions_sentences_ar.zip")

    ok = []
    bad = []
    used_names = set()

    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for idx, url in enumerate(urls, start=1):
            vid = extract_video_id(url)
            if not vid:
                bad.append(f"{idx}. Could not extract video id: {url}")
                continue

            try:
                raw_text, meta = fetch_arabic_caption_text(vid)
                sentences = captions_to_ar_sentences(raw_text)

                if not sentences:
                    bad.append(f"{idx}. Arabic transcript was empty after cleaning: {url}")
                    continue

                base = safe_filename(url)
                fname = f"{base}__{vid}__ar.txt"
                # The same URL pasted twice would otherwise create two zip
                # members with an identical name; disambiguate with the row
                # number so every entry can be extracted.
                if fname in used_names:
                    fname = f"{base}__{vid}__ar_{idx}.txt"
                used_names.add(fname)

                lines = []
                if include_header:
                    lines += [
                        f"URL: {url}",
                        f"VideoID: {vid}",
                        "Language: ar",
                        f"Generated: {meta['is_generated']}",
                        "",
                    ]
                lines += sentences

                zf.writestr(fname, "\n".join(lines) + "\n")

                ok.append(
                    f"{idx}. ✅ {vid} — {len(sentences)} lines (Arabic, {'auto' if meta['is_generated'] else 'manual'})"
                )

            except (TranscriptsDisabled, NoTranscriptFound):
                bad.append(f"{idx}. No Arabic captions found (manual or auto): {url}")
            except VideoUnavailable:
                bad.append(f"{idx}. Video unavailable: {url}")
            except Exception as e:
                # Batch boundary: report anything unexpected for this URL and
                # keep processing the remaining ones.
                bad.append(f"{idx}. Error for {url}: {type(e).__name__}: {e}")

    if not ok:
        # Nothing was archived: discard the empty zip instead of offering it
        # for download. Cleanup is best effort; a leftover temp file is
        # harmless, so an OSError here is deliberately ignored.
        try:
            os.remove(zip_path)
            os.rmdir(tmpdir)
        except OSError:
            pass
        zip_path = None

    log = []
    if ok:
        log.append("Downloaded (Arabic only):")
        log.extend(ok)
    if bad:
        log.append("")
        log.append("Problems:")
        log.extend(bad)

    return zip_path, "\n".join(log).strip() or "Done."


# ----------------- UI -----------------

# Build the Gradio interface; the Blocks object is bound to `demo`, which the
# __main__ guard at the bottom launches.
with gr.Blocks(title="YouTube Arabic Captions → Sentences", theme=gr.themes.Soft()) as demo:
    # Short usage description displayed as Markdown.
    gr.Markdown(
        """
        # YouTube Arabic Captions → One Sentence Per Line
        Paste YouTube URLs (one per line). The app downloads **Arabic captions only**
        (prefers **manual** Arabic, falls back to **auto** Arabic), cleans them, and returns a zip.
        """
    )

    # Input 1: multi-line text box holding the URLs, one per line.
    urls_in = gr.Textbox(
        label="YouTube URLs (one per line)",
        placeholder="https://www.youtube.com/watch?v=...\nhttps://youtu.be/...\n...",
        lines=8,
    )

    # Input 2: whether each .txt should start with a metadata header
    # (off by default).
    include_header_in = gr.Checkbox(
        label="Include metadata header in each file",
        value=False,
    )

    run_btn = gr.Button("Download Arabic Captions", variant="primary")

    # Outputs: the generated zip file and a textual log.
    out_file = gr.File(label="captions_sentences_ar.zip")
    out_log = gr.Textbox(label="Log", lines=10)

    # Clicking the button calls build_zip_arabic with the two inputs, in this
    # order; its two return values go to out_file and out_log respectively.
    run_btn.click(
        fn=build_zip_arabic,
        inputs=[urls_in, include_header_in],
        outputs=[out_file, out_log],
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()