Spaces:

IFMedTechdemo
/

medibotOCR

Paused

App Files Files Community

IFMedTechdemo committed on about 1 month ago

Commit

a35d993

verified ·

1 Parent(s): 79c09f5

Update app.py

Browse files

Files changed (1) hide show

app.py +180 -214

app.py CHANGED Viewed

@@ -2,21 +2,20 @@
 import os
-import time
-from threading import Thread
 from typing import Iterable, Dict, Any, Optional, List
 import gradio as gr
 import spaces
 import torch
 from PIL import Image
-import pandas as pd  # Excel read + debug
 from transformers import (
     Qwen3VLForConditionalGeneration,
     AutoModelForCausalLM,
     AutoProcessor,
-    TextIteratorStreamer,
 )
 from gradio.themes import Soft
@@ -26,7 +25,6 @@ from gradio.themes.utils import colors, fonts, sizes
 #  Character Error Rate (CER)
 # ============================================================
 def levenshtein(a: str, b: str) -> int:
     """Levenshtein distance to calculate CER."""
     a, b = a.lower(), b.lower()
@@ -63,7 +61,6 @@ from huggingface_hub import hf_hub_download
 REPO_ID = "IFMedTech/Medibot_OCR_model"  # private backend repo
-# Filenames in the repo  →  class names they define
 PY_MODULES: Dict[str, str] = {
     "clinical_NER.py": "ClinicalNER",
     "tf_idf_phonetic.py": "TfidfPhoneticMatcher",
@@ -73,7 +70,6 @@ PY_MODULES: Dict[str, str] = {
 HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")  # must be set in Space secrets
 def _dynamic_import(module_path: str, class_name: str):
     spec = importlib.util.spec_from_file_location(class_name, module_path)
     module = importlib.util.module_from_spec(spec)
@@ -93,7 +89,7 @@ if HF_TOKEN is None:
 else:
     print(f"[Private] Using repo: {REPO_ID}")
-    # 1) Load python modules (best-effort: failure of one file will not block others)
     for fname, cls_name in PY_MODULES.items():
         try:
             print(f"[Private] Downloading module file: {fname}")
@@ -120,8 +116,6 @@ else:
             repo_type="model",
         )
         print(f"[Private] Downloaded Excel at: {drug_xlsx_path}")
-        # Debug: verify read
         df_debug = pd.read_excel(drug_xlsx_path, nrows=3)
         print(
             f"[Private] Excel loaded successfully. "
@@ -238,20 +232,17 @@ DTYPE_BF16 = torch.bfloat16 if use_cuda else torch.float32
 # ============================================================
 #  OCR MODELS: Chandra-OCR + Dots.OCR
 # ============================================================
-# 1) Chandra-OCR (Qwen3VL)
 MODEL_ID_V = "datalab-to/chandra"
 processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
 model_v = Qwen3VLForConditionalGeneration.from_pretrained(
     MODEL_ID_V, trust_remote_code=True, torch_dtype=DTYPE_FP16
 ).to(device).eval()
-# 2) Dots.OCR (flash_attn2 if available, else SDPA)
 MODEL_PATH_D = "prithivMLmods/Dots.OCR-Latest-BF16"
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
 attn_impl = "sdpa"
 try:
     import flash_attn  # noqa: F401
     if use_cuda:
         attn_impl = "flash_attention_2"
 except Exception:
@@ -268,9 +259,7 @@ if not use_cuda:
     model_d.to(device)
 # ============================================================
-#  GENERATION (OCR → Med extraction → Spell-check + CER)
-#  ClinicalNER is used ONLY for Dots.OCR.
-#  Single output: Markdown only (no raw stream exposed).
 # ============================================================
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 2048
@@ -287,229 +276,208 @@ def generate_image(
     top_k: int,
     repetition_penalty: float,
     spell_algo: str,
-):
     """
     Returns a single Markdown string:
       - Medications (extracted)
       - Spell-check suggestions
     No raw OCR text is returned to the UI.
     """
-    # Always return ONE value (Markdown string)
-    if image is None:
-        yield "Please upload an image."
-        return
-    # Choose processor/model
-    if model_name == "Chandra-OCR":
-        processor, model = processor_v, model_v
-    elif model_name == "Dots.OCR":
-        processor, model = processor_d, model_d
-    else:
-        yield "Invalid model selected."
-        return
-    # Prompt (text is provided via gr.State)
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image"},
-                {"type": "text", "text": text},
-            ],
-        }
-    ]
-    prompt_full = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-    # Preprocess
-    inputs = processor(
-        text=[prompt_full], images=[image], return_tensors="pt", padding=True
-    )
-    inputs = {k: (v.to(device) if hasattr(v, "to") else v) for k, v in inputs.items()}
-    # Streamer
-    tokenizer = getattr(processor, "tokenizer", None) or processor
-    streamer = TextIteratorStreamer(
-        tokenizer, skip_prompt=True, skip_special_tokens=True
-    )
-    gen_kwargs = dict(
-        **inputs,
-        streamer=streamer,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        temperature=temperature,
-        top_p=top_p,
-        top_k=top_k,
-        repetition_penalty=repetition_penalty,
-    )
-    # Start generation in background thread
-    thread = Thread(target=model.generate, kwargs=gen_kwargs)
-    thread.start()
-    # 1) Live loop: we don't show raw text, just a "Processing..." placeholder once
-    buffer = ""
-    first = True
-    for new_text in streamer:
-        buffer += new_text.replace("<|im_end|>", "")
-        if first:
-            # Only one interim update to UI
-            yield "Processing..."
-            first = False
-        time.sleep(0.01)
-    final_ocr_text = buffer.strip()
-    # --------------------------------------------------------
-    # 2) Medications extraction
-    # --------------------------------------------------------
-    meds: List[str] = []
-    if model_name == "Dots.OCR":
-        # ClinicalNER ONLY for Dots.OCR
-        try:
-            if "ClinicalNER" in priv_classes and HF_TOKEN is not None:
-                ClinicalNER = priv_classes["ClinicalNER"]
-                ner = ClinicalNER(token=HF_TOKEN)
-                ner_output = ner(final_ocr_text) or []
                 meds = [
-                    m.strip()
-                    for m in ner_output
-                    if isinstance(m, str) and m.strip()
                 ]
-                print("[NER] (Dots.OCR) ClinicalNER meds:", meds)
-            else:
-                print("[NER] ClinicalNER unavailable or missing HF token; skipping.")
-        except Exception as e:
-            print(f"[NER] Error running ClinicalNER: {e}")
-        # Fallback if ClinicalNER returns nothing
-        if not meds:
             meds = [
                 line.strip()
                 for line in final_ocr_text.splitlines()
                 if line.strip()
             ]
-            print("[NER] (Dots.OCR) Fallback to lines, count:", len(meds))
-    elif model_name == "Chandra-OCR":
-        # NO ClinicalNER for Chandra; just use text lines
-        meds = [
-            line.strip()
-            for line in final_ocr_text.splitlines()
-            if line.strip()
-        ]
-        print("[NER] (Chandra-OCR) Line-based meds only, count:", len(meds))
-    print("[DEBUG] meds count:", len(meds))
-    print("[DEBUG] drug_xlsx_path in generate_image:", drug_xlsx_path)
-    # --------------------------------------------------------
-    # 3) Build Markdown base: Medications only (no Raw OCR)
-    # --------------------------------------------------------
-    md = "### Medications (extracted)\n"
-    if meds:
-        for m in meds:
-            md += f"- {m}\n"
-    else:
-        md += "- None detected\n"
-    # --------------------------------------------------------
-    # 4) Spell-check (med list) with CER
-    # --------------------------------------------------------
-    spell_section = "\n---\n### Spell-check suggestions (" + spell_algo + ")\n"
-    corr: Dict[str, List] = {}
-    if BACKEND_INIT_ERROR:
-        spell_section += f"- [DEBUG] Backend init error: {BACKEND_INIT_ERROR}\n"
-    try:
-        if meds and drug_xlsx_path:
-            # Optional Excel debug read
-            try:
-                df_dbg = pd.read_excel(drug_xlsx_path)
-                print(
-                    f"[Spell DEBUG] Excel read OK: path={drug_xlsx_path}, "
-                    f"shape={df_dbg.shape}, cols={list(df_dbg.columns)}"
-                )
-                spell_section += (
-                    f"- [DEBUG] Excel read OK; shape={df_dbg.shape}, "
-                    f"cols={list(df_dbg.columns)}\n"
-                )
-            except Exception as e:
-                print(f"[Spell DEBUG] ERROR reading Excel in generate_image: {e}")
-                spell_section += f"- [DEBUG] Excel read error: {e}\n"
-            # Pick matcher based on spell_algo
-            if (
-                spell_algo == "TF-IDF + Phonetic"
-                and "TfidfPhoneticMatcher" in priv_classes
-            ):
-                print("[Spell DEBUG] Using TfidfPhoneticMatcher")
-                Cls = priv_classes["TfidfPhoneticMatcher"]
-                checker = Cls(
-                    xlsx_path=drug_xlsx_path,
-                    column="Combined_Drugs",
-                    ngram_size=3,
-                    phonetic_weight=0.4,
-                )
-                corr = checker.match_list(meds, top_k=5, tfidf_threshold=0.15)
-            elif spell_algo == "SymSpell" and "SymSpellMatcher" in priv_classes:
-                print("[Spell DEBUG] Using SymSpellMatcher")
-                Cls = priv_classes["SymSpellMatcher"]
-                checker = Cls(
-                    xlsx_path=drug_xlsx_path,
-                    column="Combined_Drugs",
-                    max_edit=2,
-                    prefix_len=7,
-                )
-                corr = checker.match_list(meds, top_k=5, min_score=0.4)
-            elif spell_algo == "RapidFuzz" and "RapidFuzzMatcher" in priv_classes:
-                print("[Spell DEBUG] Using RapidFuzzMatcher")
-                Cls = priv_classes["RapidFuzzMatcher"]
-                checker = Cls(xlsx_path=drug_xlsx_path, column="Combined_Drugs")
-                corr = checker.match_list(meds, top_k=5, threshold=70.0)
-            else:
-                spell_section += (
-                    "- Spell-check backend unavailable "
-                    "(no matcher class for selected algorithm).\n"
-                )
-        else:
-            if not meds:
-                spell_section += "- No medications extracted (empty med list).\n"
-            if not drug_xlsx_path:
-                spell_section += (
-                    "- Drug Excel dictionary path missing "
-                    "(drug_xlsx_path is None).\n"
-                )
-    except Exception as e:
-        print(f"[Spell DEBUG] Spell-check error: {e}")
-        spell_section += f"- Spell-check error: {e}\n"
-    # Format suggestions (top-5 per med, with scores + CER)
-    if corr:
-        for raw in meds:
-            suggestions = corr.get(raw, [])
-            if suggestions:
-                spell_section += f"- **{raw}**\n"
-                for cand, score in suggestions:
-                    cer = character_error_rate(cand, raw)
                     spell_section += (
-                        f"  - {cand} (score={score:.3f}, CER={cer:.3f}%)\n"
                     )
             else:
-                spell_section += f"- **{raw}**\n  - (no suggestions)\n"
-    final_md = md + spell_section
-    # Final yield: SINGLE markdown string
-    yield final_md
 # ============================================================
@@ -620,8 +588,6 @@ if __name__ == "__main__":
 ######################################    version  4  #########################################

 import os
 from typing import Iterable, Dict, Any, Optional, List
+from threading import Thread  # no longer needed now that generation is not streamed; harmless to keep
+import time  # no longer needed now that generation is not streamed; harmless to keep
 import gradio as gr
 import spaces
 import torch
 from PIL import Image
+import pandas as pd
 from transformers import (
     Qwen3VLForConditionalGeneration,
     AutoModelForCausalLM,
     AutoProcessor,
 )
 from gradio.themes import Soft
 #  Character Error Rate (CER)
 # ============================================================
 def levenshtein(a: str, b: str) -> int:
     """Levenshtein distance to calculate CER."""
     a, b = a.lower(), b.lower()
 REPO_ID = "IFMedTech/Medibot_OCR_model"  # private backend repo
 PY_MODULES: Dict[str, str] = {
     "clinical_NER.py": "ClinicalNER",
     "tf_idf_phonetic.py": "TfidfPhoneticMatcher",
 HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")  # must be set in Space secrets
 def _dynamic_import(module_path: str, class_name: str):
     spec = importlib.util.spec_from_file_location(class_name, module_path)
     module = importlib.util.module_from_spec(spec)
 else:
     print(f"[Private] Using repo: {REPO_ID}")
+    # 1) Load python modules (best-effort)
     for fname, cls_name in PY_MODULES.items():
         try:
             print(f"[Private] Downloading module file: {fname}")
             repo_type="model",
         )
         print(f"[Private] Downloaded Excel at: {drug_xlsx_path}")
         df_debug = pd.read_excel(drug_xlsx_path, nrows=3)
         print(
             f"[Private] Excel loaded successfully. "
 # ============================================================
 #  OCR MODELS: Chandra-OCR + Dots.OCR
 # ============================================================
 MODEL_ID_V = "datalab-to/chandra"
 processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
 model_v = Qwen3VLForConditionalGeneration.from_pretrained(
     MODEL_ID_V, trust_remote_code=True, torch_dtype=DTYPE_FP16
 ).to(device).eval()
 MODEL_PATH_D = "prithivMLmods/Dots.OCR-Latest-BF16"
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
 attn_impl = "sdpa"
 try:
     import flash_attn  # noqa: F401
     if use_cuda:
         attn_impl = "flash_attention_2"
 except Exception:
     model_d.to(device)
 # ============================================================
+#  GENERATION (no raw output UI; one markdown return)
 # ============================================================
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 2048
     top_k: int,
     repetition_penalty: float,
     spell_algo: str,
+) -> str:
     """
     Returns a single Markdown string:
       - Medications (extracted)
       - Spell-check suggestions
     No raw OCR text is returned to the UI.
     """
+    try:
+        if image is None:
+            return "Please upload an image."
+        # Choose processor/model
+        if model_name == "Chandra-OCR":
+            processor, model = processor_v, model_v
+        elif model_name == "Dots.OCR":
+            processor, model = processor_d, model_d
+        else:
+            return "Invalid model selected."
+        # Build prompt
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": text},
+                ],
+            }
+        ]
+        prompt_full = processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        # Preprocess
+        inputs = processor(
+            text=[prompt_full], images=[image], return_tensors="pt", padding=True
+        )
+        inputs = {k: (v.to(device) if hasattr(v, "to") else v) for k, v in inputs.items()}
+        # Generate (no streaming)
+        gen_kwargs = dict(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            repetition_penalty=repetition_penalty,
+        )
+        outputs = model.generate(**gen_kwargs)
+        tokenizer = getattr(processor, "tokenizer", None) or processor
+        generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+        final_ocr_text = generated.strip()
+        # --------------------------------------------------------
+        # 2) Medications extraction
+        # --------------------------------------------------------
+        meds: List[str] = []
+        if model_name == "Dots.OCR":
+            try:
+                if "ClinicalNER" in priv_classes and HF_TOKEN is not None:
+                    ClinicalNER = priv_classes["ClinicalNER"]
+                    ner = ClinicalNER(token=HF_TOKEN)
+                    ner_output = ner(final_ocr_text) or []
+                    meds = [
+                        m.strip()
+                        for m in ner_output
+                        if isinstance(m, str) and m.strip()
+                    ]
+                    print("[NER] (Dots.OCR) ClinicalNER meds:", meds)
+                else:
+                    print("[NER] ClinicalNER unavailable or missing HF token; skipping.")
+            except Exception as e:
+                print(f"[NER] Error running ClinicalNER: {e}")
+            if not meds:
                 meds = [
+                    line.strip()
+                    for line in final_ocr_text.splitlines()
+                    if line.strip()
                 ]
+                print("[NER] (Dots.OCR) Fallback to lines, count:", len(meds))
+        else:  # Chandra-OCR
             meds = [
                 line.strip()
                 for line in final_ocr_text.splitlines()
                 if line.strip()
             ]
+            print("[NER] (Chandra-OCR) Line-based meds only, count:", len(meds))
+        print("[DEBUG] meds count:", len(meds))
+        print("[DEBUG] drug_xlsx_path in generate_image:", drug_xlsx_path)
+        # --------------------------------------------------------
+        # 3) Markdown: Medications only (no Raw OCR section)
+        # --------------------------------------------------------
+        md = "### Medications (extracted)\n"
+        if meds:
+            for m in meds:
+                md += f"- {m}\n"
+        else:
+            md += "- None detected\n"
+        # --------------------------------------------------------
+        # 4) Spell-check (med list) with CER
+        # --------------------------------------------------------
+        spell_section = "\n---\n### Spell-check suggestions (" + spell_algo + ")\n"
+        corr: Dict[str, List] = {}
+        if BACKEND_INIT_ERROR:
+            spell_section += f"- [DEBUG] Backend init error: {BACKEND_INIT_ERROR}\n"
+        try:
+            if meds and drug_xlsx_path:
+                try:
+                    df_dbg = pd.read_excel(drug_xlsx_path)
+                    print(
+                        f"[Spell DEBUG] Excel read OK: path={drug_xlsx_path}, "
+                        f"shape={df_dbg.shape}, cols={list(df_dbg.columns)}"
+                    )
+                    spell_section += (
+                        f"- [DEBUG] Excel read OK; shape={df_dbg.shape}, "
+                        f"cols={list(df_dbg.columns)}\n"
+                    )
+                except Exception as e:
+                    print(f"[Spell DEBUG] ERROR reading Excel in generate_image: {e}")
+                    spell_section += f"- [DEBUG] Excel read error: {e}\n"
+                if (
+                    spell_algo == "TF-IDF + Phonetic"
+                    and "TfidfPhoneticMatcher" in priv_classes
+                ):
+                    print("[Spell DEBUG] Using TfidfPhoneticMatcher")
+                    Cls = priv_classes["TfidfPhoneticMatcher"]
+                    checker = Cls(
+                        xlsx_path=drug_xlsx_path,
+                        column="Combined_Drugs",
+                        ngram_size=3,
+                        phonetic_weight=0.4,
+                    )
+                    corr = checker.match_list(meds, top_k=5, tfidf_threshold=0.15)
+                elif spell_algo == "SymSpell" and "SymSpellMatcher" in priv_classes:
+                    print("[Spell DEBUG] Using SymSpellMatcher")
+                    Cls = priv_classes["SymSpellMatcher"]
+                    checker = Cls(
+                        xlsx_path=drug_xlsx_path,
+                        column="Combined_Drugs",
+                        max_edit=2,
+                        prefix_len=7,
+                    )
+                    corr = checker.match_list(meds, top_k=5, min_score=0.4)
+                elif spell_algo == "RapidFuzz" and "RapidFuzzMatcher" in priv_classes:
+                    print("[Spell DEBUG] Using RapidFuzzMatcher")
+                    Cls = priv_classes["RapidFuzzMatcher"]
+                    checker = Cls(xlsx_path=drug_xlsx_path, column="Combined_Drugs")
+                    corr = checker.match_list(meds, top_k=5, threshold=70.0)
+                else:
                     spell_section += (
+                        "- Spell-check backend unavailable "
+                        "(no matcher class for selected algorithm).\n"
                     )
             else:
+                if not meds:
+                    spell_section += "- No medications extracted (empty med list).\n"
+                if not drug_xlsx_path:
+                    spell_section += (
+                        "- Drug Excel dictionary path missing "
+                        "(drug_xlsx_path is None).\n"
+                    )
+        except Exception as e:
+            print(f"[Spell DEBUG] Spell-check error: {e}")
+            spell_section += f"- Spell-check error: {e}\n"
+        if corr:
+            for raw in meds:
+                suggestions = corr.get(raw, [])
+                if suggestions:
+                    spell_section += f"- **{raw}**\n"
+                    for cand, score in suggestions:
+                        cer = character_error_rate(cand, raw)
+                        spell_section += (
+                            f"  - {cand} (score={score:.3f}, CER={cer:.3f}%)\n"
+                        )
+                else:
+                    spell_section += f"- **{raw}**\n  - (no suggestions)\n"
+        final_md = md + spell_section
+        return final_md
+    except Exception as e:
+        # Catch-all so the GPU worker does not crash
+        print(f"[ERROR] generate_image crashed: {e}")
+        import traceback
+        traceback.print_exc()
+        return f"Error while processing: {e}"
 # ============================================================
 ######################################    version  4  #########################################