Spaces:

AbstractPhil
/

bert-beatrix-2048-testing

Sleeping

App Files Files Community

AbstractPhil committed on Jun 3

Commit

0cba5a9

verified ·

1 Parent(s): b235205

Update app.py

Browse files

Files changed (1) hide show

app.py +422 -97

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# app.py – encoder-only demo for bert-beatrix-2048
 # launch:  python app.py
 # -----------------------------------------------
 import json, re, sys, math
@@ -40,12 +40,6 @@ with cfg_path.open("w") as f: json.dump(cfg,f,indent=2)
 handler, full_model, tokenizer = create_handler_from_checkpoint(LOCAL_CKPT)
 full_model = full_model.eval().cuda()
-encoder    = full_model.bert.encoder
-embeddings = full_model.bert.embeddings
-emb_ln     = full_model.bert.emb_ln
-emb_drop   = full_model.bert.emb_drop
-mlm_head   = full_model.cls          # prediction head
 # ------------------------------------------------------------------
 # 2. Symbolic roles -------------------------------------------------
 SYMBOLIC_ROLES = [
@@ -56,112 +50,443 @@ SYMBOLIC_ROLES = [
     "<object_left>", "<object_right>", "<relation>", "<intent>", "<style>",
     "<fabric>", "<jewelry>",
 ]
-if any(tokenizer.convert_tokens_to_ids(t)==tokenizer.unk_token_id
-       for t in SYMBOLIC_ROLES):
-    sys.exit("❌ tokenizer missing special tokens")
-# Quick helpers
 MASK = tokenizer.mask_token
 # ------------------------------------------------------------------
-# 3.  Encoder-plus-MLM logic ---------------------------------------
-def cosine(a,b):
-    return torch.nn.functional.cosine_similarity(a,b,dim=-1)
-def pool_accuracy(ids, logits, pool_mask):
     """
-    ids     : (S,)  gold token ids
-    logits  : (S,V) MLM logits
-    pool_mask : bool (S,)  which tokens belong to the candidate pool
-    returns accuracy over masked positions only (if none, return 0)
     """
-    idx = pool_mask.nonzero(as_tuple=False).flatten()
-    if idx.numel()==0: return 0.0
-    preds = logits.argmax(-1)[idx]
-    gold  = ids[idx]
-    return (preds==gold).float().mean().item()
 @spaces.GPU
-def encode_and_trace(text, selected_roles):
     if not selected_roles:
-        selected_roles = SYMBOLIC_ROLES
-    # Convert symbolic role tokens to IDs
-    sel_ids = [tokenizer.convert_tokens_to_ids(t) for t in selected_roles]
-    sel_ids_tensor = torch.tensor(sel_ids, device="cuda").unsqueeze(0)  # shape: (1, R)
-    # Tokenize user prompt
-    batch = tokenizer(text, return_tensors="pt").to("cuda")
-    input_ids, attention_mask = batch.input_ids, batch.attention_mask
-    S = input_ids.shape[1]
-    # === Shared encoder logic with RoPE ===
-    def encode(input_ids, attn_mask):
-        x = embeddings(input_ids)  # (B, S, H)
-        if emb_ln: x = emb_ln(x)
-        if emb_drop: x = emb_drop(x)
-        ext = full_model.bert.get_extended_attention_mask(attn_mask, input_ids.shape)
-        return encoder(x, attention_mask=ext)[0]  # (B, S, H)
-    # Encode prompt
-    encoded_prompt = encode(input_ids, attention_mask)[0]  # (S, H)
-    # Encode symbolic roles through same pipeline
-    symbolic_attn = torch.ones_like(sel_ids_tensor)
-    encoded_roles = encode(sel_ids_tensor, symbolic_attn)[0]  # (R, H)
-    # === Symbolic classification via cosine similarity ===
-    # Compare each token to each symbolic role → shape: (S, R)
-    token_exp = encoded_prompt.unsqueeze(1).expand(-1, encoded_roles.size(0), -1)  # (S, R, H)
-    role_exp  = encoded_roles.unsqueeze(0).expand(encoded_prompt.size(0), -1, -1)  # (S, R, H)
-    sim = F.cosine_similarity(token_exp, role_exp, dim=-1)  # → (S, R)
-    argmax_ids = sim.argmax(dim=-1)            # (S,)
-    max_scores = sim.max(dim=-1).values        # (S,)
-    predicted_roles = [selected_roles[i] for i in argmax_ids.tolist()]
-    decoded_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
-    # === Build readable trace
-    role_trace = [
-        f"{tok:<15} → {role:<22} score={score:.4f}"
-        for tok, role, score in zip(decoded_tokens, predicted_roles, max_scores.tolist())
-    ]
-    # === Final output
-    res_json = {
-        "Prompt": text,
-        "Predicted symbolic roles": predicted_roles,
-        "Max alignment score": f"{max_scores.max().item():.4f}",
-        "Per-token classification": role_trace
-    }
-    return json.dumps(res_json, indent=2), f"{max_scores.max().item():.4f}", len(selected_roles)
 # ------------------------------------------------------------------
-# 4.  Gradio UI -----------------------------------------------------
 def build_interface():
-    with gr.Blocks(title="🧠 Symbolic Encoder Inspector") as demo:
-        gr.Markdown("## 🧠 Symbolic Encoder Inspector")
-        with gr.Row():
-            with gr.Column():
-                txt  = gr.Textbox(label="Prompt", lines=3)
-                roles= gr.CheckboxGroup(
-                    choices=SYMBOLIC_ROLES, label="Roles",
-                    value=SYMBOLIC_ROLES   # pre-checked
-                )
-                btn  = gr.Button("Run")
-            with gr.Column():
-                out_json = gr.Textbox(label="Result JSON")
-                out_max  = gr.Textbox(label="Max cos")
-                out_cnt  = gr.Textbox(label="# roles")
-        btn.click(encode_and_trace, [txt,roles], [out_json,out_max,out_cnt])
     return demo
-if __name__=="__main__":
-    build_interface().launch()

+# app.py – FIXED encoder-only demo for bert-beatrix-2048
 # launch:  python app.py
 # -----------------------------------------------
 import json, re, sys, math
 handler, full_model, tokenizer = create_handler_from_checkpoint(LOCAL_CKPT)
 full_model = full_model.eval().cuda()
 # ------------------------------------------------------------------
 # 2. Symbolic roles -------------------------------------------------
 SYMBOLIC_ROLES = [
     "<object_left>", "<object_right>", "<relation>", "<intent>", "<style>",
     "<fabric>", "<jewelry>",
 ]
+# Resolve each symbolic role to its vocabulary id. Roles that the tokenizer
+# maps to the unknown-token id are collected separately and only reported.
+symbolic_token_ids = {}
+missing_tokens = []
+for role in SYMBOLIC_ROLES:
+    role_id = tokenizer.convert_tokens_to_ids(role)
+    if role_id != tokenizer.unk_token_id:
+        symbolic_token_ids[role] = role_id
+        continue
+    missing_tokens.append(role)
+if len(missing_tokens) > 0:
+    print(f"⚠️ Missing symbolic tokens: {missing_tokens}")
+    print("Available tokens will be used for classification")
 MASK = tokenizer.mask_token
+MASK_ID = tokenizer.mask_token_id
+print(f"✅ Loaded {len(symbolic_token_ids)} symbolic tokens")
 # ------------------------------------------------------------------
+# 3. FIXED MLM-based symbolic classification ----------------------
+def get_symbolic_predictions(input_ids, attention_mask, mask_positions, selected_roles):
     """
+    Rank the selected symbolic role tokens at each requested position.
+    Runs one forward pass of `full_model` and, for every index in
+    `mask_positions`, applies a softmax over ONLY the logits of the selected
+    role tokens. The probabilities are therefore relative to that subset,
+    not to the whole vocabulary.
+    Args:
+        input_ids: token-id tensor fed to the model; only batch row 0 is read
+            (assumed to be (1, S) with the mask id at `mask_positions` - TODO confirm)
+        attention_mask: attention mask, passed to the model unchanged
+        mask_positions: sequence indices to score
+        selected_roles: role token strings; entries without an id in
+            `symbolic_token_ids` are ignored
+    Returns:
+        A list with one dict per position,
+        {"position": pos, "predictions": [{"token", "probability", "token_id"}, ...]},
+        where predictions are sorted by descending probability. The list is
+        empty when none of the selected roles has a known id.
     """
+    # Keep only the roles that resolved to a vocabulary id at load time.
+    selected_token_ids = [symbolic_token_ids[role] for role in selected_roles
+                         if role in symbolic_token_ids]
+    if not selected_token_ids:
+        # Always hand back a list: callers iterate the result and index each
+        # item as a dict, which the former `[], []` tuple broke. Checking
+        # before the forward pass also avoids a pointless model call.
+        return []
+    # Get MLM logits from the model (this is what it was trained for)
+    with torch.no_grad():
+        outputs = full_model(input_ids=input_ids, attention_mask=attention_mask)
+        logits = outputs.logits
+    # Token strings aligned index-for-index with selected_token_ids.
+    role_tokens = tokenizer.convert_ids_to_tokens(selected_token_ids)
+    results = []
+    for pos in mask_positions:
+        # Logits of the selected role tokens at this position (batch row 0).
+        symbolic_logits = logits[0, pos][selected_token_ids]
+        symbolic_probs = F.softmax(symbolic_logits, dim=-1)
+        # Plain Python ints/floats keep the result JSON-serialisable.
+        order = torch.argsort(symbolic_probs, descending=True).tolist()
+        probs = symbolic_probs.tolist()
+        results.append({
+            "position": pos,
+            "predictions": [
+                {
+                    "token": role_tokens[i],
+                    "probability": probs[i],
+                    "token_id": selected_token_ids[i],
+                }
+                for i in order
+            ],
+        })
+    return results
+def create_strategic_masks(text, tokenizer, strategy="content_words"):
+    """
+    Tokenize `text` and replace a strategy-chosen subset of tokens with the mask id.
+    Args:
+        text: input text
+        tokenizer: tokenizer used to encode `text`; it is called with
+            return_tensors="pt" and add_special_tokens=True
+        strategy: "content_words", "every_nth", "random" or "manual";
+            "manual" and any unrecognised value select no positions
+    Returns:
+        (masked_input_ids, attention_mask, original_tokens, mask_positions).
+        Both tensors get a leading batch dimension via unsqueeze(0),
+        original_tokens are the unmasked token strings, and mask_positions
+        holds at most 10 indices.
+    """
+    batch = tokenizer(text, return_tensors="pt", add_special_tokens=True)
+    input_ids = batch.input_ids[0]
+    attention_mask = batch.attention_mask[0]
+    # Unmasked token strings, kept so callers can show what was hidden.
+    original_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+    mask_positions = []
+    if strategy == "content_words":
+        # Special tokens, punctuation and common function words are never masked.
+        skip_tokens = {
+            tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token,
+            ".", ",", "!", "?", ":", ";", "'", '"', "-", "(", ")", "[", "]",
+            "the", "a", "an", "and", "or", "but", "in", "on", "at", "to",
+            "for", "of", "with", "by", "is", "are", "was", "were", "be", "been"
+        }
+        for i, token in enumerate(original_tokens):
+            # The lower-cased lookup keeps "The"/"And" out of the mask set when
+            # the vocabulary is cased; the exact lookup covers special tokens.
+            if token in skip_tokens or token.lower() in skip_tokens:
+                continue
+            # Skip subword pieces, very short tokens and anything non-alphabetic.
+            if token.startswith("##") or len(token) <= 2 or not token.isalpha():
+                continue
+            mask_positions.append(i)
+    elif strategy == "every_nth":
+        # Every 3rd index, leaving out the first and last token
+        # (presumably CLS and SEP - depends on the tokenizer).
+        mask_positions = list(range(1, len(original_tokens) - 1, 3))
+    elif strategy == "random":
+        # About 15% of the interior tokens, but at least one when any exist.
+        import random
+        candidates = list(range(1, len(original_tokens) - 1))
+        num_to_mask = max(1, int(len(candidates) * 0.15))
+        mask_positions = sorted(random.sample(candidates, min(num_to_mask, len(candidates))))
+    elif strategy == "manual":
+        # Manual positions are supplied through the UI; nothing is selected here.
+        pass
+    # Limit to a reasonable number of masks for UI clarity.
+    mask_positions = mask_positions[:10]
+    masked_input_ids = input_ids.clone()
+    for pos in mask_positions:
+        masked_input_ids[pos] = MASK_ID
+    return masked_input_ids.unsqueeze(0), attention_mask.unsqueeze(0), original_tokens, mask_positions
 @spaces.GPU
+def symbolic_classification_analysis(text, selected_roles, masking_strategy="content_words", num_predictions=5):
+    """
+    Mask positions in `text` and rank symbolic roles at each one via MLM prediction.
+    Args:
+        text: raw input text
+        selected_roles: role tokens to rank; an empty selection means every
+            role in `symbolic_token_ids`
+        masking_strategy: forwarded to `create_strategic_masks`
+        num_predictions: how many top roles to keep per position
+    Returns:
+        (detailed_json, summary, positions): a JSON string, a human-readable
+        summary and the number of masked positions. On failure the JSON
+        string is {"error": message}, the summary is the message and the
+        count is 0.
+    """
+    def failure(message):
+        # The first output is bound to a gr.JSON component in build_interface,
+        # so it must stay parseable JSON; the readable text goes to the summary.
+        return json.dumps({"error": message}), message, 0
     if not selected_roles:
+        selected_roles = list(symbolic_token_ids.keys())
+    # Drop roles without a vocabulary id up front so the predictor always has
+    # at least one candidate to score.
+    selected_roles = [role for role in selected_roles if role in symbolic_token_ids]
+    if not selected_roles:
+        return failure("None of the selected symbolic roles exist in the tokenizer.")
+    if not text or not text.strip():
+        return failure("Please enter some text to analyze.")
+    try:
+        # Slider values may arrive as floats, and slice bounds must be ints.
+        top_n = int(num_predictions)
+        # Create strategically masked input
+        masked_input_ids, attention_mask, original_tokens, mask_positions = create_strategic_masks(
+            text, tokenizer, masking_strategy
+        )
+        if not mask_positions:
+            return failure("No suitable positions found for masking. Try different text or strategy.")
+        # Move to device and get symbolic predictions
+        predictions = get_symbolic_predictions(
+            masked_input_ids.to("cuda"), attention_mask.to("cuda"), mask_positions, selected_roles
+        )
+        analysis = {
+            "input_text": text,
+            "masking_strategy": masking_strategy,
+            "total_tokens": len(original_tokens),
+            "masked_positions": len(mask_positions),
+            "available_symbolic_roles": len(selected_roles),
+            "analysis_results": []
+        }
+        summary_lines = []
+        max_prob = 0
+        best_prediction = None
+        for pred_data in predictions:
+            pos = pred_data["position"]
+            original_token = original_tokens[pos]
+            # Keep the top N roles, formatted for display
+            top_predictions = []
+            for pred in pred_data["predictions"][:top_n]:
+                prob = pred["probability"]
+                top_predictions.append({
+                    "symbolic_role": pred["token"],
+                    "probability": f"{prob:.4f}",
+                    "confidence": "High" if prob > 0.3 else "Medium" if prob > 0.1 else "Low"
+                })
+            analysis["analysis_results"].append({
+                "position": pos,
+                "original_token": original_token,
+                "top_predictions": top_predictions
+            })
+            if not top_predictions:
+                continue
+            top_pred = top_predictions[0]
+            summary_lines.append(
+                f"Position {pos:2d}: '{original_token}' → {top_pred['symbolic_role']} ({top_pred['probability']}, {top_pred['confidence']})"
+            )
+            # Compare the unrounded probability so ties at four decimals are
+            # not decided by string formatting.
+            top_prob = pred_data["predictions"][0]["probability"]
+            if top_prob > max_prob:
+                max_prob = top_prob
+                best_prediction = f"{top_pred['symbolic_role']} (confidence: {top_pred['confidence']})"
+        summary = "\n".join(summary_lines)
+        if best_prediction:
+            summary = f"🎯 Best Match: {best_prediction}\n\n" + summary
+        return json.dumps(analysis, indent=2), summary, len(mask_positions)
+    except Exception as e:
+        # UI boundary: report the problem instead of crashing the request.
+        error_msg = f"Error during analysis: {str(e)}"
+        print(error_msg)
+        return failure(error_msg)
+# Decorated like symbolic_classification_analysis because this handler also
+# moves tensors to "cuda" (assumes the Space allocates the GPU per decorated call).
+@spaces.GPU
+def create_manual_mask_analysis(text, mask_positions_str, selected_roles):
+    """
+    Mask user-chosen token positions and report the top symbolic role at each.
+    Args:
+        text: raw input text
+        mask_positions_str: comma-separated 0-based token indices; entries
+            that are not plain decimal numbers are ignored
+        selected_roles: role tokens to rank; an empty selection means every
+            role in `symbolic_token_ids`
+    Returns:
+        (results_text, summary, count): one line per analysed position, a
+        short summary and the number of positions analysed. On failure the
+        first element carries the message and the count is 0.
+    """
+    try:
+        if not text or not text.strip():
+            return "Please enter some text to analyze.", "", 0
+        if not selected_roles:
+            selected_roles = list(symbolic_token_ids.keys())
+        # Only roles with a vocabulary id can be scored.
+        selected_roles = [role for role in selected_roles if role in symbolic_token_ids]
+        if not selected_roles:
+            return "None of the selected symbolic roles exist in the tokenizer.", "", 0
+        # Parse mask positions. isdecimal() rejects characters such as "²"
+        # that isdigit() accepts but int() cannot parse; dict.fromkeys drops
+        # repeated positions while keeping their order.
+        mask_positions = list(dict.fromkeys(
+            int(x.strip()) for x in (mask_positions_str or "").split(",") if x.strip().isdecimal()
+        ))
+        if not mask_positions:
+            return "Please specify valid mask positions (comma-separated numbers)", "", 0
+        # Tokenize text
+        batch = tokenizer(text, return_tensors="pt", add_special_tokens=True)
+        input_ids = batch.input_ids[0]
+        attention_mask = batch.attention_mask[0]
+        original_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+        # Keep only positions that fall inside the tokenized sequence
+        valid_positions = [pos for pos in mask_positions if 0 <= pos < len(input_ids)]
+        if not valid_positions:
+            return f"Invalid positions. Text has {len(input_ids)} tokens (0-{len(input_ids)-1})", "", 0
+        # Create masked input
+        masked_input_ids = input_ids.clone()
+        for pos in valid_positions:
+            masked_input_ids[pos] = MASK_ID
+        # Run analysis
+        masked_input_ids = masked_input_ids.unsqueeze(0).to("cuda")
+        attention_mask = attention_mask.unsqueeze(0).to("cuda")
+        predictions = get_symbolic_predictions(
+            masked_input_ids, attention_mask, valid_positions, selected_roles
+        )
+        # Format results: one line with the best role per position
+        results = []
+        for pred_data in predictions:
+            if not pred_data["predictions"]:
+                continue
+            pos = pred_data["position"]
+            top_pred = pred_data["predictions"][0]
+            results.append(
+                f"Pos {pos}: '{original_tokens[pos]}' → {top_pred['token']} ({top_pred['probability']:.4f})"
+            )
+        return "\n".join(results), f"Analyzed {len(valid_positions)} positions", len(valid_positions)
+    except Exception as e:
+        # UI boundary: surface the error text rather than failing the request.
+        return f"Error: {str(e)}", "", 0
 # ------------------------------------------------------------------
+# 4. Gradio UI -----------------------------------------------------
 def build_interface():
+    """
+    Build the Gradio Blocks app with three tabs: automatic analysis, manual
+    masking and a token inspector.
+    Returns:
+        The configured `gr.Blocks` instance; launching is left to the caller.
+    """
+    role_choices = list(symbolic_token_ids.keys())
+    best_match_prefix = "🎯 Best Match: "
+    def run_analysis(text, roles, strategy, top_n):
+        # Adds a fourth output for the "Best Prediction" box, which no event
+        # populated before. The value is read from the summary's first line.
+        detailed, summary, count = symbolic_classification_analysis(text, roles, strategy, top_n)
+        first_line = summary.split("\n", 1)[0]
+        best = first_line[len(best_match_prefix):] if first_line.startswith(best_match_prefix) else ""
+        return detailed, summary, count, best
+    def inspect_tokens(text):
+        if not text or not text.strip():
+            return "Enter text to inspect tokenization"
+        # Encode the same way the masking functions do, so the printed indices
+        # are exactly the positions the Manual Masking tab accepts.
+        token_ids = tokenizer(text, add_special_tokens=True).input_ids
+        tokens = tokenizer.convert_ids_to_tokens(token_ids)
+        return "\n".join(f"{i:2d}: '{token}'" for i, token in enumerate(tokens))
+    with gr.Blocks(title="🧠 MLM Symbolic Classifier", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# 🧠 MLM-Based Symbolic Classification")
+        gr.Markdown("Analyze text using masked language modeling to predict symbolic roles at specific positions.")
+        with gr.Tab("Automatic Analysis"):
+            with gr.Row():
+                with gr.Column():
+                    txt_input = gr.Textbox(
+                        label="Input Text",
+                        lines=4,
+                        placeholder="Enter text to analyze for symbolic role classification..."
+                    )
+                    with gr.Row():
+                        masking_strategy = gr.Dropdown(
+                            choices=["content_words", "every_nth", "random"],
+                            value="content_words",
+                            label="Masking Strategy"
+                        )
+                        num_predictions = gr.Slider(
+                            minimum=1, maximum=10, value=5, step=1,
+                            label="Top Predictions per Position"
+                        )
+                    # `max_choices` was dropped: it is not a CheckboxGroup
+                    # parameter, and it capped nothing (limit == number of choices).
+                    roles_selection = gr.CheckboxGroup(
+                        choices=role_choices,
+                        value=role_choices,
+                        label="Symbolic Roles to Consider"
+                    )
+                    analyze_btn = gr.Button("🔍 Analyze", variant="primary")
+                with gr.Column():
+                    summary_output = gr.Textbox(
+                        label="Analysis Summary",
+                        lines=10,
+                        max_lines=15
+                    )
+                    with gr.Row():
+                        positions_analyzed = gr.Number(label="Positions Analyzed", precision=0)
+                        max_confidence = gr.Textbox(label="Best Prediction", max_lines=1)
+            detailed_output = gr.JSON(label="Detailed Results")
+        with gr.Tab("Manual Masking"):
+            with gr.Row():
+                with gr.Column():
+                    manual_text = gr.Textbox(
+                        label="Input Text",
+                        lines=3,
+                        placeholder="Enter text for manual analysis..."
+                    )
+                    mask_positions_input = gr.Textbox(
+                        label="Mask Positions (comma-separated)",
+                        placeholder="e.g., 2,5,8,12",
+                        info="Specify token positions to mask (0-based indexing)"
+                    )
+                    manual_roles = gr.CheckboxGroup(
+                        choices=role_choices,
+                        value=role_choices[:10],  # Default to first 10
+                        label="Symbolic Roles"
+                    )
+                    manual_analyze_btn = gr.Button("🎯 Analyze Specific Positions")
+                with gr.Column():
+                    manual_results = gr.Textbox(
+                        label="Manual Analysis Results",
+                        lines=8
+                    )
+                    manual_summary = gr.Textbox(label="Summary")
+                    manual_count = gr.Number(label="Positions", precision=0)
+        with gr.Tab("Token Inspector"):
+            with gr.Row():
+                with gr.Column():
+                    inspect_text = gr.Textbox(
+                        label="Text to Inspect",
+                        lines=2,
+                        placeholder="Enter text to see tokenization..."
+                    )
+                    inspect_btn = gr.Button("🔍 Inspect Tokens")
+                with gr.Column():
+                    token_breakdown = gr.Textbox(
+                        label="Token Breakdown",
+                        lines=8,
+                        info="Shows how text is tokenized with position indices"
+                    )
+        # Event handlers
+        analyze_btn.click(
+            run_analysis,
+            inputs=[txt_input, roles_selection, masking_strategy, num_predictions],
+            outputs=[detailed_output, summary_output, positions_analyzed, max_confidence]
+        )
+        manual_analyze_btn.click(
+            create_manual_mask_analysis,
+            inputs=[manual_text, mask_positions_input, manual_roles],
+            outputs=[manual_results, manual_summary, manual_count]
+        )
+        inspect_btn.click(
+            inspect_tokens,
+            inputs=[inspect_text],
+            outputs=[token_breakdown]
+        )
     return demo
+# Script entry point: print a short status banner, then build and launch the UI.
+if __name__ == "__main__":
+    role_names = list(symbolic_token_ids.keys())
+    print("🚀 Starting MLM Symbolic Classifier...")
+    print(f"✅ Model loaded with {len(symbolic_token_ids)} symbolic tokens")
+    print(f"🎯 Available symbolic roles: {role_names[:5]}...")
+    launch_options = {
+        "server_name": "0.0.0.0",
+        "server_port": 7860,
+        "share": True,
+    }
+    demo = build_interface()
+    demo.launch(**launch_options)