Spaces:

tahamueed23
/

Sentiment-Analyzer

Sleeping

App Files Files Community

tahamueed23 committed 25 days ago

Commit

cfae69a

verified ·

1 Parent(s): 0780c88

Update app.py

Browse files

Files changed (1) hide show

app.py +369 -126

app.py CHANGED Viewed

@@ -1,76 +1,256 @@
 import gradio as gr
-from transformers import pipeline
 import pandas as pd
 import os
 import re
 from filelock import FileLock
 # -----------------------------
-# Load Transformer Models
 # -----------------------------
-english_model = pipeline(
-    "sentiment-analysis",
-    model="siebert/sentiment-roberta-large-english"
-)
-urdu_model = pipeline(
-    "sentiment-analysis",
-    model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
-)
-roman_urdu_model = pipeline(
-    "sentiment-analysis",
-    model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
-)
-# -----------------------------
-# CSV Setup
-# -----------------------------
-SAVE_FILE = "sentiment_logs.csv"
-LOCK_FILE = SAVE_FILE + ".lock"
-if not os.path.exists(SAVE_FILE):
-    pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"]).to_csv(
-        SAVE_FILE, index=False, encoding="utf-8-sig"
     )
 # -----------------------------
-# Improved Language Detection
 # -----------------------------
-def detect_language(text):
-    urdu_script = re.compile(r"[\u0600-\u06FF]")
-    if urdu_script.search(text):
-        return "Urdu"
-    roman_urdu_patterns = [
-        r"\b(hai|hain|tha|thi|parhta|parhai|acha|bura|bohot|zabardast)\b",
-        r"\b(sir|madam|ustad|class|parh|samajh)\b",
-    ]
-    text_l = text.lower()
-    for p in roman_urdu_patterns:
-        if re.search(p, text_l):
-            return "Roman Urdu"
     return "English"
 # -----------------------------
-# Roman Urdu Normalization
 # -----------------------------
-def normalize_roman_urdu(text):
-    text = text.lower()
-    text = text.replace("hy", "hai").replace("h", "hai")
-    text = re.sub(r"\bnhi\b|\bnai\b|\bnhi\b", "nahi", text)
     return text
 # -----------------------------
 # Normalize Labels
 # -----------------------------
 def normalize_label(label):
-    label = label.lower()
-    if "pos" in label or "positive" in label:
         return "Positive"
-    elif "neg" in label or "negative" in label:
         return "Negative"
     else:
         return "Neutral"
@@ -78,117 +258,180 @@ def normalize_label(label):
 # -----------------------------
 # Polarity Explanation
 # -----------------------------
-def polarity_explanation(text, sentiment):
     explanations = {
-        "Positive": "Contains praise words or positive evaluation.",
-        "Negative": "Contains criticism or negative expressions.",
-        "Neutral": "Factual statement or balanced observation."
     }
-    return explanations.get(sentiment, "")
 # -----------------------------
-# Ensemble Roman Urdu + Urdu
 # -----------------------------
-def ensemble_roman_urdu(text):
-    ru = roman_urdu_model(text)[0]
-    ur = urdu_model(text)[0]
-    ru_sent, ur_sent = normalize_label(ru["label"]), normalize_label(ur["label"])
-    if ru_sent == ur_sent:
-        return ru if ru["score"] >= ur["score"] else ur
-    # Weight Roman Urdu higher for Roman Urdu input
-    weight_ru = ru["score"] * 1.25
-    weight_ur = ur["score"]
-    return ru if weight_ru >= weight_ur else ur
-# -----------------------------
-# Adjust sentiment if low intensity
-# -----------------------------
-def adjust_for_neutral(text, sentiment, score):
-    if sentiment in ["Positive", "Negative"] and score < 0.7:
-        return "Neutral", score
-    return sentiment, score
 # -----------------------------
 # Main Analysis Function
 # -----------------------------
-def analyze_sentiment(text, lang_hint):
     if not text.strip():
-        return "⚠️ Please enter a sentence.", "", "", SAVE_FILE
-    lang = lang_hint if lang_hint != "Auto Detect" else detect_language(text)
-    if lang == "English":
-        result = english_model(text)[0]
-    elif lang == "Urdu":
-        result = urdu_model(text)[0]
-    else:
-        text = normalize_roman_urdu(text)
-        result = ensemble_roman_urdu(text)
-    sentiment = normalize_label(result["label"])
-    score = round(float(result["score"]), 3)
-    sentiment, score = adjust_for_neutral(text, sentiment, score)
-    explanation = polarity_explanation(text, sentiment)
-    # Save logs
-    with FileLock(LOCK_FILE):
-        df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig") \
-            if os.path.exists(SAVE_FILE) else pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])
-        new_row = pd.DataFrame([[text, lang, sentiment, score]],
-                               columns=["Sentence", "Language", "Sentiment", "Confidence"])
-        df = pd.concat([df, new_row], ignore_index=True)
-        df.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")
-    return sentiment, str(score), explanation, SAVE_FILE
 # -----------------------------
 # Show Logs
 # -----------------------------
 def show_logs():
     if os.path.exists(SAVE_FILE):
-        return pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
     else:
-        return pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])
 # -----------------------------
-# Gradio UI
 # -----------------------------
-with gr.Blocks() as demo:
     gr.Markdown(
-        "## 🌍 Multilingual Sentiment Analysis (English • Urdu • Roman Urdu)\n"
-        "Detect **Positive**, **Negative**, or **Neutral** tone with confidence score.\n\n"
-        "🪶 Improved Roman Urdu normalization + ensemble + polarity explanation.\n"
     )
     with gr.Row():
-        with gr.Column():
-            user_text = gr.Textbox(label="✍️ Enter text", placeholder="Type English, Urdu, or Roman Urdu...")
             lang_dropdown = gr.Dropdown(
                 ["Auto Detect", "English", "Urdu", "Roman Urdu"],
-                value="Auto Detect", label="🌐 Language"
             )
-            btn_analyze = gr.Button("🔍 Analyze Sentiment")
-            btn_show = gr.Button("📂 Show Saved Logs")
-        with gr.Column():
-            out_sent = gr.Textbox(label="Sentiment")
-            out_conf = gr.Textbox(label="Confidence (0–1)")
-            out_exp = gr.Textbox(label="Polarity Explanation")
-            out_file = gr.File(label="⬇️ Download Logs (.csv)", type="filepath")
-    logs_df = gr.Dataframe(
-        headers=["Sentence", "Language", "Sentiment", "Confidence"],
-        label="🧾 Sentiment Logs", interactive=False
-    )
-    btn_analyze.click(analyze_sentiment,
-                      inputs=[user_text, lang_dropdown],
-                      outputs=[out_sent, out_conf, out_exp, out_file])
     btn_show.click(show_logs, outputs=[logs_df])
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
 import pandas as pd
 import os
 import re
 from filelock import FileLock
+import torch
+import numpy as np
 # -----------------------------
+# Load Models with Error Handling
 # -----------------------------
# All pipelines are created eagerly at import time; if any of them cannot be
# loaded the error is printed and re-raised so the app does not start in a
# half-initialised state.
try:
    # English sentiment classifier (tokenizer pinned to the same checkpoint).
    english_model = pipeline(
        "sentiment-analysis",
        model="siebert/sentiment-roberta-large-english",
        tokenizer="siebert/sentiment-roberta-large-english",
    )

    # Sentiment classifier used for Urdu-script text.
    urdu_model = pipeline("sentiment-analysis", model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu")

    # Sentiment classifier used for Roman (Latin-script) Urdu text.
    roman_urdu_model = pipeline("sentiment-analysis", model="tahamueed23/urdu-roman-urdu-sentiment-cardiffnlp")

    # Text classifier whose labels are language codes (e.g. 'en', 'ur').
    lang_detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")
except Exception as load_error:
    print(f"Error loading models: {load_error}")
    raise
 # -----------------------------
+# Enhanced Language Detection
 # -----------------------------
# Roman Urdu vocabulary used as a language-detection cue. The list is kept in
# its original order and content (the repeated "hain" is harmless inside a
# regex alternation).
# NOTE(review): several entries ("hi", "main", "par", "kam") are also common
# English words and can trigger Roman Urdu detection on English input.
roman_urdu_core = """
    acha achy achay khali aain aram aate achi aik asaani
    aur aj aya baat behas behtar bohot chal deh dala
    dali dalta deen detay deta deti dostana di diya diye
    dilchasp fori gaya ganda gaye hain hai hi hoslaafzai
    hoti hotay hua huay hue hosla huin hal hain hui
    imtihaan ja kab kabhi ka kam karta ke kesy khrab
    kharab kiya kun ki kamzor ko kuch lamba lambe liye
    madad madadgar maine mehdood mein mera meri munsifana
    mutaharrik munazzam mufeed mushkil mukhtasir mutasir mukammal
    na namukammal nishistain naqis nahi ne nisab par pasand
    paya py pursukoon purani purana purany raha roshan rakhi
    saka samajh sarah se shandaar seekha sust saaf suthri
    tareef targheeb tez tha thay theen tulaba thein thin
    thi tor tumne uljha ur usne ustad waqfa wala wazeh
    zyada zabardast bohat kya main tum wo ye unhon inhon
""".split()

# Single case-insensitive, whole-word alternation over the vocabulary. The one
# capture group makes findall() return the matched keyword itself.
roman_urdu_pattern_core = re.compile(
    r'\b(' + "|".join(roman_urdu_core) + r')\b',
    re.IGNORECASE,
)
def detect_language_enhanced(text):
    """Classify *text* as "English", "Urdu" or "Roman Urdu".

    Detection order:
      1. Empty / missing input -> "English".
      2. Any Arabic-script character -> "Urdu".
      3. The module-level ``lang_detector`` pipeline, refined with the
         Roman Urdu keyword pattern ``roman_urdu_pattern_core``.
      4. ``detect_language_fallback`` when the detector is inconclusive
         or raises.

    Args:
        text: Input sentence; ``None`` is treated as empty.

    Returns:
        One of "English", "Urdu", "Roman Urdu".
    """
    text_clean = "" if text is None else str(text).strip()
    if not text_clean:
        return "English"

    # Step 1: script detection. Besides the basic Arabic block this also
    # covers the Arabic supplement and presentation-form ranges, so Urdu
    # typed with those code points is still recognised as Urdu script.
    if re.search(r'[\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF]', text_clean):
        return "Urdu"

    # Step 2: model-based detection. From here on the text contains no
    # Urdu-script characters, so an Urdu prediction can only mean Urdu
    # written in Latin letters, i.e. Roman Urdu (returning "Urdu" would route
    # Latin-script text to the Urdu-script branch of the analyser).
    try:
        lang_result = lang_detector(text_clean[:512])[0]  # cap input length
        lang_label = lang_result['label']
        lang_score = lang_result['score']

        if lang_label == 'ur' and lang_score > 0.7:
            return "Roman Urdu"

        if lang_label in ('en', 'ur') and lang_score > 0.6:
            # Roman Urdu is often predicted as English; use keyword density
            # to tell the two apart.
            core_hits = len(roman_urdu_pattern_core.findall(text_clean))
            total_tokens = len(re.findall(r'\b\w+\b', text_clean))
            if (
                core_hits >= 2
                or (core_hits >= 1 and total_tokens <= 6)
                or core_hits / max(total_tokens, 1) > 0.3  # >30% keywords
            ):
                return "Roman Urdu"
            return "English" if lang_label == 'en' else "Roman Urdu"
    except Exception as e:
        # Best effort: a detector failure must not block the analysis.
        print(f"Language detection error: {e}")

    # Step 3: rule-based fallback for every other label / low confidence.
    return detect_language_fallback(text_clean)
def detect_language_fallback(text):
    """Rule-based language guess used when the detector model is unusable.

    Returns "Urdu" if *text* contains an Arabic-block character, "Roman Urdu"
    if enough Roman Urdu keywords are present, otherwise "English".
    """
    lowered = text.lower()

    # Script check runs on the raw text.
    if re.search(r'[\u0600-\u06FF]', text):
        return "Urdu"

    keyword_count = len(roman_urdu_pattern_core.findall(lowered))
    word_count = len(re.findall(r'\b\w+\b', lowered))

    # Any one of: two or more keywords, one keyword in a short sentence
    # (<= 5 words), or keywords making up more than 25% of the words.
    looks_roman_urdu = (
        keyword_count >= 2
        or (keyword_count >= 1 and word_count <= 5)
        or keyword_count / max(word_count, 1) > 0.25
    )
    return "Roman Urdu" if looks_roman_urdu else "English"
 # -----------------------------
+# Enhanced Roman Urdu Normalization
 # -----------------------------
def normalize_roman_urdu_enhanced(text):
    """Lower-case, trim and canonicalise common Roman Urdu spellings.

    Each rule is a whole-word regex and the spelling it is rewritten to;
    rules are applied one after another in the order listed.
    """
    rewrite_rules = (
        (r'\bhy\b', 'hai'),
        (r'\bh\b', 'hai'),
        (r'\bnhi\b', 'nahi'),
        (r'\bnai\b', 'nahi'),
        (r'\bna\b', 'nahi'),
        (r'\bboht\b', 'bohot'),
        (r'\bbhot\b', 'bohot'),
        (r'\bzyada\b', 'zyada'),
        (r'\bzada\b', 'zyada'),
        (r'\bacha\b', 'acha'),
        (r'\bachay\b', 'achay'),
        (r'\bthy\b', 'thay'),
        (r'\bthi\b', 'thi'),
        (r'\btha\b', 'tha'),
    )
    normalized = text.lower().strip()
    for variant, canonical in rewrite_rules:
        normalized = re.sub(variant, canonical, normalized)
    return normalized
+# -----------------------------
+# Sentiment Analysis Enhancement
+# -----------------------------
def get_strong_words(text, language):
    """Return the strong sentiment words (positive or negative) found in *text*.

    Words are matched as whole words only, so "bad" no longer fires inside
    "badge" and 'برا' no longer fires inside 'برابر'. Matching is
    case-insensitive and results are lower-cased.

    Args:
        text: Sentence to scan; ``None`` is treated as empty.
        language: "English", "Urdu", or anything else (treated as Roman Urdu).

    Returns:
        List of matched words, grouped in lexicon order; a word that occurs
        several times is listed once per occurrence.
    """
    # Mixed-polarity lexicon: each list holds both praise and criticism words.
    strong_lexicon = {
        'english': ['excellent', 'outstanding', 'amazing', 'wonderful', 'perfect',
                    'brilliant', 'fantastic', 'superb', 'terrible', 'awful',
                    'horrible', 'disappointing', 'poor', 'bad'],
        'urdu': ['زبردست', 'شاندار', 'عمدہ', 'بہترین', 'خراب', 'برا', 'مایوس کن'],
        'roman_urdu': ['zabardast', 'shandaar', 'umdah', 'behtareen', 'kharab',
                       'bura', 'mayus', 'kamaal'],
    }
    if language == 'English':
        lang_key = 'english'
    elif language == 'Urdu':
        lang_key = 'urdu'
    else:
        lang_key = 'roman_urdu'

    text_lower = (text or "").lower()
    strong_words = []
    for word in strong_lexicon[lang_key]:
        # Lookarounds instead of \b so multi-word entries and non-Latin
        # scripts get the same "not glued to another word character" rule.
        whole_word = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
        strong_words.extend(re.findall(whole_word, text_lower))
    return strong_words
def adjust_sentiment_with_context(text, sentiment, score, language):
    """Post-correct a model prediction using strong sentiment words.

    Rules, in order:
      1. If the model confidence is below 0.8 and the text contains strong
         words of exactly one polarity that contradict the prediction, flip
         to that polarity and raise the score by 0.2 (capped at 0.95).
      2. Otherwise, a score below 0.6 is reported as ("Neutral", 0.5).
      3. Otherwise the prediction is returned unchanged.

    When strong positive AND strong negative words are both present the
    lexicon gives no clear signal, so rule 1 is skipped instead of always
    forcing "Negative".

    Args:
        text: Analysed sentence.
        sentiment: "Positive", "Negative" or "Neutral".
        score: Model confidence for *sentiment*.
        language: Language name forwarded to ``get_strong_words``.

    Returns:
        Tuple ``(sentiment, score)``.
    """
    strong_words = set(get_strong_words(text, language))

    # These sets cover every word the strong-word lexicon can return, so
    # each extracted word counts towards one polarity.
    negative_indicators = {'terrible', 'awful', 'horrible', 'disappointing', 'poor', 'bad',
                           'خراب', 'برا', 'مایوس کن', 'kharab', 'bura', 'mayus'}
    positive_indicators = {'excellent', 'outstanding', 'amazing', 'wonderful', 'perfect',
                           'brilliant', 'fantastic', 'superb',
                           'زبردست', 'شاندار', 'عمدہ', 'بہترین',
                           'zabardast', 'shandaar', 'umdah', 'behtareen', 'kamaal'}

    strong_negative_present = not negative_indicators.isdisjoint(strong_words)
    strong_positive_present = not positive_indicators.isdisjoint(strong_words)

    if score < 0.8 and strong_negative_present != strong_positive_present:
        # Round so the UI never shows float noise such as 0.8999999999.
        boosted = round(min(score + 0.2, 0.95), 3)
        if strong_negative_present and sentiment in ("Positive", "Neutral"):
            return "Negative", boosted
        if strong_positive_present and sentiment in ("Negative", "Neutral"):
            return "Positive", boosted

    # Low-confidence predictions are reported as neutral.
    if score < 0.6:
        return "Neutral", 0.5
    return sentiment, score
+# -----------------------------
+# Enhanced Ensemble Method
+# -----------------------------
def ensemble_roman_urdu_enhanced(text):
    """Combine the Roman Urdu and Urdu models for a Roman Urdu sentence.

    The text is normalised and capped at 512 characters (the same cap the
    English and Urdu paths use). The Roman Urdu model is the primary model:
    if it fails the exception propagates to the caller. The Urdu model is a
    secondary opinion: if it fails, the Roman Urdu result is returned as-is
    rather than running the primary model a second time.

    Args:
        text: Raw Roman Urdu sentence.

    Returns:
        The pipeline result dict (with "label" and "score") of the chosen model.
    """
    normalized_text = normalize_roman_urdu_enhanced(text)[:512]
    ru_result = roman_urdu_model(normalized_text)[0]

    try:
        ur_result = urdu_model(normalized_text)[0]
    except Exception as e:
        print(f"Ensemble error: {e}")
        return ru_result

    # Agreement: keep whichever model is more confident.
    if normalize_label(ru_result["label"]) == normalize_label(ur_result["label"]):
        return ru_result if ru_result["score"] >= ur_result["score"] else ur_result

    # Disagreement: favour the Roman Urdu model by weighting its score 1.3x.
    ru_weight = ru_result["score"] * 1.3
    return ru_result if ru_weight >= ur_result["score"] else ur_result
 # -----------------------------
 # Normalize Labels
 # -----------------------------
 def normalize_label(label):
+    """Normalize sentiment labels across different models"""
+    label = str(label).lower()
+    if any(word in label for word in ["pos", "positive", "positive", "lab"]):
         return "Positive"
+    elif any(word in label for word in ["neg", "negative", "negative"]):
         return "Negative"
     else:
         return "Neutral"
 # -----------------------------
 # Polarity Explanation
 # -----------------------------
def polarity_explanation_enhanced(text, sentiment, score, language):
    """Build a one-line explanation for a sentiment result.

    The wording depends on the sentiment and on a confidence band derived
    from *score* (>= 0.8 high, >= 0.6 medium, otherwise low). Up to three
    strong words found in *text* are appended when any are present.
    """
    key_words = get_strong_words(text, language)

    wording = {
        "Positive": {
            "high": "Strong positive sentiment with clear praise words.",
            "medium": "Moderately positive with some favorable expressions.",
            "low": "Slightly positive tone.",
        },
        "Negative": {
            "high": "Strong negative sentiment with clear criticism.",
            "medium": "Moderately negative with some critical expressions.",
            "low": "Slightly negative tone.",
        },
        "Neutral": {
            "high": "Clearly neutral or factual statement.",
            "medium": "Mostly neutral with balanced perspective.",
            "low": "Weak sentiment leaning neutral.",
        },
    }

    band = "high" if score >= 0.8 else "medium" if score >= 0.6 else "low"
    message = wording[sentiment][band]

    if not key_words:
        return message
    return message + f" Key words: {', '.join(key_words[:3])}."
 # -----------------------------
+# CSV Setup
 # -----------------------------
# Log file and the lock file that guards it.
SAVE_FILE = "sentiment_logs.csv"
LOCK_FILE = f"{SAVE_FILE}.lock"

# Create a header-only log on first start so later reads always find a file.
if not os.path.exists(SAVE_FILE):
    pd.DataFrame(
        columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words"]
    ).to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")
 # -----------------------------
 # Main Analysis Function
 # -----------------------------
def analyze_sentiment_enhanced(text, lang_hint):
    """Analyse one sentence, append the result to the CSV log and return it.

    Args:
        text: Sentence entered by the user; ``None`` or blank input yields a
            prompt message instead of an analysis.
        lang_hint: "English", "Urdu" or "Roman Urdu" to force a language.
            Any other value ("Auto Detect", ``None``, ...) triggers automatic
            detection, so a cleared dropdown no longer silently selects the
            Roman Urdu branch.

    Returns:
        Tuple ``(sentiment, confidence, explanation, log_file, strong_words)``.
        ``log_file`` is ``SAVE_FILE``, or ``None`` on the blank-input and
        error paths when the log does not currently exist, so the download
        component is never handed a missing path.
    """
    if not text or not text.strip():
        log_file = SAVE_FILE if os.path.exists(SAVE_FILE) else None
        return "⚠️ Please enter a sentence.", "", "", log_file, ""

    # Language selection: trust an explicit, supported hint; detect otherwise.
    if lang_hint in ("English", "Urdu", "Roman Urdu"):
        lang = lang_hint
    else:
        lang = detect_language_enhanced(text)

    columns = ["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words"]
    try:
        # Run the model(s) for the chosen language; input is capped at 512
        # characters for the single-model paths.
        if lang == "English":
            result = english_model(text[:512])[0]
        elif lang == "Urdu":
            result = urdu_model(text[:512])[0]
        else:  # Roman Urdu
            result = ensemble_roman_urdu_enhanced(text)

        sentiment = normalize_label(result["label"])
        score = round(float(result["score"]), 3)

        # Lexicon-based correction of the raw prediction.
        sentiment, score = adjust_sentiment_with_context(text, sentiment, score, lang)

        strong_words = get_strong_words(text, lang)
        explanation = polarity_explanation_enhanced(text, sentiment, score, lang)
        strong_words_str = ", ".join(strong_words[:5]) if strong_words else "None"

        # Read-modify-write under the lock; concat (rather than a raw append)
        # also tolerates an existing log whose columns differ.
        with FileLock(LOCK_FILE):
            if os.path.exists(SAVE_FILE):
                df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
            else:
                df = pd.DataFrame(columns=columns)
            new_row = pd.DataFrame(
                [[text, lang, sentiment, score, strong_words_str]],
                columns=columns,
            )
            df = pd.concat([df, new_row], ignore_index=True)
            df.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")

        return sentiment, str(score), explanation, SAVE_FILE, strong_words_str
    except Exception as e:
        # UI boundary: report the failure to the user and to the server log
        # instead of letting the request crash.
        error_msg = f"Analysis error: {str(e)}"
        print(error_msg)
        log_file = SAVE_FILE if os.path.exists(SAVE_FILE) else None
        return "Error", "0", error_msg, log_file, ""
 # -----------------------------
 # Show Logs
 # -----------------------------
 def show_logs():
     if os.path.exists(SAVE_FILE):
+        df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
+        return df.tail(20)  # Show last 20 entries
     else:
+        return pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words"])
 # -----------------------------
+# Clear Logs
+# -----------------------------
def clear_logs():
    """Erase all logged rows and return an empty log frame.

    The log is reset to a header-only CSV while holding the writer lock.
    Compared with deleting the file outside the lock, this cannot interleave
    with an in-progress write, and the download path keeps pointing at an
    existing file.
    """
    empty_log = pd.DataFrame(
        columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words"]
    )
    with FileLock(LOCK_FILE):
        empty_log.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")
    return empty_log
 # -----------------------------
+# Enhanced Gradio UI
+# -----------------------------
# Page layout: inputs on the left, results on the right, recent logs below.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🌍 Enhanced Multilingual Sentiment Analysis
        **English • Urdu • Roman Urdu**
        Advanced sentiment detection with:
        - 🤖 Transformer-based language detection
        - 🔍 Context-aware sentiment analysis
        - 💪 Strong word extraction
        - 🎯 Enhanced Roman Urdu processing
        """
    )

    with gr.Row():
        # Left column: text input, language choice and action buttons.
        with gr.Column(scale=1):
            user_text = gr.Textbox(
                label="✍️ Enter Text",
                placeholder="Type in English, Urdu, or Roman Urdu...",
                lines=3,
            )
            lang_dropdown = gr.Dropdown(
                choices=["Auto Detect", "English", "Urdu", "Roman Urdu"],
                value="Auto Detect",
                label="🌐 Language Selection",
            )
            with gr.Row():
                btn_analyze = gr.Button(value="🔍 Analyze Sentiment", variant="primary")
                btn_show = gr.Button(value="📂 Show Recent Logs")
                btn_clear = gr.Button(value="🗑️ Clear Logs", variant="secondary")

        # Right column: analysis outputs and the downloadable log file.
        with gr.Column(scale=1):
            out_sent = gr.Textbox(label="🎭 Sentiment")
            out_conf = gr.Textbox(label="📊 Confidence Score")
            out_exp = gr.Textbox(label="💡 Analysis Explanation")
            out_strong = gr.Textbox(label="💪 Strong Words Detected")
            out_file = gr.File(label="⬇️ Download Complete Logs", type="filepath")

    with gr.Row():
        logs_df = gr.Dataframe(
            headers=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words"],
            label="📋 Recent Sentiment Logs",
            interactive=False,
            wrap=True,
            max_height=400,
        )

    # Wire buttons to their handlers; the output order matches the tuple
    # returned by analyze_sentiment_enhanced.
    btn_analyze.click(
        fn=analyze_sentiment_enhanced,
        inputs=[user_text, lang_dropdown],
        outputs=[out_sent, out_conf, out_exp, out_file, out_strong],
    )
    btn_show.click(fn=show_logs, outputs=[logs_df])
    btn_clear.click(fn=clear_logs, outputs=[logs_df])

if __name__ == "__main__":
    demo.launch(share=False)