import gradio as gr
from transformers import pipeline
import pandas as pd
import os
import re
from filelock import FileLock
# -----------------------------
# Load Models with Error Handling
# -----------------------------
try:
    # English model
    english_model = pipeline(
        "sentiment-analysis",
        model="siebert/sentiment-roberta-large-english",
        tokenizer="siebert/sentiment-roberta-large-english"
    )

    # Urdu model
    urdu_model = pipeline(
        "sentiment-analysis",
        model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
    )

    # Roman Urdu model
    roman_urdu_model = pipeline(
        "sentiment-analysis",
        model="tahamueed23/urdu-roman-urdu-sentiment-cardiffnlp"
    )

    # Language detection model
    lang_detector = pipeline(
        "text-classification",
        model="papluca/xlm-roberta-base-language-detection"
    )
except Exception as e:
    print(f"Error loading models: {e}")
    raise
# -----------------------------
# Enhanced Language Detection
# -----------------------------

# Core Roman Urdu keywords (expanded list)
roman_urdu_core = [
    "acha", "achy", "achay", "khali", "aain", "aram", "aate", "achi", "aik", "asaani",
    "aur", "aj", "aya", "baat", "behas", "behtar", "bohot", "chal", "deh", "dala",
    "dali", "dalta", "deen", "detay", "deta", "deti", "dostana", "di", "diya", "diye",
    "dilchasp", "fori", "gaya", "ganda", "gaye", "hain", "hai", "hi", "hoslaafzai",
    "hoti", "hotay", "hua", "huay", "hue", "hosla", "huin", "hal", "hui",
    "imtihaan", "ja", "kab", "kabhi", "ka", "kam", "karta", "ke", "kesy", "khrab",
    "kharab", "kiya", "kun", "ki", "kamzor", "ko", "kuch", "lamba", "lambe", "liye",
    "madad", "madadgar", "maine", "mehdood", "mein", "mera", "meri", "munsifana",
    "mutaharrik", "munazzam", "mufeed", "mushkil", "mukhtasir", "mutasir", "mukammal",
    "na", "namukammal", "nishistain", "naqis", "nahi", "ne", "nisab", "par", "pasand",
    "paya", "py", "pursukoon", "purani", "purana", "purany", "raha", "roshan", "rakhi",
    "saka", "samajh", "sarah", "se", "shandaar", "seekha", "sust", "saaf", "suthri",
    "tareef", "targheeb", "tez", "tha", "thay", "theen", "tulaba", "thein", "thin",
    "thi", "tor", "tumne", "uljha", "ur", "usne", "ustad", "waqfa", "wala", "wazeh",
    "zyada", "zabardast", "bohat", "kya", "main", "tum", "wo", "ye", "unhon", "inhon"
]

# Compile a whole-word, case-insensitive pattern from the keyword list
roman_urdu_pattern_core = re.compile(r'\b(' + "|".join(roman_urdu_core) + r')\b', re.IGNORECASE)
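# Illustrative example (hypothetical input): the pattern matches whole words only, so
# roman_urdu_pattern_core.findall("ye bohot acha hai") returns
# ['ye', 'bohot', 'acha', 'hai'], while English-only text yields few or no hits.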
def detect_language_enhanced(text):
    """Enhanced language detection using both model and rule-based approaches."""
    if not text.strip():
        return "English"

    text_clean = str(text).strip()

    # Step 1: Urdu script detection (most reliable signal)
    if re.search(r'[\u0600-\u06FF]', text_clean):
        return "Urdu"

    # Step 2: Use the transformer model for language detection
    try:
        lang_result = lang_detector(text_clean[:512])[0]  # limit text length
        lang_label = lang_result['label']
        lang_score = lang_result['score']

        if lang_label == 'ur' and lang_score > 0.7:
            return "Urdu"
        elif lang_label in ['en', 'ur'] and lang_score > 0.6:
            # The detector cannot separate Roman Urdu from English,
            # so check for Roman Urdu keywords explicitly.
            core_hits = len(roman_urdu_pattern_core.findall(text_clean.lower()))
            tokens = re.findall(r'\b\w+\b', text_clean)
            total_tokens = len(tokens)

            # Strong Roman Urdu indicators
            if core_hits >= 2:
                return "Roman Urdu"
            elif core_hits >= 1 and total_tokens <= 6:
                return "Roman Urdu"
            elif core_hits / max(total_tokens, 1) > 0.3:  # >30% Roman Urdu words
                return "Roman Urdu"

        if lang_label == 'en':
            return "English"
        if lang_label == 'ur':
            return "Urdu"
        # Any other detected language: defer to the rule-based fallback
        return detect_language_fallback(text_clean)
    except Exception as e:
        print(f"Language detection error: {e}")
        # Fallback: rule-based detection
        return detect_language_fallback(text_clean)
def detect_language_fallback(text):
    """Fallback language detection using rules only."""
    text_lower = text.lower()

    # Urdu script check
    if re.search(r'[\u0600-\u06FF]', text):
        return "Urdu"

    # Count Roman Urdu core words
    core_hits = len(roman_urdu_pattern_core.findall(text_lower))
    tokens = re.findall(r'\b\w+\b', text_lower)
    total_tokens = len(tokens)

    # Roman Urdu detection rules
    if core_hits >= 2:
        return "Roman Urdu"
    elif core_hits >= 1 and total_tokens <= 5:
        return "Roman Urdu"
    elif core_hits / max(total_tokens, 1) > 0.25:  # 25% threshold
        return "Roman Urdu"

    return "English"
# -----------------------------
# Enhanced Roman Urdu Normalization
# -----------------------------
def normalize_roman_urdu_enhanced(text):
    """Normalize common Roman Urdu spelling variations."""
    text = text.lower().strip()

    # Map frequent spelling variants to canonical forms
    replacements = {
        r'\bhy\b': 'hai',
        r'\bh\b': 'hai',
        r'\bnhi\b': 'nahi',
        r'\bnai\b': 'nahi',
        r'\bna\b': 'nahi',
        r'\bboht\b': 'bohot',
        r'\bbhot\b': 'bohot',
        r'\bzada\b': 'zyada',
        r'\bthy\b': 'thay'
    }

    for pattern, replacement in replacements.items():
        text = re.sub(pattern, replacement, text)

    return text
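# Illustrative example (hypothetical input):
# normalize_roman_urdu_enhanced("Ye film boht achi hy") -> "ye film bohot achi hai"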
# -----------------------------
# Sentiment Analysis Enhancement
# -----------------------------
def get_strong_words(text, language):
    """Extract strong sentiment-bearing words (both positive and negative)."""
    text_lower = text.lower()
    strong_words = []

    # Strong sentiment indicators per language (positive and negative terms)
    strong_word_patterns = {
        'english': [r'excellent', r'outstanding', r'amazing', r'wonderful', r'perfect',
                    r'brilliant', r'fantastic', r'superb', r'terrible', r'awful',
                    r'horrible', r'disappointing', r'poor', r'bad'],
        'urdu': [r'زبردست', r'شاندار', r'عمدہ', r'بہترین', r'خراب', r'برا', r'مایوس کن'],
        'roman_urdu': [r'zabardast', r'shandaar', r'umdah', r'behtareen', r'kharab',
                       r'bura', r'mayus', r'kamaal']
    }

    lang_key = 'english' if language == 'English' else 'urdu' if language == 'Urdu' else 'roman_urdu'

    for pattern in strong_word_patterns[lang_key]:
        matches = re.findall(pattern, text_lower, re.IGNORECASE)
        strong_words.extend(matches)

    return strong_words
def adjust_sentiment_with_context(text, sentiment, score, language):
    """Adjust sentiment based on context and strong words."""
    strong_words = get_strong_words(text, language)

    # If strong words contradict the model's prediction, adjust it
    negative_indicators = ['terrible', 'awful', 'horrible', 'disappointing', 'poor', 'bad',
                           'خراب', 'برا', 'مایوس کن', 'kharab', 'bura', 'mayus']
    positive_indicators = ['excellent', 'outstanding', 'amazing', 'wonderful', 'perfect',
                           'brilliant', 'fantastic', 'superb', 'زبردست', 'شاندار', 'عمدہ',
                           'zabardast', 'shandaar', 'umdah']

    strong_negative_present = any(word in strong_words for word in negative_indicators)
    strong_positive_present = any(word in strong_words for word in positive_indicators)

    # Adjustment rules
    if strong_negative_present and sentiment in ["Positive", "Neutral"] and score < 0.8:
        return "Negative", min(score + 0.2, 0.95)
    elif strong_positive_present and sentiment in ["Negative", "Neutral"] and score < 0.8:
        return "Positive", min(score + 0.2, 0.95)

    # Low-confidence predictions default to Neutral
    if score < 0.6:
        return "Neutral", 0.5

    return sentiment, score
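# Illustrative example (hypothetical input): if the model labels "The course was
# terrible" as Positive with score 0.65, the word "terrible" triggers the first
# rule and the result becomes ("Negative", 0.85).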
# -----------------------------
# Enhanced Ensemble Method
# -----------------------------
def ensemble_roman_urdu_enhanced(text):
    """Enhanced ensemble for Roman Urdu sentiment."""
    normalized_text = normalize_roman_urdu_enhanced(text)

    try:
        ru_result = roman_urdu_model(normalized_text)[0]
        ur_result = urdu_model(normalized_text)[0]

        ru_sent = normalize_label(ru_result["label"])
        ur_sent = normalize_label(ur_result["label"])

        # If both models agree, return the higher-confidence result
        if ru_sent == ur_sent:
            return ru_result if ru_result["score"] >= ur_result["score"] else ur_result

        # On disagreement, weight the Roman Urdu model higher for Roman Urdu text
        ru_weight = ru_result["score"] * 1.3
        ur_weight = ur_result["score"]

        return ru_result if ru_weight >= ur_weight else ur_result
    except Exception as e:
        print(f"Ensemble error: {e}")
        # Fall back to the Roman Urdu model alone
        return roman_urdu_model(normalized_text)[0]
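# Illustrative example (hypothetical scores): if the models disagree with the
# Roman Urdu model at 0.60 and the Urdu model at 0.75, the weighted comparison
# is 0.60 * 1.3 = 0.78 >= 0.75, so the Roman Urdu model's prediction wins.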
# -----------------------------
# Normalize Labels
# -----------------------------
def normalize_label(label):
    """Normalize sentiment labels across different models."""
    label = str(label).lower()
    # Generic "LABEL_*" outputs are mapped on the assumption that they follow
    # the common cardiffnlp convention (LABEL_0 = negative, LABEL_2 = positive);
    # verify this against each model card.
    if "pos" in label or "label_2" in label:
        return "Positive"
    elif "neg" in label or "label_0" in label:
        return "Negative"
    else:
        return "Neutral"
# -----------------------------
# Polarity Explanation
# -----------------------------
def polarity_explanation_enhanced(text, sentiment, score, language):
    """Generate a short explanation of the polarity, confidence, and key words."""
    strong_words = get_strong_words(text, language)

    explanations = {
        "Positive": {
            "high": "Strong positive sentiment with clear praise words.",
            "medium": "Moderately positive with some favorable expressions.",
            "low": "Slightly positive tone."
        },
        "Negative": {
            "high": "Strong negative sentiment with clear criticism.",
            "medium": "Moderately negative with some critical expressions.",
            "low": "Slightly negative tone."
        },
        "Neutral": {
            "high": "Clearly neutral or factual statement.",
            "medium": "Mostly neutral with balanced perspective.",
            "low": "Weak sentiment leaning neutral."
        }
    }

    # Determine confidence level
    if score >= 0.8:
        confidence = "high"
    elif score >= 0.6:
        confidence = "medium"
    else:
        confidence = "low"

    base_explanation = explanations[sentiment][confidence]

    if strong_words:
        base_explanation += f" Key words: {', '.join(strong_words[:3])}."

    return base_explanation
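# Illustrative example (hypothetical call):
# polarity_explanation_enhanced("Amazing course!", "Positive", 0.9, "English")
# -> "Strong positive sentiment with clear praise words. Key words: amazing."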
# -----------------------------
# CSV Setup
# -----------------------------
SAVE_FILE = "sentiment_logs.csv"
LOCK_FILE = SAVE_FILE + ".lock"

if not os.path.exists(SAVE_FILE):
    pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words"]).to_csv(
        SAVE_FILE, index=False, encoding="utf-8-sig"
    )
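# Note: "utf-8-sig" writes a UTF-8 byte-order mark, which helps spreadsheet
# tools such as Excel detect the encoding and render Urdu script correctly.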
# -----------------------------
# Main Analysis Function
# -----------------------------
def analyze_sentiment_enhanced(text, lang_hint):
    """Enhanced sentiment analysis with language detection and context adjustment."""
    if not text.strip():
        return "⚠️ Please enter a sentence.", "", "", SAVE_FILE, ""

    # Language detection (respect an explicit hint from the UI)
    lang = lang_hint if lang_hint != "Auto Detect" else detect_language_enhanced(text)

    try:
        # Sentiment analysis based on language
        if lang == "English":
            result = english_model(text[:512])[0]  # limit text length
        elif lang == "Urdu":
            result = urdu_model(text[:512])[0]
        else:  # Roman Urdu
            result = ensemble_roman_urdu_enhanced(text)

        sentiment = normalize_label(result["label"])
        score = round(float(result["score"]), 3)

        # Context-aware sentiment adjustment
        sentiment, score = adjust_sentiment_with_context(text, sentiment, score, lang)

        # Get strong words and explanation
        strong_words = get_strong_words(text, lang)
        explanation = polarity_explanation_enhanced(text, sentiment, score, lang)
        strong_words_str = ", ".join(strong_words[:5]) if strong_words else "None"

        # Append to the log file under a lock to avoid concurrent writes
        with FileLock(LOCK_FILE):
            df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig") \
                if os.path.exists(SAVE_FILE) else pd.DataFrame(
                    columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words"]
                )
            new_row = pd.DataFrame([[text, lang, sentiment, score, strong_words_str]],
                                   columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words"])
            df = pd.concat([df, new_row], ignore_index=True)
            df.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")

        return sentiment, str(score), explanation, SAVE_FILE, strong_words_str
    except Exception as e:
        error_msg = f"Analysis error: {str(e)}"
        return "Error", "0", error_msg, SAVE_FILE, ""
# -----------------------------
# Show Logs
# -----------------------------
def show_logs():
    if os.path.exists(SAVE_FILE):
        df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
        return df.tail(20)  # show the last 20 entries
    else:
        return pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words"])
# -----------------------------
# Clear Logs
# -----------------------------
def clear_logs():
    if os.path.exists(SAVE_FILE):
        os.remove(SAVE_FILE)
    # Recreate an empty log file so SAVE_FILE always exists for downloads
    empty_df = pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words"])
    empty_df.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")
    return empty_df
# -----------------------------
# Enhanced Gradio UI
# -----------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🌍 Enhanced Multilingual Sentiment Analysis
        **English • Urdu • Roman Urdu**

        Advanced sentiment detection with:
        - 🤖 Transformer-based language detection
        - 🔍 Context-aware sentiment analysis
        - 💪 Strong word extraction
        - 🎯 Enhanced Roman Urdu processing
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            user_text = gr.Textbox(
                label="✍️ Enter Text",
                placeholder="Type in English, Urdu, or Roman Urdu...",
                lines=3
            )
            lang_dropdown = gr.Dropdown(
                ["Auto Detect", "English", "Urdu", "Roman Urdu"],
                value="Auto Detect",
                label="🌐 Language Selection"
            )
            with gr.Row():
                btn_analyze = gr.Button("🔍 Analyze Sentiment", variant="primary")
                btn_show = gr.Button("📂 Show Recent Logs")
                btn_clear = gr.Button("🗑️ Clear Logs", variant="secondary")
        with gr.Column(scale=1):
            out_sent = gr.Textbox(label="🎭 Sentiment")
            out_conf = gr.Textbox(label="📊 Confidence Score")
            out_exp = gr.Textbox(label="💡 Analysis Explanation")
            out_strong = gr.Textbox(label="💪 Strong Words Detected")
            out_file = gr.File(label="⬇️ Download Complete Logs", type="filepath")

    with gr.Row():
        logs_df = gr.Dataframe(
            headers=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words"],
            label="📋 Recent Sentiment Logs",
            interactive=False,
            wrap=True,
            max_height=400
        )

    # Event handlers
    btn_analyze.click(
        analyze_sentiment_enhanced,
        inputs=[user_text, lang_dropdown],
        outputs=[out_sent, out_conf, out_exp, out_file, out_strong]
    )
    btn_show.click(show_logs, outputs=[logs_df])
    btn_clear.click(clear_logs, outputs=[logs_df])

if __name__ == "__main__":
    demo.launch(share=False)