Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification | |
| import pandas as pd | |
| import os | |
| import re | |
| from filelock import FileLock | |
| import torch | |
# -----------------------------
# Load Models with Error Handling
# -----------------------------
# All four pipelines are created eagerly when the module is imported. If any
# of them cannot be constructed, the error is printed and re-raised so the app
# does not start with a missing model.
def _build_classifier(task, checkpoint):
    """Return a transformers pipeline for ``task`` backed by ``checkpoint``."""
    return pipeline(task, model=checkpoint)


try:
    # English sentiment model
    english_model = _build_classifier(
        "sentiment-analysis",
        "siebert/sentiment-roberta-large-english",
    )
    # Urdu sentiment model
    urdu_model = _build_classifier(
        "sentiment-analysis",
        "tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu",
    )
    # Roman Urdu sentiment model
    roman_urdu_model = _build_classifier(
        "sentiment-analysis",
        "tahamueed23/roman-urdu-sentiment",
    )
    # Language detection model
    lang_detector = _build_classifier(
        "text-classification",
        "papluca/xlm-roberta-base-language-detection",
    )
    print("✅ All models loaded successfully!")
except Exception as load_error:
    print(f"❌ Error loading models: {load_error}")
    raise
# -----------------------------
# Roman Urdu Word Databases
# -----------------------------
# Lexicons used by the rule-based Roman Urdu helpers in this file. Matching is
# done on whole words, case-insensitively, through the two compiled patterns
# defined at the end of this section.
ROMAN_URDU_POSITIVE_WORDS = {
    'acha', 'achy', 'achay', 'achi', 'behtar', 'zabardast', 'shandaar', 'umdah', 'umda',
    'behtareen', 'kamaal', 'lajawab', 'mazedar', 'khush', 'khushi', 'pasand', 'pasandida',
    'pyaara', 'pyaari', 'dilchasp', 'mufeed', 'pursukoon', 'roshan', 'saaf', 'suthri',
    'tareef', 'targheeb', 'madadgar', 'dostana', 'jawab', 'khoob', 'khoobsurat', 'heran',
    'mast', 'rangeen', 'sundar', 'sohna', 'sohni', 'pyara', 'pyari', 'meetha', 'meethi',
    # 'badtameez' (ill-mannered) is deliberately NOT listed here: it belongs to
    # the negative set only, otherwise one word counts toward both polarities.
    'mitha', 'mithi', 'azhar', 'accha', 'acchi', 'acche',
}
# NOTE(review): this set also contains plain English words ('bad', 'worst',
# 'waste', 'rubbish'); they look intended for code-mixed text - confirm.
ROMAN_URDU_NEGATIVE_WORDS = {
    'kharab', 'bura', 'ganda', 'sust', 'kamzor', 'mushkil', 'naqis', 'namukammal',
    'mayus', 'nakara', 'bekaar', 'bemisi', 'bepanah', 'beparwah', 'behos', 'bekhauf',
    'bekhudi', 'bekhabar', 'bekasoor', 'bekar', 'bemari', 'bezaar', 'badsurat', 'badtameez',
    'kameena', 'nalaiq', 'ghatiya', 'bakwas', 'bewakoof', 'ahmaq', 'murda',
    'zaleel', 'kambakht', 'laanat', 'harami', 'bad', 'worst', 'waste', 'rubbish',
}
ROMAN_URDU_NEUTRAL_WORDS = {
    'hai', 'hain', 'tha', 'thi', 'ho', 'hun', 'hein', 'main', 'tum', 'wo', 'ye', 'unhon',
    'inhon', 'sath', 'lekin', 'kyun', 'jaisa', 'waisa', 'jese', 'wese', 'phir', 'ab', 'toh',
    'ka', 'ki', 'ke', 'ko', 'se', 'mein', 'par', 'aur', 'ya', 'kya', 'kuch', 'sab', 'apna',
}


def _compile_word_pattern(words):
    """Compile a case-insensitive whole-word regex matching any of ``words``.

    Words are escaped and ordered longest-first so the compiled pattern is the
    same on every run, independent of set iteration order. The single
    capturing group makes ``findall`` return the matched words as strings.
    """
    ordered = sorted(words, key=lambda word: (-len(word), word))
    alternation = '|'.join(re.escape(word) for word in ordered)
    return re.compile(r'\b(' + alternation + r')\b', re.IGNORECASE)


# Compile regex patterns for faster matching
roman_urdu_positive_pattern = _compile_word_pattern(ROMAN_URDU_POSITIVE_WORDS)
roman_urdu_negative_pattern = _compile_word_pattern(ROMAN_URDU_NEGATIVE_WORDS)
| # ----------------------------- | |
| # Enhanced Language Detection | |
| # ----------------------------- | |
def detect_language_advanced(text):
    """Detect the language of ``text`` using script rules plus a classifier.

    Detection order:
      1. Empty, whitespace-only or ``None`` input is reported as "English".
      2. Any character in the U+0600-U+06FF block means "Urdu".
      3. Otherwise the first 250 characters go to ``lang_detector``; its label
         is mapped to "Urdu" or "English", and the Roman Urdu heuristics may
         override that with "Roman Urdu".
      4. If the classifier (or the heuristic) raises, the error is printed and
         ``detect_language_fallback`` decides.

    Returns:
        One of "English", "Urdu" or "Roman Urdu".
    """
    text_clean = (text or "").strip()
    if not text_clean:
        return "English"

    # Step 1: Urdu script detection (most reliable)
    if re.search(r'[\u0600-\u06FF]', text_clean):
        return "Urdu"

    # Step 2: Use transformer model for language detection
    try:
        # Truncate very long texts to avoid model limits
        lang_result = lang_detector(text_clean[:250])[0]
        lang_label = str(lang_result['label']).upper()

        # Map model outputs to our language categories. Keys must be upper
        # case because the label is upper-cased above; a mixed-case key such
        # as 'Ro-Ur' could never match.
        lang_map = {
            'UR': 'Urdu',
            'EN': 'English',
            # Hindi often mixed with Roman Urdu
            # NOTE(review): 'HI' assumes the detector emits ISO-639-1 codes - confirm.
            'HI': 'English',
        }
        detected_lang = lang_map.get(lang_label, 'English')

        # Step 3: the mapped result is always Urdu or English, so the Roman
        # Urdu rules are always consulted before trusting the classifier.
        # NOTE(review): an 'UR' label here means Latin-script text judged to be
        # Urdu; it is still returned as "Urdu" - confirm that is intended.
        if is_likely_roman_urdu(text_clean):
            return "Roman Urdu"
        return detected_lang
    except Exception as e:
        print(f"Language detection model error: {e}")
        # Fallback to rule-based detection
        return detect_language_fallback(text_clean)
# Sentence-shape heuristics for Roman Urdu, compiled once at import time.
# Every keyword group is wrapped in \b so that it only matches a whole word:
# without the boundaries, English text such as "two hours" ('wo' ... 'ho') or
# "tell me who" (ends in 'ho') satisfied these rules.
_ROMAN_URDU_STRUCTURE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"^[a-z ]*\b(hai|hain|tha|thi|ho|hun|hein)\b[\s\.\!]*$",
        r"^[a-z ]*\b(main|tum|wo|ye|unhon|inhon)\b[a-z ]*\b(hun|hein|ho|hai)\b[a-z ]*$",
        r"^[a-z ]*\b(acha|bura|kharab|behtar|zabardast)\b[a-z ]*\b(hai|hain|tha)\b[a-z ]*$",
        r"^[a-z ]*\b(kyun|kese|kaise|kisne|kisliye)\b[a-z ]*\?$",
        r"^[a-z ]*\b(bohat|bahut|zyada|zyda)\b[a-z ]+\b(acha|bura|kharab|behtar)\b",
    )
)

# Short grammatical particles counted by rule 3.
_ROMAN_URDU_FUNCTION_WORDS = frozenset(
    ['hai', 'hain', 'tha', 'thi', 'ka', 'ki', 'ke', 'ko', 'se', 'ne']
)


def is_likely_roman_urdu(text):
    """Return True when ``text`` looks like Roman Urdu (Urdu in Latin script).

    Three rules are tried in order and the first one that fires wins:
      1. More than 30% of the words are in the positive/negative lexicons.
      2. The lower-cased text matches one of the sentence-shape patterns.
      3. The text has at most 8 words and at least 2 of them are function
         words such as 'hai', 'ka' or 'ko'.

    Text without any word characters is never Roman Urdu.
    """
    text_lower = text.lower()

    # Count Roman Urdu specific words
    total_hits = (
        len(roman_urdu_positive_pattern.findall(text_lower))
        + len(roman_urdu_negative_pattern.findall(text_lower))
    )

    # Count total words
    words = re.findall(r'\b\w+\b', text_lower)
    total_words = len(words)
    if total_words == 0:
        return False

    # Rule 1: High percentage of Roman Urdu words (30% threshold)
    if total_hits / total_words > 0.3:
        return True

    # Rule 2: Specific Roman Urdu sentence structures
    if any(pattern.search(text_lower) for pattern in _ROMAN_URDU_STRUCTURE_PATTERNS):
        return True

    # Rule 3: Presence of key Roman Urdu function words in a short text
    function_word_count = sum(1 for word in words if word in _ROMAN_URDU_FUNCTION_WORDS)
    return function_word_count >= 2 and total_words <= 8
def detect_language_fallback(text):
    """Rule-based language detection that does not use the classifier model.

    Returns "Urdu" when ``text`` contains a character in U+0600-U+06FF,
    "Roman Urdu" when ``is_likely_roman_urdu`` accepts it, and "English"
    otherwise.
    """
    # Urdu script check
    if re.search(r'[\u0600-\u06FF]', text):
        return "Urdu"
    # Roman Urdu detection
    if is_likely_roman_urdu(text):
        return "Roman Urdu"
    return "English"
| # ----------------------------- | |
| # Roman Urdu Text Processing | |
| # ----------------------------- | |
# Canonical spelling for common Roman Urdu variants, keyed by whole word.
# Only entries that actually change the word are listed.
_ROMAN_URDU_SPELLING_FIXES = {
    'hy': 'hai', 'h': 'hai', 'he': 'hai',
    'nhi': 'nahi', 'nai': 'nahi', 'na': 'nahi',
    'boht': 'bohot', 'bhot': 'bohot', 'bahut': 'bohot',
    'zada': 'zyada', 'zyda': 'zyada',
    'acchi': 'achi', 'acche': 'achay',
    'thy': 'thay',
    'mje': 'mujhe', 'tuje': 'tujhe',
    'usi': 'ussi', 'esi': 'essi',
    'me': 'main', 'mai': 'main', 'tu': 'tum',
    'uss': 'us', 'iss': 'is',
}

# A maximal run of word characters is exactly what r'\bword\b' delimits, so one
# token-level pass gives the same result as one whole-word substitution per
# variant.
_ROMAN_URDU_TOKEN = re.compile(r'\w+')


def normalize_roman_urdu(text):
    """Lower-case and strip ``text`` and canonicalise Roman Urdu spellings.

    Each whole word that is a known variant (for example 'hy', 'nhi', 'boht')
    is replaced by its canonical form ('hai', 'nahi', 'bohot'). All other
    words, punctuation and spacing are left as they are.

    Returns:
        The normalised string.
    """
    text = text.lower().strip()
    return _ROMAN_URDU_TOKEN.sub(
        lambda match: _ROMAN_URDU_SPELLING_FIXES.get(match.group(0), match.group(0)),
        text,
    )
| # ----------------------------- | |
| # Roman Urdu Sentiment Correction | |
| # ----------------------------- | |
# Words treated as decisive evidence by rule 1 below.
_STRONG_POSITIVE_INDICATORS = frozenset(
    ['acha', 'achy', 'achay', 'achi', 'zabardast', 'shandaar', 'kamaal']
)
_STRONG_NEGATIVE_INDICATORS = frozenset(
    ['kharab', 'bura', 'ganda', 'bekaar', 'badtameez']
)
# Negation markers as they appear after normalize_roman_urdu (which rewrites
# 'nhi', 'nai' and 'na' to 'nahi').
_ROMAN_URDU_NEGATIONS = frozenset(['nahi', 'nahin', 'mat'])


def correct_roman_urdu_sentiment(text, current_sentiment, current_score):
    """Override a model prediction when the Roman Urdu lexicon disagrees.

    Args:
        text: The original sentence.
        current_sentiment: "Positive", "Negative" or "Neutral" from a model.
        current_score: The model's confidence for ``current_sentiment``.

    Returns:
        A ``(sentiment, score)`` tuple - either the inputs unchanged or a
        corrected label with a rule-derived confidence.

    Rules, applied in order:
      0. If the sentence contains a negation word the prediction is returned
         unchanged: a lexicon hit such as 'acha' in "acha nahi" does not mean
         the sentence is positive, so word counting is not trusted.
      1. A strong indicator of exactly one polarity flips an opposite
         prediction (confidence at least 0.85).
      2. A strict majority of positive or negative lexicon words flips an
         opposite prediction (confidence 0.8 + 0.05 per word, capped at 0.95).
      3. With at least two lexicon words and a 70/30 split, a prediction that
         is not already on the majority side is replaced (confidence 0.8).
    """
    normalized_text = normalize_roman_urdu(text.lower())
    # Whole-word view of the sentence: substring checks would let 'achi' fire
    # inside an unrelated word such as "machine".
    tokens = set(re.findall(r'\w+', normalized_text))

    # Rule 0: negated sentences are left to the model.
    if not tokens.isdisjoint(_ROMAN_URDU_NEGATIONS):
        return current_sentiment, current_score

    # Count positive and negative words
    positive_count = len(roman_urdu_positive_pattern.findall(normalized_text))
    negative_count = len(roman_urdu_negative_pattern.findall(normalized_text))

    has_strong_positive = not tokens.isdisjoint(_STRONG_POSITIVE_INDICATORS)
    has_strong_negative = not tokens.isdisjoint(_STRONG_NEGATIVE_INDICATORS)

    # Rule 1: only unambiguous evidence may flip the label. When both strong
    # polarities occur the decision is deferred to the counts in rule 2.
    if has_strong_positive != has_strong_negative:
        if has_strong_positive and current_sentiment == "Negative":
            return "Positive", max(current_score, 0.85)
        if has_strong_negative and current_sentiment == "Positive":
            return "Negative", max(current_score, 0.85)

    # Rule 2: Word count based correction
    if positive_count > negative_count and current_sentiment == "Negative":
        return "Positive", min(0.8 + (positive_count * 0.05), 0.95)
    if negative_count > positive_count and current_sentiment == "Positive":
        return "Negative", min(0.8 + (negative_count * 0.05), 0.95)

    # Rule 3: Mixed sentiments with clear majority
    total_sentiment_words = positive_count + negative_count
    if total_sentiment_words >= 2:
        positive_ratio = positive_count / total_sentiment_words
        if positive_ratio >= 0.7 and current_sentiment != "Positive":
            return "Positive", 0.8
        if positive_ratio <= 0.3 and current_sentiment != "Negative":
            return "Negative", 0.8

    return current_sentiment, current_score
| # ----------------------------- | |
| # Enhanced Ensemble for Roman Urdu | |
| # ----------------------------- | |
def ensemble_roman_urdu_sentiment(text):
    """Combine the Roman Urdu and Urdu models into one Roman Urdu prediction.

    Both models score the normalised text; each prediction is passed through
    ``correct_roman_urdu_sentiment``. If the corrected labels agree, that label
    is returned with the higher of the two scores. Otherwise the scores are
    weighted (1.6 for the Roman Urdu model, 1.2 for the Urdu model) and the
    heavier side wins.

    If the ensemble raises, the Roman Urdu model alone is tried; if that also
    raises, a neutral result is returned.

    Returns:
        A dict with keys "label" ("Positive"/"Negative"/"Neutral") and "score".
    """
    # The models only see the first 512 characters, the same limit the
    # English and Urdu paths apply; corrections still use the full sentence.
    model_input = normalize_roman_urdu(text)[:512]
    try:
        # Get predictions from both Roman Urdu and Urdu models
        ru_result = roman_urdu_model(model_input)[0]
        ur_result = urdu_model(model_input)[0]

        # Normalize labels, then apply Roman Urdu corrections to both results
        ru_sentiment_corrected, ru_score_corrected = correct_roman_urdu_sentiment(
            text, normalize_sentiment_label(ru_result["label"]), ru_result["score"]
        )
        ur_sentiment_corrected, ur_score_corrected = correct_roman_urdu_sentiment(
            text, normalize_sentiment_label(ur_result["label"]), ur_result["score"]
        )

        # If both models agree after correction
        if ru_sentiment_corrected == ur_sentiment_corrected:
            final_score = max(ru_score_corrected, ur_score_corrected)
            return {"label": ru_sentiment_corrected, "score": final_score}

        # Weighted voting with higher weight for Roman Urdu model
        ru_weight = ru_score_corrected * 1.6
        ur_weight = ur_score_corrected * 1.2
        if ru_weight > ur_weight:
            return {"label": ru_sentiment_corrected, "score": ru_score_corrected}
        return {"label": ur_sentiment_corrected, "score": ur_score_corrected}
    except Exception as e:
        print(f"Ensemble error: {e}")
        # Fallback to Roman Urdu model with correction
        try:
            result = roman_urdu_model(model_input)[0]
            corrected_sentiment, corrected_score = correct_roman_urdu_sentiment(
                text, normalize_sentiment_label(result["label"]), result["score"]
            )
            return {"label": corrected_sentiment, "score": corrected_score}
        except Exception as fallback_error:
            # Best-effort default, but never silently: a bare ``except`` here
            # would also swallow KeyboardInterrupt/SystemExit and hide the cause.
            print(f"Roman Urdu fallback error: {fallback_error}")
            return {"label": "Neutral", "score": 0.5}
| # ----------------------------- | |
| # Sentiment Analysis Core Functions | |
| # ----------------------------- | |
# Generic Hugging Face class names such as "LABEL_0".
_GENERIC_LABEL_PATTERN = re.compile(r'label[_\-\s]?(\d+)')
# NOTE(review): assumes the three-class 0=negative / 1=neutral / 2=positive
# ordering; a binary checkpoint would need 1 -> Positive. Confirm against each
# model's id2label before relying on this mapping.
_GENERIC_LABEL_SENTIMENT = {0: "Negative", 1: "Neutral", 2: "Positive"}


def normalize_sentiment_label(label):
    """Map a model-specific label to "Positive", "Negative" or "Neutral".

    The label is converted to a string, stripped and lower-cased. Generic
    ``LABEL_<n>`` names are resolved through an index table; previously they
    all matched the substring "lab" and were reported as Positive. Any other
    label containing "pos" is Positive, one containing "neg" is Negative, and
    everything else is Neutral.
    """
    label = str(label).strip().lower()

    generic = _GENERIC_LABEL_PATTERN.fullmatch(label)
    if generic:
        return _GENERIC_LABEL_SENTIMENT.get(int(generic.group(1)), "Neutral")

    # "pos"/"neg" also cover the full words "positive"/"negative".
    if "pos" in label:
        return "Positive"
    if "neg" in label:
        return "Negative"
    return "Neutral"
def get_strong_sentiment_words(text, language):
    """Return up to five distinct sentiment-bearing words found in ``text``.

    Args:
        text: The sentence to inspect.
        language: "Roman Urdu", "Urdu", or anything else (treated as English).

    Returns:
        A list of at most five unique words. The order is deterministic:
        Roman Urdu lists positive matches (in text order) before negative
        ones; Urdu and English follow the order of their built-in word lists.
    """
    text_lower = text.lower()

    if language == "Roman Urdu":
        # Use our Roman Urdu word databases
        strong_words = (
            roman_urdu_positive_pattern.findall(text_lower)
            + roman_urdu_negative_pattern.findall(text_lower)
        )
    elif language == "Urdu":
        # Urdu strong words (you can expand this list); plain substring match.
        urdu_positive = ['زبردست', 'شاندار', 'عمدہ', 'بہترین', 'اچھا']
        urdu_negative = ['خراب', 'برا', 'مایوس کن', 'بیکار']
        strong_words = [word for word in urdu_positive + urdu_negative if word in text]
    else:  # English
        english_positive = ['excellent', 'outstanding', 'amazing', 'wonderful', 'perfect', 'great']
        english_negative = ['terrible', 'awful', 'horrible', 'disappointing', 'poor', 'bad']
        strong_words = [
            word
            for word in english_positive + english_negative
            if re.search(r'\b' + re.escape(word) + r'\b', text_lower)
        ]

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # five words that survive the cut are the same on every run (a set would
    # make both the order and the selection arbitrary).
    return list(dict.fromkeys(strong_words))[:5]
def generate_detailed_explanation(text, sentiment, score, language, strong_words):
    """Build the human-readable explanation shown next to a prediction.

    The message starts with a sentence chosen by ``sentiment`` and a confidence
    tier ("High" from 0.8, "Medium" from 0.6, otherwise "Low"). Roman Urdu
    input adds a note about the rule-based analysis, plus a remark about
    'acha'-style words when the result is Positive. The strong words (if any)
    and the score formatted to three decimals close the message.

    Raises:
        KeyError: if ``sentiment`` is not "Positive", "Negative" or "Neutral".
    """
    if score >= 0.8:
        tier = "High"
    elif score >= 0.6:
        tier = "Medium"
    else:
        tier = "Low"

    templates = {
        "Positive": {
            "High": "Strong positive sentiment with clear positive expressions.",
            "Medium": "Moderately positive sentiment with favorable tone.",
            "Low": "Slightly positive leaning with some positive indicators.",
        },
        "Negative": {
            "High": "Strong negative sentiment with clear criticism.",
            "Medium": "Moderately negative sentiment with critical tone.",
            "Low": "Slightly negative leaning with some concerning indicators.",
        },
        "Neutral": {
            "High": "Clearly neutral or factual statement.",
            "Medium": "Mostly neutral with balanced perspective.",
            "Low": "Weak sentiment leaning neutral.",
        },
    }
    pieces = [templates[sentiment][tier]]

    # Language specific notes
    if language == "Roman Urdu":
        pieces.append(" Analyzed with Roman Urdu specific rules.")
        # Special note for common corrections
        lowered = text.lower()
        mentions_acha = False
        for variant in ('acha', 'achy', 'achay', 'achi'):
            if variant in lowered:
                mentions_acha = True
                break
        if mentions_acha and sentiment == "Positive":
            pieces.append(" Words like 'acha' correctly identified as positive.")

    # Strong words information
    if strong_words:
        pieces.append(f" Key sentiment words: {', '.join(strong_words)}.")

    pieces.append(f" Confidence: {score:.3f}")
    return "".join(pieces)
| # ----------------------------- | |
| # Main Analysis Function | |
| # ----------------------------- | |
# CSV file that stores every analysed sentence, and the lock file guarding it.
SAVE_FILE = "sentiment_logs.csv"
LOCK_FILE = f"{SAVE_FILE}.lock"

# Seed the log with a header-only CSV the first time the app runs.
if not os.path.exists(SAVE_FILE):
    pd.DataFrame(
        columns=[
            "Sentence",
            "Language",
            "Sentiment",
            "Confidence",
            "Strong_Words",
            "Timestamp",
        ]
    ).to_csv(SAVE_FILE, encoding="utf-8-sig", index=False)
# Column order of the CSV log.
_ANALYSIS_LOG_COLUMNS = ["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]


def _existing_log_path():
    """Return SAVE_FILE when it exists on disk, otherwise None.

    The value feeds a file-download output, so a path to a missing file must
    not be handed back.
    """
    return SAVE_FILE if os.path.exists(SAVE_FILE) else None


def _append_analysis_log(text, language, sentiment, score, strong_words_str):
    """Append one analysis result to the CSV log while holding the file lock."""
    new_row = pd.DataFrame(
        [[text, language, sentiment, score, strong_words_str, pd.Timestamp.now()]],
        columns=_ANALYSIS_LOG_COLUMNS,
    )
    with FileLock(LOCK_FILE):
        if os.path.exists(SAVE_FILE):
            df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
        else:
            df = pd.DataFrame(columns=_ANALYSIS_LOG_COLUMNS)
        # Concatenating with an empty frame is unnecessary, so start from the
        # new row in that case.
        df = new_row if df.empty else pd.concat([df, new_row], ignore_index=True)
        df.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")


def analyze_sentiment_complete(text, lang_hint):
    """Run the full pipeline: language detection, sentiment, explanation, logging.

    Args:
        text: The sentence entered by the user.
        lang_hint: "Auto Detect" to run language detection, otherwise the
            language to use ("English", "Urdu"; any other value is handled by
            the Roman Urdu ensemble).

    Returns:
        A 5-tuple ``(sentiment, confidence, explanation, log_file, strong_words)``
        of strings, where ``log_file`` is the CSV path or None when the file
        does not exist. Blank input yields a warning in the first slot; a
        failure during analysis yields ``"Error"``, ``"0"`` and the message.
    """
    if not text or not text.strip():
        return "⚠️ Please enter a sentence.", "", "", _existing_log_path(), ""

    # Detect language
    language = lang_hint if lang_hint != "Auto Detect" else detect_language_advanced(text)

    try:
        # Perform sentiment analysis based on language
        if language == "English":
            result = english_model(text[:512])[0]
            sentiment = normalize_sentiment_label(result["label"])
        elif language == "Urdu":
            result = urdu_model(text[:512])[0]
            sentiment = normalize_sentiment_label(result["label"])
        else:  # Roman Urdu
            result = ensemble_roman_urdu_sentiment(text)
            sentiment = result["label"]
        score = round(float(result["score"]), 3)

        # Get strong words
        strong_words = get_strong_sentiment_words(text, language)
        strong_words_str = ", ".join(strong_words) if strong_words else "None"

        # Generate explanation
        explanation = generate_detailed_explanation(text, sentiment, score, language, strong_words)
    except Exception as e:
        error_msg = f"Analysis error: {str(e)}"
        return "Error", "0", error_msg, _existing_log_path(), ""

    # Save to CSV. Logging is best-effort: a failed write is reported on the
    # console but must not discard a result that was computed successfully.
    try:
        _append_analysis_log(text, language, sentiment, score, strong_words_str)
    except Exception as log_error:
        print(f"Log write error: {log_error}")

    return sentiment, str(score), explanation, _existing_log_path(), strong_words_str
| # ----------------------------- | |
| # Gradio Interface | |
| # ----------------------------- | |
def show_logs():
    """Return the 20 most recent log rows as a DataFrame.

    The CSV is read while holding the same file lock the writer uses, so a
    half-written file is never parsed. A missing or completely empty file
    yields an empty frame with the log's column headers.
    """
    columns = ["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]
    with FileLock(LOCK_FILE):
        if not os.path.exists(SAVE_FILE):
            return pd.DataFrame(columns=columns)
        try:
            df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
        except pd.errors.EmptyDataError:
            # Zero-byte file: nothing to show, not an error for the user.
            return pd.DataFrame(columns=columns)
    return df.tail(20)
def clear_logs():
    """Erase all logged analyses and return an empty log frame.

    The CSV is rewritten with only its header row, under the file lock, rather
    than being deleted. This is the same state the file has when it is first
    created, keeps the download path valid, and cannot interleave with an
    analysis that is appending to the log at the same moment.
    """
    empty_log = pd.DataFrame(
        columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]
    )
    with FileLock(LOCK_FILE):
        empty_log.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")
    return empty_log
# Gradio UI: an input column and a results column on top, the analysis history
# and usage notes below, followed by the button wiring.
# NOTE(review): the captured source carried no indentation, so the nesting
# below is reconstructed from the section comments - confirm in particular
# that `out_exp` belongs directly under the results column.
with gr.Blocks(title="Multilingual Sentiment Analysis") as demo:
    gr.Markdown("""
    # 🌍 Advanced Multilingual Sentiment Analysis
    **English • Urdu • Roman Urdu**
    Uses transformer models for accurate language detection and sentiment analysis with specialized Roman Urdu handling.
    **Used models:**
    - English: siebert/sentiment-roberta-large-english
    - Urdu: tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu
    - Roman Urdu: tahamueed23/roman-urdu-sentiment
    - Language detection: papluca/xlm-roberta-base-language-detection
    """)
    # Top row with two columns
    with gr.Row():
        # Left column - Input section
        with gr.Column(scale=1):
            gr.Markdown("### 📥 Input Section")
            user_text = gr.Textbox(
                label="✍️ Enter Text",
                placeholder="Type in English, Urdu, or Roman Urdu...",
                lines=3
            )
            # "Auto Detect" makes analyze_sentiment_complete run language
            # detection; any other choice is used as the language directly.
            lang_dropdown = gr.Dropdown(
                ["Auto Detect", "English", "Urdu", "Roman Urdu"],
                value="Auto Detect",
                label="🌐 Language Selection"
            )
            with gr.Row():
                btn_analyze = gr.Button("🔍 Analyze Sentiment", variant="primary")
                btn_show = gr.Button("📂 Show Logs")
                btn_clear = gr.Button("🗑️ Clear Logs")
        # Right column - Results section
        with gr.Column(scale=1):
            gr.Markdown("### 📊 Results")
            with gr.Row():
                with gr.Column():
                    out_sent = gr.Textbox(label="🎭 Sentiment")
                    out_conf = gr.Textbox(label="📊 Confidence Score")
                with gr.Column():
                    out_strong = gr.Textbox(label="💪 Strong Words")
                    out_file = gr.File(label="⬇️ Download Logs")
            out_exp = gr.Textbox(label="💡 Detailed Explanation", lines=3)
    # Bottom row with analysis history taking most of the space
    with gr.Row():
        with gr.Column(scale=3):  # Takes more space (75%)
            gr.Markdown("### 📋 Analysis History")
            # Read-only table filled by show_logs / clear_logs.
            logs_df = gr.Dataframe(
                headers=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"],
                label="",
                interactive=False,
                wrap=True
            )
        with gr.Column(scale=1):  # Takes less space (25%)
            gr.Markdown("### ℹ️ Information")
            gr.Markdown("""
            **How to use:**
            1. Enter text in any supported language
            2. Select language or use Auto Detect
            3. Click Analyze Sentiment
            4. View results and history
            **Supported Languages:**
            - English
            - Urdu (Script)
            - Roman Urdu (Latin script)
            **Note:** Auto Detect works best with clear text samples.
            """)
    # Event handlers
    # The outputs list follows the order of the 5-tuple returned by
    # analyze_sentiment_complete: sentiment, confidence, explanation, log file,
    # strong words.
    btn_analyze.click(
        analyze_sentiment_complete,
        inputs=[user_text, lang_dropdown],
        outputs=[out_sent, out_conf, out_exp, out_file, out_strong]
    )
    btn_show.click(show_logs, outputs=[logs_df])
    btn_clear.click(clear_logs, outputs=[logs_df])

# Start the app only when this file is run as a script; no public share link.
if __name__ == "__main__":
    demo.launch(share=False)