import gradio as gr
from transformers import pipeline
import pandas as pd
import os
import re
from filelock import FileLock

# -----------------------------
# Load Models with Error Handling
# -----------------------------
try:
    # English sentiment model
    english_model = pipeline(
        "sentiment-analysis",
        model="siebert/sentiment-roberta-large-english"
    )

    # Urdu sentiment model
    urdu_model = pipeline(
        "sentiment-analysis",
        model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
    )

    # Roman Urdu sentiment model
    roman_urdu_model = pipeline(
        "sentiment-analysis",
        model="tahamueed23/roman-urdu-sentiment"
    )

    # Language detection model
    lang_detector = pipeline(
        "text-classification",
        model="papluca/xlm-roberta-base-language-detection"
    )

    print("✅ All models loaded successfully!")
except Exception as e:
    print(f"❌ Error loading models: {e}")
    raise

# -----------------------------
# Roman Urdu Word Databases
# -----------------------------
ROMAN_URDU_POSITIVE_WORDS = {
    'acha', 'achy', 'achay', 'achi', 'behtar', 'zabardast', 'shandaar', 'umdah',
    'umda', 'behtareen', 'kamaal', 'lajawab', 'mazedar', 'khush', 'khushi',
    'pasand', 'pasandida', 'pyaara', 'pyaari', 'dilchasp', 'mufeed', 'pursukoon',
    'roshan', 'saaf', 'suthri', 'tareef', 'targheeb', 'madadgar', 'dostana',
    'jawab', 'khoob', 'khoobsurat', 'heran', 'mast', 'rangeen', 'sundar',
    'sohna', 'sohni', 'pyara', 'pyari', 'meetha', 'meethi', 'mitha', 'mithi',
    'azhar', 'accha', 'acchi', 'acche'
}

ROMAN_URDU_NEGATIVE_WORDS = {
    'kharab', 'bura', 'ganda', 'sust', 'kamzor', 'mushkil', 'naqis',
    'namukammal', 'mayus', 'nakara', 'bekaar', 'bemisi', 'bepanah', 'beparwah',
    'behos', 'bekhauf', 'bekhudi', 'bekhabar', 'bekasoor', 'bekar', 'bemari',
    'bezaar', 'badsurat', 'badtameez', 'kameena', 'nalaiq', 'ghatiya', 'bakwas',
    'bewakoof', 'ahmaq', 'murda', 'zaleel', 'kambakht', 'laanat', 'harami',
    'bad', 'worst', 'waste', 'rubbish'
}

ROMAN_URDU_NEUTRAL_WORDS = {
    'hai', 'hain', 'tha', 'thi', 'ho', 'hun', 'hein', 'main', 'tum', 'wo',
    'ye', 'unhon', 'inhon', 'sath', 'lekin', 'kyun', 'jaisa', 'waisa', 'jese',
    'wese', 'phir', 'ab', 'toh', 'ka', 'ki', 'ke', 'ko', 'se', 'mein', 'par',
    'aur', 'ya', 'kya', 'kuch', 'sab', 'apna'
}

# Compile regex patterns once for faster matching
roman_urdu_positive_pattern = re.compile(r'\b(' + '|'.join(ROMAN_URDU_POSITIVE_WORDS) + r')\b', re.IGNORECASE)
roman_urdu_negative_pattern = re.compile(r'\b(' + '|'.join(ROMAN_URDU_NEGATIVE_WORDS) + r')\b', re.IGNORECASE)

# -----------------------------
# Enhanced Language Detection
# -----------------------------
def detect_language_advanced(text):
    """Advanced language detection using model + rules"""
    if not text.strip():
        return "English"

    text_clean = text.strip()

    # Step 1: Urdu script detection (most reliable)
    if re.search(r'[\u0600-\u06FF]', text_clean):
        return "Urdu"

    # Step 2: Use the transformer model for language detection
    try:
        # Truncate very long texts to avoid model limits
        truncated_text = text_clean[:250]
        lang_result = lang_detector(truncated_text)[0]
        lang_label = lang_result['label'].upper()

        # Map model outputs to our language categories
        lang_map = {
            'UR': 'Urdu',
            'EN': 'English',
            'HI': 'English',  # Hindi predictions are often Roman Urdu in Latin script
        }

        detected_lang = lang_map.get(lang_label, 'English')

        # Step 3: For Urdu/English detections, apply Roman Urdu rules
        if detected_lang in ['Urdu', 'English']:
            if is_likely_roman_urdu(text_clean):
                return "Roman Urdu"

        return detected_lang

    except Exception as e:
        print(f"Language detection model error: {e}")
        # Fallback to rule-based detection
        return detect_language_fallback(text_clean)
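# A quick sketch of how the detection cascade behaves (hypothetical inputs;
# results assume the models above loaded successfully):
#
#   detect_language_advanced("یہ بہت اچھا ہے")  # -> "Urdu" (script rule, no model call)
#   detect_language_advanced("ye acha hai")      # -> "Roman Urdu" (word-ratio rule overrides the model)
#   detect_language_advanced("This is great")    # -> "English" (model prediction, no Roman Urdu signals)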
def is_likely_roman_urdu(text):
    """Check if text is likely Roman Urdu using comprehensive rules"""
    text_lower = text.lower()

    # Count Roman Urdu specific words
    positive_hits = len(roman_urdu_positive_pattern.findall(text_lower))
    negative_hits = len(roman_urdu_negative_pattern.findall(text_lower))
    total_hits = positive_hits + negative_hits

    # Count total words
    words = re.findall(r'\b\w+\b', text_lower)
    total_words = len(words)

    if total_words == 0:
        return False

    # Rule 1: High percentage of Roman Urdu words
    roman_urdu_ratio = total_hits / total_words
    if roman_urdu_ratio > 0.3:  # 30% threshold
        return True

    # Rule 2: Specific Roman Urdu sentence structures
    roman_urdu_patterns = [
        r"^[a-z ]*(hai|hain|tha|thi|ho|hun|hein)[\s\.\!]*$",
        r"^[a-z ]*(main|tum|wo|ye|unhon|inhon)[a-z ]*(hun|hein|ho|hai)[a-z ]*$",
        r"^[a-z ]*(acha|bura|kharab|behtar|zabardast)[a-z ]*(hai|hain|tha)[a-z ]*$",
        r"^[a-z ]*(kyun|kese|kaise|kisne|kisliye)[a-z ]*\?$",
        r"^[a-z ]*(bohat|bahut|zyada|zyda)[a-z ]+(acha|bura|kharab|behtar)"
    ]

    for pattern in roman_urdu_patterns:
        if re.search(pattern, text_lower):
            return True

    # Rule 3: Presence of key Roman Urdu function words in short texts
    function_words = ['hai', 'hain', 'tha', 'thi', 'ka', 'ki', 'ke', 'ko', 'se', 'ne']
    function_word_count = sum(1 for word in words if word in function_words)

    if function_word_count >= 2 and total_words <= 8:
        return True

    return False


def detect_language_fallback(text):
    """Rule-based fallback language detection"""
    # Urdu script check
    if re.search(r'[\u0600-\u06FF]', text):
        return "Urdu"

    # Roman Urdu detection
    if is_likely_roman_urdu(text):
        return "Roman Urdu"

    return "English"


# -----------------------------
# Roman Urdu Text Processing
# -----------------------------
def normalize_roman_urdu(text):
    """Normalize Roman Urdu text variations"""
    text = text.lower().strip()

    # Common Roman Urdu spelling variations
    variations = {
        r'\bhy\b': 'hai', r'\bh\b': 'hai', r'\bhe\b': 'hai',
        r'\bnhi\b': 'nahi', r'\bnai\b': 'nahi', r'\bna\b': 'nahi',
        r'\bboht\b': 'bohot', r'\bbhot\b': 'bohot', r'\bbahut\b': 'bohot',
        r'\bzada\b': 'zyada', r'\bzyda\b': 'zyada',
        r'\bacchi\b': 'achi', r'\bacche\b': 'achay',
        r'\bthy\b': 'thay',
        r'\bmje\b': 'mujhe', r'\btuje\b': 'tujhe',
        r'\busi\b': 'ussi', r'\besi\b': 'essi',
        r'\bme\b': 'main', r'\bmai\b': 'main', r'\btu\b': 'tum',
        r'\buss\b': 'us', r'\biss\b': 'is'
    }

    for pattern, replacement in variations.items():
        text = re.sub(pattern, replacement, text)

    return text
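# Example of what the normalizer does with common shorthand (hypothetical input):
#
#   normalize_roman_urdu("Mje ye boht achi lagi hy")
#   # -> "mujhe ye bohot achi lagi hai"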
# -----------------------------
# Roman Urdu Sentiment Correction
# -----------------------------
def correct_roman_urdu_sentiment(text, current_sentiment, current_score):
    """Apply Roman Urdu specific sentiment corrections"""
    normalized_text = normalize_roman_urdu(text.lower())

    # Count positive and negative words
    positive_count = len(roman_urdu_positive_pattern.findall(normalized_text))
    negative_count = len(roman_urdu_negative_pattern.findall(normalized_text))

    # Strong sentiment indicators
    strong_positive_indicators = ['acha', 'achy', 'achay', 'achi', 'zabardast', 'shandaar', 'kamaal']
    strong_negative_indicators = ['kharab', 'bura', 'ganda', 'bekaar', 'badtameez']

    # Rule 1: If text contains strong positive words but the model says
    # negative (or vice versa), correct it
    has_strong_positive = any(indicator in normalized_text for indicator in strong_positive_indicators)
    has_strong_negative = any(indicator in normalized_text for indicator in strong_negative_indicators)

    if has_strong_positive and current_sentiment == "Negative":
        return "Positive", max(current_score, 0.85)
    if has_strong_negative and current_sentiment == "Positive":
        return "Negative", max(current_score, 0.85)

    # Rule 2: Word-count-based correction
    if positive_count > negative_count and current_sentiment == "Negative":
        new_score = min(0.8 + (positive_count * 0.05), 0.95)
        return "Positive", new_score
    if negative_count > positive_count and current_sentiment == "Positive":
        new_score = min(0.8 + (negative_count * 0.05), 0.95)
        return "Negative", new_score

    # Rule 3: Mixed sentiments with a clear majority
    total_sentiment_words = positive_count + negative_count
    if total_sentiment_words >= 2:
        positive_ratio = positive_count / total_sentiment_words
        if positive_ratio >= 0.7 and current_sentiment != "Positive":
            return "Positive", 0.8
        elif positive_ratio <= 0.3 and current_sentiment != "Negative":
            return "Negative", 0.8

    return current_sentiment, current_score


# -----------------------------
# Enhanced Ensemble for Roman Urdu
# -----------------------------
def ensemble_roman_urdu_sentiment(text):
    """Advanced ensemble method for Roman Urdu sentiment analysis"""
    normalized_text = normalize_roman_urdu(text)

    try:
        # Get predictions from both the Roman Urdu and the Urdu model
        ru_result = roman_urdu_model(normalized_text)[0]
        ur_result = urdu_model(normalized_text)[0]

        # Normalize labels
        ru_sentiment = normalize_sentiment_label(ru_result["label"])
        ur_sentiment = normalize_sentiment_label(ur_result["label"])
        ru_score = ru_result["score"]
        ur_score = ur_result["score"]

        # Apply Roman Urdu corrections to both results
        ru_sentiment_corrected, ru_score_corrected = correct_roman_urdu_sentiment(text, ru_sentiment, ru_score)
        ur_sentiment_corrected, ur_score_corrected = correct_roman_urdu_sentiment(text, ur_sentiment, ur_score)

        # If both models agree after correction, take the higher confidence
        if ru_sentiment_corrected == ur_sentiment_corrected:
            final_score = max(ru_score_corrected, ur_score_corrected)
            return {"label": ru_sentiment_corrected, "score": final_score}

        # Weighted voting, with a higher weight for the Roman Urdu model
        ru_weight = ru_score_corrected * 1.6
        ur_weight = ur_score_corrected * 1.2

        if ru_weight > ur_weight:
            return {"label": ru_sentiment_corrected, "score": ru_score_corrected}
        else:
            return {"label": ur_sentiment_corrected, "score": ur_score_corrected}

    except Exception as e:
        print(f"Ensemble error: {e}")
        # Fallback to the Roman Urdu model with correction
        try:
            result = roman_urdu_model(normalized_text)[0]
            corrected_sentiment, corrected_score = correct_roman_urdu_sentiment(
                text, normalize_sentiment_label(result["label"]), result["score"]
            )
            return {"label": corrected_sentiment, "score": corrected_score}
        except Exception:
            return {"label": "Neutral", "score": 0.5}


# -----------------------------
# Sentiment Analysis Core Functions
# -----------------------------
def normalize_sentiment_label(label):
    """Normalize sentiment labels from different models"""
    label = str(label).lower()
    # 'lab' maps raw 'LABEL_*' outputs to Positive
    if any(word in label for word in ["pos", "lab"]):
        return "Positive"
    elif "neg" in label:
        return "Negative"
    else:
        return "Neutral"
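# Hypothetical sanity checks for the two helpers above:
#
#   correct_roman_urdu_sentiment("ye acha hai", "Negative", 0.7)
#   # -> ("Positive", 0.85): Rule 1 flips a model prediction that contradicts
#   #    a strong positive indicator ('acha')
#
#   normalize_sentiment_label("LABEL_1")   # -> "Positive" (via the 'lab' catch-all)
#   normalize_sentiment_label("negative")  # -> "Negative"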
def get_strong_sentiment_words(text, language):
    """Extract strong sentiment-bearing words"""
    text_lower = text.lower()
    strong_words = []

    if language == "Roman Urdu":
        # Use the Roman Urdu word databases
        positive_matches = roman_urdu_positive_pattern.findall(text_lower)
        negative_matches = roman_urdu_negative_pattern.findall(text_lower)
        strong_words = positive_matches + negative_matches

    elif language == "Urdu":
        # Urdu strong words (this list can be expanded)
        urdu_positive = ['زبردست', 'شاندار', 'عمدہ', 'بہترین', 'اچھا']
        urdu_negative = ['خراب', 'برا', 'مایوس کن', 'بیکار']

        for word in urdu_positive + urdu_negative:
            if word in text:
                strong_words.append(word)

    else:  # English
        english_positive = ['excellent', 'outstanding', 'amazing', 'wonderful', 'perfect', 'great']
        english_negative = ['terrible', 'awful', 'horrible', 'disappointing', 'poor', 'bad']

        for word in english_positive + english_negative:
            if re.search(r'\b' + re.escape(word) + r'\b', text_lower):
                strong_words.append(word)

    return list(set(strong_words))[:5]  # Unique words, at most 5


def generate_detailed_explanation(text, sentiment, score, language, strong_words):
    """Generate a detailed explanation for the sentiment analysis"""
    confidence_level = "High" if score >= 0.8 else "Medium" if score >= 0.6 else "Low"

    base_explanations = {
        "Positive": {
            "High": "Strong positive sentiment with clear positive expressions.",
            "Medium": "Moderately positive sentiment with favorable tone.",
            "Low": "Slightly positive leaning with some positive indicators."
        },
        "Negative": {
            "High": "Strong negative sentiment with clear criticism.",
            "Medium": "Moderately negative sentiment with critical tone.",
            "Low": "Slightly negative leaning with some concerning indicators."
        },
        "Neutral": {
            "High": "Clearly neutral or factual statement.",
            "Medium": "Mostly neutral with balanced perspective.",
            "Low": "Weak sentiment leaning neutral."
        }
    }

    explanation = base_explanations[sentiment][confidence_level]

    # Add language-specific notes
    if language == "Roman Urdu":
        explanation += " Analyzed with Roman Urdu specific rules."

        # Special note for common corrections
        if any(word in text.lower() for word in ['acha', 'achy', 'achay', 'achi']):
            if sentiment == "Positive":
                explanation += " Words like 'acha' correctly identified as positive."

    # Add strong-words information
    if strong_words:
        explanation += f" Key sentiment words: {', '.join(strong_words)}."

    explanation += f" Confidence: {score:.3f}"

    return explanation
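# Continuing the hypothetical case above, the generated explanation would read
# (shown wrapped; the actual return value is a single string):
#
#   generate_detailed_explanation("ye acha hai", "Positive", 0.85, "Roman Urdu", ["acha"])
#   # -> "Strong positive sentiment with clear positive expressions.
#   #     Analyzed with Roman Urdu specific rules. Words like 'acha' correctly
#   #     identified as positive. Key sentiment words: acha. Confidence: 0.850"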
# -----------------------------
# Main Analysis Function
# -----------------------------
SAVE_FILE = "sentiment_logs.csv"
LOCK_FILE = SAVE_FILE + ".lock"

LOG_COLUMNS = ["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]

if not os.path.exists(SAVE_FILE):
    pd.DataFrame(columns=LOG_COLUMNS).to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")


def analyze_sentiment_complete(text, lang_hint):
    """Complete sentiment analysis pipeline"""
    if not text.strip():
        return "⚠️ Please enter a sentence.", "", "", SAVE_FILE, ""

    # Detect language
    language = lang_hint if lang_hint != "Auto Detect" else detect_language_advanced(text)

    try:
        # Perform sentiment analysis based on language
        # (text[:512] is a rough character-level guard against over-long inputs)
        if language == "English":
            result = english_model(text[:512])[0]
            sentiment = normalize_sentiment_label(result["label"])
            score = round(float(result["score"]), 3)
        elif language == "Urdu":
            result = urdu_model(text[:512])[0]
            sentiment = normalize_sentiment_label(result["label"])
            score = round(float(result["score"]), 3)
        else:  # Roman Urdu
            result = ensemble_roman_urdu_sentiment(text)
            sentiment = result["label"]
            score = round(float(result["score"]), 3)

        # Get strong words
        strong_words = get_strong_sentiment_words(text, language)
        strong_words_str = ", ".join(strong_words) if strong_words else "None"

        # Generate explanation
        explanation = generate_detailed_explanation(text, sentiment, score, language, strong_words)

        # Save to CSV under a file lock so concurrent requests don't clobber the log
        with FileLock(LOCK_FILE):
            df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig") if os.path.exists(SAVE_FILE) else pd.DataFrame(columns=LOG_COLUMNS)
            new_row = pd.DataFrame(
                [[text, language, sentiment, score, strong_words_str, pd.Timestamp.now()]],
                columns=LOG_COLUMNS
            )
            df = pd.concat([df, new_row], ignore_index=True)
            df.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")

        return sentiment, str(score), explanation, SAVE_FILE, strong_words_str

    except Exception as e:
        error_msg = f"Analysis error: {str(e)}"
        return "Error", "0", error_msg, SAVE_FILE, ""


# -----------------------------
# Gradio Interface
# -----------------------------
def show_logs():
    """Return the 20 most recent log entries."""
    if os.path.exists(SAVE_FILE):
        df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
        return df.tail(20)
    return pd.DataFrame(columns=LOG_COLUMNS)


def clear_logs():
    """Delete the log file and return an empty history table."""
    if os.path.exists(SAVE_FILE):
        os.remove(SAVE_FILE)
    return pd.DataFrame(columns=LOG_COLUMNS)
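# The pipeline can also be exercised without the UI, e.g. from a REPL
# (hypothetical call; one row is appended to sentiment_logs.csv):
#
#   sentiment, confidence, explanation, csv_path, strong_words = \
#       analyze_sentiment_complete("ye acha hai", "Auto Detect")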
with gr.Blocks(title="Multilingual Sentiment Analysis") as demo:
    gr.Markdown("""
    # 🌍 Advanced Multilingual Sentiment Analysis
    **English • Urdu • Roman Urdu**

    Uses transformer models for accurate language detection and sentiment
    analysis, with specialized Roman Urdu handling.

    **Models used:**
    - English: siebert/sentiment-roberta-large-english
    - Urdu: tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu
    - Roman Urdu: tahamueed23/roman-urdu-sentiment
    - Language detection: papluca/xlm-roberta-base-language-detection
    """)

    # Top row with two columns
    with gr.Row():
        # Left column: input section
        with gr.Column(scale=1):
            gr.Markdown("### 📥 Input Section")
            user_text = gr.Textbox(
                label="✍️ Enter Text",
                placeholder="Type in English, Urdu, or Roman Urdu...",
                lines=3
            )
            lang_dropdown = gr.Dropdown(
                ["Auto Detect", "English", "Urdu", "Roman Urdu"],
                value="Auto Detect",
                label="🌐 Language Selection"
            )

            with gr.Row():
                btn_analyze = gr.Button("🔍 Analyze Sentiment", variant="primary")
                btn_show = gr.Button("📂 Show Logs")
                btn_clear = gr.Button("🗑️ Clear Logs")

        # Right column: results section
        with gr.Column(scale=1):
            gr.Markdown("### 📊 Results")
            with gr.Row():
                with gr.Column():
                    out_sent = gr.Textbox(label="🎭 Sentiment")
                    out_conf = gr.Textbox(label="📊 Confidence Score")
                with gr.Column():
                    out_strong = gr.Textbox(label="💪 Strong Words")
                    out_file = gr.File(label="⬇️ Download Logs")
            out_exp = gr.Textbox(label="💡 Detailed Explanation", lines=3)

    # Bottom row, with the analysis history taking most of the space
    with gr.Row():
        with gr.Column(scale=3):  # ~75% of the width
            gr.Markdown("### 📋 Analysis History")
            logs_df = gr.Dataframe(
                headers=LOG_COLUMNS,
                label="",
                interactive=False,
                wrap=True
            )
        with gr.Column(scale=1):  # ~25% of the width
            gr.Markdown("### ℹ️ Information")
            gr.Markdown("""
            **How to use:**
            1. Enter text in any supported language
            2. Select a language or use Auto Detect
            3. Click Analyze Sentiment
            4. View the results and history

            **Supported languages:**
            - English
            - Urdu (Arabic script)
            - Roman Urdu (Latin script)

            **Note:** Auto Detect works best with clear text samples.
            """)

    # Event handlers
    btn_analyze.click(
        analyze_sentiment_complete,
        inputs=[user_text, lang_dropdown],
        outputs=[out_sent, out_conf, out_exp, out_file, out_strong]
    )
    btn_show.click(show_logs, outputs=[logs_df])
    btn_clear.click(clear_logs, outputs=[logs_df])


if __name__ == "__main__":
    demo.launch(share=False)
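# To try it locally, save this file (e.g. as app.py) and run:
#
#   python app.py
#
# Gradio serves the UI on http://127.0.0.1:7860 by default; pass share=True
# to demo.launch() for a temporary public link.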