# tahamueed23's picture
# Update app.py
# f864a10 verified
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import os
import re
from filelock import FileLock
import torch
# -----------------------------
# Load Models with Error Handling
# -----------------------------
try:
    # Classifier applied to text that is detected (or selected) as English.
    english_model = pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english")

    # Classifier applied to Urdu text; also the second voter in the
    # Roman Urdu ensemble.
    urdu_model = pipeline("sentiment-analysis", model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu")

    # Primary classifier for Roman Urdu (Urdu written in Latin script).
    roman_urdu_model = pipeline("sentiment-analysis", model="tahamueed23/roman-urdu-sentiment")

    # Text classifier whose label is used as a language code during
    # automatic language detection.
    lang_detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")

    print("✅ All models loaded successfully!")
except Exception as load_error:
    # Report which load failed, then re-raise: nothing below can work
    # without the four pipelines.
    print(f"❌ Error loading models: {load_error}")
    raise
# -----------------------------
# Roman Urdu Word Databases
# -----------------------------
# Roman Urdu words counted as positive sentiment evidence.
# 'badtameez' (ill-mannered) used to be listed here as well as in the
# negative set, which made it count for both polarities; it now lives only
# in ROMAN_URDU_NEGATIVE_WORDS.
ROMAN_URDU_POSITIVE_WORDS = {
    'acha', 'achy', 'achay', 'achi', 'behtar', 'zabardast', 'shandaar', 'umdah', 'umda',
    'behtareen', 'kamaal', 'lajawab', 'mazedar', 'khush', 'khushi', 'pasand', 'pasandida',
    'pyaara', 'pyaari', 'dilchasp', 'mufeed', 'pursukoon', 'roshan', 'saaf', 'suthri',
    'tareef', 'targheeb', 'madadgar', 'dostana', 'jawab', 'khoob', 'khoobsurat', 'heran',
    'mast', 'rangeen', 'sundar', 'sohna', 'sohni', 'pyara', 'pyari', 'meetha', 'meethi',
    'mitha', 'mithi', 'azhar', 'accha', 'acchi', 'acche',
}

# Roman Urdu (plus a few English) words counted as negative sentiment evidence.
ROMAN_URDU_NEGATIVE_WORDS = {
    'kharab', 'bura', 'ganda', 'sust', 'kamzor', 'mushkil', 'naqis', 'namukammal',
    'mayus', 'nakara', 'bekaar', 'bemisi', 'bepanah', 'beparwah', 'behos', 'bekhauf',
    'bekhudi', 'bekhabar', 'bekasoor', 'bekar', 'bemari', 'bezaar', 'badsurat', 'badtameez',
    'kameena', 'nalaiq', 'ghatiya', 'bakwas', 'bewakoof', 'ahmaq', 'murda',
    'zaleel', 'kambakht', 'laanat', 'harami', 'bad', 'worst', 'waste', 'rubbish',
}

# Common Roman Urdu function words that carry no sentiment of their own.
ROMAN_URDU_NEUTRAL_WORDS = {
    'hai', 'hain', 'tha', 'thi', 'ho', 'hun', 'hein', 'main', 'tum', 'wo', 'ye', 'unhon',
    'inhon', 'sath', 'lekin', 'kyun', 'jaisa', 'waisa', 'jese', 'wese', 'phir', 'ab', 'toh',
    'ka', 'ki', 'ke', 'ko', 'se', 'mein', 'par', 'aur', 'ya', 'kya', 'kuch', 'sab', 'apna',
}


def _compile_word_pattern(words):
    """Return a case-insensitive regex matching any of *words* as a whole word.

    The alternatives are escaped and sorted (longest first, then
    alphabetically) so the compiled pattern is identical on every run
    instead of depending on set iteration order.
    """
    ordered = sorted(words, key=lambda word: (-len(word), word))
    alternation = '|'.join(re.escape(word) for word in ordered)
    return re.compile(r'\b(' + alternation + r')\b', re.IGNORECASE)


# Compiled once at import time; reused by every lexicon lookup.
roman_urdu_positive_pattern = _compile_word_pattern(ROMAN_URDU_POSITIVE_WORDS)
roman_urdu_negative_pattern = _compile_word_pattern(ROMAN_URDU_NEGATIVE_WORDS)
# -----------------------------
# Enhanced Language Detection
# -----------------------------
def detect_language_advanced(text):
    """Detect whether *text* is "Urdu", "Roman Urdu" or "English".

    Detection runs from cheapest to most expensive check:

    1. Empty / whitespace-only / ``None`` input is reported as "English".
    2. Any character in the Arabic Unicode block (U+0600-U+06FF) means "Urdu".
    3. The Roman Urdu rules are consulted; a hit always wins over the
       language model, so the model is not run in that case.
    4. Otherwise the language-detection model decides: label ``UR`` maps to
       "Urdu", everything else to "English".

    If the model call fails, the error is printed and the rule-based
    fallback decides instead.

    Args:
        text: The user-supplied sentence.

    Returns:
        One of "Urdu", "Roman Urdu" or "English".
    """
    if not text or not text.strip():
        return "English"
    text_clean = text.strip()

    # Step 1: Urdu script detection (most reliable).
    if re.search(r'[\u0600-\u06FF]', text_clean):
        return "Urdu"

    # Step 2: Roman Urdu rules. They override whatever the model would say,
    # so checking them first saves a transformer inference.
    if is_likely_roman_urdu(text_clean):
        return "Roman Urdu"

    # Step 3: transformer model for the remaining Latin-script text.
    try:
        # Truncate very long texts to avoid model limits.
        lang_result = lang_detector(text_clean[:250])[0]
        lang_label = str(lang_result['label']).upper()
        # Keys are upper-case because the label is upper-cased above.
        lang_map = {
            'UR': 'Urdu',
            'EN': 'English',
            'HI': 'English',  # Hindi often mixed with Roman Urdu
        }
        return lang_map.get(lang_label, 'English')
    except Exception as e:
        print(f"Language detection model error: {e}")
        # Fallback to rule-based detection.
        return detect_language_fallback(text_clean)
# Sentence shapes typical of Roman Urdu. Every keyword group is wrapped in
# \b so it only matches whole words; without the boundaries English words
# such as "should" (contains "ho") or "yes" (contains "ye") trigger them.
_ROMAN_URDU_STRUCTURE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"^[a-z ]*\b(hai|hain|tha|thi|ho|hun|hein)\b[\s\.\!]*$",
        r"^[a-z ]*\b(main|tum|wo|ye|unhon|inhon)\b[a-z ]*\b(hun|hein|ho|hai)\b[a-z ]*$",
        r"^[a-z ]*\b(acha|bura|kharab|behtar|zabardast)\b[a-z ]*\b(hai|hain|tha)\b[a-z ]*$",
        r"^[a-z ]*\b(kyun|kese|kaise|kisne|kisliye)\b[a-z ]*\?$",
        r"^[a-z ]*\b(bohat|bahut|zyada|zyda)\b[a-z ]+\b(acha|bura|kharab|behtar)\b",
    )
)

# Short grammatical words whose presence suggests Roman Urdu.
_ROMAN_URDU_FUNCTION_WORDS = frozenset(
    ['hai', 'hain', 'tha', 'thi', 'ka', 'ki', 'ke', 'ko', 'se', 'ne']
)

# English entries of the sentiment lexicon. They are useful for sentiment
# but say nothing about the language, so they are ignored here.
_ENGLISH_LEXICON_WORDS = frozenset(['bad', 'worst', 'waste', 'rubbish'])


def is_likely_roman_urdu(text):
    """Return True when *text* looks like Roman Urdu (Urdu in Latin script).

    Three rules are tried in order; the first that fires wins:

    1. More than 30% of the words are Roman Urdu sentiment-lexicon words
       (English lexicon entries are not counted).
    2. The lower-cased text matches one of the typical Roman Urdu sentence
       structures.
    3. The text has at most 8 words and at least 2 of them are Roman Urdu
       function words.

    Args:
        text: The sentence to inspect.

    Returns:
        bool: True if any rule fires, False otherwise (including for text
        that contains no words).
    """
    text_lower = text.lower()

    words = re.findall(r'\b\w+\b', text_lower)
    total_words = len(words)
    if total_words == 0:
        return False

    # Rule 1: high percentage of Roman Urdu lexicon words.
    lexicon_hits = (
        roman_urdu_positive_pattern.findall(text_lower)
        + roman_urdu_negative_pattern.findall(text_lower)
    )
    total_hits = sum(1 for hit in lexicon_hits if hit not in _ENGLISH_LEXICON_WORDS)
    if total_hits / total_words > 0.3:  # 30% threshold
        return True

    # Rule 2: specific Roman Urdu sentence structures.
    if any(pattern.search(text_lower) for pattern in _ROMAN_URDU_STRUCTURE_PATTERNS):
        return True

    # Rule 3: several function words in a short sentence.
    function_word_count = sum(1 for word in words if word in _ROMAN_URDU_FUNCTION_WORDS)
    return function_word_count >= 2 and total_words <= 8
def detect_language_fallback(text):
    """Rule-based language detection used when the detection model fails.

    Args:
        text: The sentence to classify.

    Returns:
        "Urdu" if the text contains a character in U+0600-U+06FF,
        "Roman Urdu" if the Roman Urdu rules fire, otherwise "English".
    """
    # Urdu script check.
    if re.search(r'[\u0600-\u06FF]', text):
        return "Urdu"
    # Roman Urdu detection.
    if is_likely_roman_urdu(text):
        return "Roman Urdu"
    return "English"
# -----------------------------
# Roman Urdu Text Processing
# -----------------------------
# Spelling variant -> canonical spelling. Only entries that actually change
# the word are listed. No canonical form is itself a variant, so a single
# substitution pass gives the same result as applying the rules one by one.
_ROMAN_URDU_SPELLING_FIXES = {
    'hy': 'hai', 'h': 'hai', 'he': 'hai',
    'nhi': 'nahi', 'nai': 'nahi', 'na': 'nahi',
    'boht': 'bohot', 'bhot': 'bohot', 'bahut': 'bohot',
    'zada': 'zyada', 'zyda': 'zyada',
    'acchi': 'achi', 'acche': 'achay',
    'thy': 'thay',
    'mje': 'mujhe', 'tuje': 'tujhe',
    'usi': 'ussi', 'esi': 'essi',
    'me': 'main', 'mai': 'main', 'tu': 'tum',
    'uss': 'us', 'iss': 'is',
}

# One whole-word alternation over all variants, compiled once at import.
_ROMAN_URDU_SPELLING_PATTERN = re.compile(
    r'\b('
    + '|'.join(sorted(_ROMAN_URDU_SPELLING_FIXES, key=lambda word: (-len(word), word)))
    + r')\b'
)


def normalize_roman_urdu(text):
    """Lower-case, strip and canonicalise common Roman Urdu spellings.

    Only whole words are rewritten (for example ``hy`` becomes ``hai`` but
    ``shy`` is left alone).

    Args:
        text: Raw Roman Urdu text.

    Returns:
        str: The normalised text.
    """
    text = text.lower().strip()
    return _ROMAN_URDU_SPELLING_PATTERN.sub(
        lambda match: _ROMAN_URDU_SPELLING_FIXES[match.group(1)], text
    )
# -----------------------------
# Roman Urdu Sentiment Correction
# -----------------------------
def correct_roman_urdu_sentiment(text, current_sentiment, current_score):
    """Adjust a model prediction using the Roman Urdu sentiment lexicon.

    The text is normalised, lexicon hits are counted, and three rules are
    applied in order; the first that fires decides the result:

    1. A strong indicator of exactly one polarity contradicts the model:
       flip the label and raise the score to at least 0.85. When strong
       words of both polarities are present the evidence is ambiguous and
       this rule is skipped.
    2. One polarity has more lexicon hits and contradicts the model: flip
       the label with a score of 0.8 + 0.05 per hit, capped at 0.95.
    3. At least two lexicon hits with a 70/30 majority that the model did
       not pick: use the majority label with score 0.8.

    Args:
        text: Original Roman Urdu text.
        current_sentiment: "Positive", "Negative" or "Neutral" from the model.
        current_score: The model's confidence for that label.

    Returns:
        tuple: ``(sentiment, score)`` after correction; the inputs are
        returned unchanged when no rule fires.
    """
    normalized_text = normalize_roman_urdu(text)

    # Count positive and negative lexicon words.
    positive_count = len(roman_urdu_positive_pattern.findall(normalized_text))
    negative_count = len(roman_urdu_negative_pattern.findall(normalized_text))

    strong_positive_indicators = {'acha', 'achy', 'achay', 'achi', 'zabardast', 'shandaar', 'kamaal'}
    strong_negative_indicators = {'kharab', 'bura', 'ganda', 'bekaar', 'badtameez'}

    # Compare whole words, not substrings: a substring test would treat
    # "machine" as containing the strong positive word "achi".
    tokens = set(re.findall(r'\b\w+\b', normalized_text))
    has_strong_positive = not tokens.isdisjoint(strong_positive_indicators)
    has_strong_negative = not tokens.isdisjoint(strong_negative_indicators)

    # Rule 1: an unambiguous strong indicator overrides the model.
    if has_strong_positive != has_strong_negative:
        if has_strong_positive and current_sentiment == "Negative":
            return "Positive", max(current_score, 0.85)
        if has_strong_negative and current_sentiment == "Positive":
            return "Negative", max(current_score, 0.85)

    # Rule 2: word-count based correction.
    if positive_count > negative_count and current_sentiment == "Negative":
        return "Positive", min(0.8 + (positive_count * 0.05), 0.95)
    if negative_count > positive_count and current_sentiment == "Positive":
        return "Negative", min(0.8 + (negative_count * 0.05), 0.95)

    # Rule 3: mixed sentiments with a clear majority.
    total_sentiment_words = positive_count + negative_count
    if total_sentiment_words >= 2:
        positive_ratio = positive_count / total_sentiment_words
        if positive_ratio >= 0.7 and current_sentiment != "Positive":
            return "Positive", 0.8
        if positive_ratio <= 0.3 and current_sentiment != "Negative":
            return "Negative", 0.8

    return current_sentiment, current_score
# -----------------------------
# Enhanced Ensemble for Roman Urdu
# -----------------------------
def ensemble_roman_urdu_sentiment(text):
    """Classify Roman Urdu text with a two-model ensemble plus lexicon rules.

    The Roman Urdu model and the Urdu model both score the normalised text.
    Each prediction is passed through ``correct_roman_urdu_sentiment``. If
    the corrected labels agree, that label is returned with the higher
    score; otherwise the scores are weighted (x1.6 for the Roman Urdu model,
    x1.2 for the Urdu model) and the heavier vote wins.

    If anything in that path fails, the Roman Urdu model alone is tried; if
    that fails too, a neutral result is returned.

    Args:
        text: Roman Urdu text.

    Returns:
        dict: ``{"label": <sentiment>, "score": <float>}``.
    """
    # Same 512-character cap as the English and Urdu paths, so overly long
    # input does not push both models into the error fallback.
    normalized_text = normalize_roman_urdu(text)[:512]
    try:
        # Get predictions from both Roman Urdu and Urdu models.
        ru_result = roman_urdu_model(normalized_text)[0]
        ur_result = urdu_model(normalized_text)[0]

        # Normalize labels, then apply Roman Urdu corrections to both.
        ru_sentiment, ru_score = correct_roman_urdu_sentiment(
            text, normalize_sentiment_label(ru_result["label"]), ru_result["score"]
        )
        ur_sentiment, ur_score = correct_roman_urdu_sentiment(
            text, normalize_sentiment_label(ur_result["label"]), ur_result["score"]
        )

        # Both models agree after correction.
        if ru_sentiment == ur_sentiment:
            return {"label": ru_sentiment, "score": max(ru_score, ur_score)}

        # Weighted voting with higher weight for the Roman Urdu model.
        if ru_score * 1.6 > ur_score * 1.2:
            return {"label": ru_sentiment, "score": ru_score}
        return {"label": ur_sentiment, "score": ur_score}
    except Exception as e:
        print(f"Ensemble error: {e}")
        # Fallback to Roman Urdu model with correction.
        try:
            result = roman_urdu_model(normalized_text)[0]
            corrected_sentiment, corrected_score = correct_roman_urdu_sentiment(
                text, normalize_sentiment_label(result["label"]), result["score"]
            )
            return {"label": corrected_sentiment, "score": corrected_score}
        except Exception as fallback_error:
            # Catch Exception only, so KeyboardInterrupt / SystemExit still
            # propagate, and report why the neutral default is used.
            print(f"Roman Urdu fallback error: {fallback_error}")
            return {"label": "Neutral", "score": 0.5}
# -----------------------------
# Sentiment Analysis Core Functions
# -----------------------------
def normalize_sentiment_label(label):
    """Map a model-specific label to "Positive", "Negative" or "Neutral".

    The label is converted to a lower-case string and searched for marker
    substrings. Positive markers are checked before negative ones; a label
    with neither is reported as "Neutral".

    Args:
        label: Label as returned by a classification pipeline.

    Returns:
        str: "Positive", "Negative" or "Neutral".
    """
    lowered = str(label).lower()
    # NOTE(review): the "lab" marker makes every generic "LABEL_<n>" label
    # (including LABEL_0) count as Positive. Confirm against the label
    # scheme of the models in use before relying on it.
    marker_table = (
        ("Positive", ("pos", "positive", "lab")),
        ("Negative", ("neg", "negative")),
    )
    for sentiment, markers in marker_table:
        for marker in markers:
            if marker in lowered:
                return sentiment
    return "Neutral"
def get_strong_sentiment_words(text, language):
    """Return up to five distinct sentiment-bearing words found in *text*.

    The result is deterministic: duplicates are dropped while keeping
    first-seen order. For Roman Urdu that order is positive matches in text
    order followed by negative matches; for Urdu and English it is the
    order of the built-in word lists (positive words first).

    Args:
        text: The analysed sentence.
        language: "Roman Urdu", "Urdu", or anything else (treated as English).

    Returns:
        list[str]: At most five unique words.
    """
    text_lower = text.lower()

    if language == "Roman Urdu":
        # Use the Roman Urdu word databases.
        strong_words = (
            roman_urdu_positive_pattern.findall(text_lower)
            + roman_urdu_negative_pattern.findall(text_lower)
        )
    elif language == "Urdu":
        # Urdu strong words (this list can be expanded). Matched as plain
        # substrings of the original text.
        urdu_positive = ['زبردست', 'شاندار', 'عمدہ', 'بہترین', 'اچھا']
        urdu_negative = ['خراب', 'برا', 'مایوس کن', 'بیکار']
        strong_words = [word for word in urdu_positive + urdu_negative if word in text]
    else:  # English
        english_positive = ['excellent', 'outstanding', 'amazing', 'wonderful', 'perfect', 'great']
        english_negative = ['terrible', 'awful', 'horrible', 'disappointing', 'poor', 'bad']
        strong_words = [
            word
            for word in english_positive + english_negative
            if re.search(r'\b' + re.escape(word) + r'\b', text_lower)
        ]

    # dict.fromkeys de-duplicates while preserving order; a set would make
    # both the order and the choice of the five survivors vary between runs.
    return list(dict.fromkeys(strong_words))[:5]
def generate_detailed_explanation(text, sentiment, score, language, strong_words):
    """Build the human-readable explanation shown next to a prediction.

    The explanation is assembled from: a base sentence chosen by sentiment
    and confidence band (High >= 0.8, Medium >= 0.6, otherwise Low), an
    optional Roman Urdu note, an optional list of key sentiment words, and
    the score formatted to three decimals.

    Args:
        text: The analysed sentence.
        sentiment: "Positive", "Negative" or "Neutral".
        score: Confidence of the prediction.
        language: Language the text was analysed as.
        strong_words: Sentiment words found in the text (may be empty).

    Returns:
        str: The explanation.

    Raises:
        KeyError: If *sentiment* is not one of the three known labels.
    """
    if score >= 0.8:
        band = "High"
    elif score >= 0.6:
        band = "Medium"
    else:
        band = "Low"

    templates = {
        "Positive": {
            "High": "Strong positive sentiment with clear positive expressions.",
            "Medium": "Moderately positive sentiment with favorable tone.",
            "Low": "Slightly positive leaning with some positive indicators.",
        },
        "Negative": {
            "High": "Strong negative sentiment with clear criticism.",
            "Medium": "Moderately negative sentiment with critical tone.",
            "Low": "Slightly negative leaning with some concerning indicators.",
        },
        "Neutral": {
            "High": "Clearly neutral or factual statement.",
            "Medium": "Mostly neutral with balanced perspective.",
            "Low": "Weak sentiment leaning neutral.",
        },
    }
    pieces = [templates[sentiment][band]]

    # Language-specific notes.
    if language == "Roman Urdu":
        pieces.append(" Analyzed with Roman Urdu specific rules.")
        lowered = text.lower()
        mentions_acha = any(variant in lowered for variant in ('acha', 'achy', 'achay', 'achi'))
        if mentions_acha and sentiment == "Positive":
            pieces.append(" Words like 'acha' correctly identified as positive.")

    # Key sentiment words, when any were found.
    if strong_words:
        pieces.append(f" Key sentiment words: {', '.join(strong_words)}.")

    pieces.append(f" Confidence: {score:.3f}")
    return "".join(pieces)
# -----------------------------
# Main Analysis Function
# -----------------------------
# CSV file that stores every analysed sentence, and the lock file that
# guards writes to it.
SAVE_FILE = "sentiment_logs.csv"
LOCK_FILE = f"{SAVE_FILE}.lock"

# On first start, create the log with only its header row.
if not os.path.exists(SAVE_FILE):
    pd.DataFrame(
        columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]
    ).to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")
def analyze_sentiment_complete(text, lang_hint):
    """Run the full pipeline for one sentence and log the result.

    Steps: pick the language (the hint, or auto-detection when the hint is
    "Auto Detect"), run the matching model(s), collect strong sentiment
    words, build the explanation, and append a row to the CSV log under the
    file lock.

    Args:
        text: Sentence entered by the user.
        lang_hint: "Auto Detect", "English", "Urdu" or "Roman Urdu".

    Returns:
        tuple: ``(sentiment, confidence, explanation, log_path,
        strong_words)``, all strings. Blank input yields a warning in the
        first slot; any failure during analysis or logging yields
        ``("Error", "0", <message>, log_path, "")``.
    """
    if not text.strip():
        return "⚠️ Please enter a sentence.", "", "", SAVE_FILE, ""

    # Detect language unless the user picked one explicitly.
    if lang_hint == "Auto Detect":
        language = detect_language_advanced(text)
    else:
        language = lang_hint

    log_columns = ["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]

    try:
        # Sentiment analysis based on language.
        if language == "English":
            prediction = english_model(text[:512])[0]
            sentiment = normalize_sentiment_label(prediction["label"])
        elif language == "Urdu":
            prediction = urdu_model(text[:512])[0]
            sentiment = normalize_sentiment_label(prediction["label"])
        else:  # Roman Urdu
            prediction = ensemble_roman_urdu_sentiment(text)
            sentiment = prediction["label"]
        score = round(float(prediction["score"]), 3)

        # Strong words and explanation.
        strong_words = get_strong_sentiment_words(text, language)
        strong_words_str = ", ".join(strong_words) if strong_words else "None"
        explanation = generate_detailed_explanation(text, sentiment, score, language, strong_words)

        # Save to CSV: read the current log, append one row, write it back.
        with FileLock(LOCK_FILE):
            if os.path.exists(SAVE_FILE):
                history = pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
            else:
                history = pd.DataFrame(columns=log_columns)
            entry = pd.DataFrame(
                [[text, language, sentiment, score, strong_words_str, pd.Timestamp.now()]],
                columns=log_columns,
            )
            history = pd.concat([history, entry], ignore_index=True)
            history.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")

        return sentiment, str(score), explanation, SAVE_FILE, strong_words_str
    except Exception as e:
        # UI boundary: show the failure in the explanation box instead of
        # raising.
        return "Error", "0", f"Analysis error: {str(e)}", SAVE_FILE, ""
# -----------------------------
# Gradio Interface
# -----------------------------
def show_logs():
    """Return the 20 most recent log rows for the history table.

    Returns:
        pandas.DataFrame: The last 20 rows of the CSV log, or an empty
        frame with the log's columns when the file does not exist.
    """
    if not os.path.exists(SAVE_FILE):
        return pd.DataFrame(
            columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]
        )
    history = pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
    return history.tail(20)
def clear_logs():
    """Delete the CSV log and return an empty history table.

    The file is removed while holding the same file lock the writer uses.
    Without it, an analysis that is between reading and rewriting the log
    would write the cleared history back.

    Returns:
        pandas.DataFrame: An empty frame with the log's columns.
    """
    with FileLock(LOCK_FILE):
        if os.path.exists(SAVE_FILE):
            os.remove(SAVE_FILE)
    return pd.DataFrame(
        columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]
    )
# Build the Gradio UI: an input/results row on top, the analysis history and
# usage notes below, and the button callbacks at the end.
with gr.Blocks(title="Multilingual Sentiment Analysis") as demo:
    # Page header listing the models in use.
    gr.Markdown("""
# 🌍 Advanced Multilingual Sentiment Analysis
**English • Urdu • Roman Urdu**
Uses transformer models for accurate language detection and sentiment analysis with specialized Roman Urdu handling.
**Used models:**
- English: siebert/sentiment-roberta-large-english
- Urdu: tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu
- Roman Urdu: tahamueed23/roman-urdu-sentiment
- Language detection: papluca/xlm-roberta-base-language-detection
""")
    # Top row with two columns
    with gr.Row():
        # Left column - Input section
        with gr.Column(scale=1):
            gr.Markdown("### 📥 Input Section")
            user_text = gr.Textbox(
                label="✍️ Enter Text",
                placeholder="Type in English, Urdu, or Roman Urdu...",
                lines=3
            )
            # "Auto Detect" makes the analysis callback run language
            # detection; any other choice is used as the language directly.
            lang_dropdown = gr.Dropdown(
                ["Auto Detect", "English", "Urdu", "Roman Urdu"],
                value="Auto Detect",
                label="🌐 Language Selection"
            )
            # Action buttons, side by side.
            with gr.Row():
                btn_analyze = gr.Button("🔍 Analyze Sentiment", variant="primary")
                btn_show = gr.Button("📂 Show Logs")
                btn_clear = gr.Button("🗑️ Clear Logs")
        # Right column - Results section
        with gr.Column(scale=1):
            gr.Markdown("### 📊 Results")
            with gr.Row():
                with gr.Column():
                    out_sent = gr.Textbox(label="🎭 Sentiment")
                    out_conf = gr.Textbox(label="📊 Confidence Score")
                with gr.Column():
                    out_strong = gr.Textbox(label="💪 Strong Words")
                    # Receives the path of the CSV log for download.
                    out_file = gr.File(label="⬇️ Download Logs")
            out_exp = gr.Textbox(label="💡 Detailed Explanation", lines=3)
    # Bottom row with analysis history taking most of the space
    with gr.Row():
        with gr.Column(scale=3):  # Takes more space (75%)
            gr.Markdown("### 📋 Analysis History")
            # Read-only table, filled by the Show Logs / Clear Logs buttons.
            logs_df = gr.Dataframe(
                headers=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"],
                label="",
                interactive=False,
                wrap=True
            )
        with gr.Column(scale=1):  # Takes less space (25%)
            gr.Markdown("### ℹ️ Information")
            gr.Markdown("""
**How to use:**
1. Enter text in any supported language
2. Select language or use Auto Detect
3. Click Analyze Sentiment
4. View results and history
**Supported Languages:**
- English
- Urdu (Script)
- Roman Urdu (Latin script)
**Note:** Auto Detect works best with clear text samples.
""")
    # Event handlers
    # The analysis callback returns five values, mapped in order onto the
    # five output components listed here.
    btn_analyze.click(
        analyze_sentiment_complete,
        inputs=[user_text, lang_dropdown],
        outputs=[out_sent, out_conf, out_exp, out_file, out_strong]
    )
    # Both log buttons take no inputs and replace the history table.
    btn_show.click(show_logs, outputs=[logs_df])
    btn_clear.click(clear_logs, outputs=[logs_df])
# Start the web app only when this file is run as a script; share=False
# means no public share link is requested.
if __name__ == "__main__":
    demo.launch(share=False)