Spaces:

tahamueed23
/

Sentiment-Analyzer

Sleeping

App Files Files Community

Sentiment-Analyzer / app.py

tahamueed23

Update app.py

f864a10 verified 18 days ago

raw

history blame contribute delete

21.6 kB

	import gradio as gr
	from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
	import pandas as pd
	import os
	import re
	from filelock import FileLock
	import torch

	# -----------------------------
	# Load Models with Error Handling
	# -----------------------------
	try:
	    # One Hugging Face pipeline per task; all four are built eagerly at
	    # import time so a missing or broken model stops the app immediately.

	    # English sentiment classifier
	    english_model = pipeline(
	        task="sentiment-analysis",
	        model="siebert/sentiment-roberta-large-english",
	    )

	    # Urdu sentiment classifier
	    urdu_model = pipeline(
	        task="sentiment-analysis",
	        model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu",
	    )

	    # Roman Urdu sentiment classifier
	    roman_urdu_model = pipeline(
	        task="sentiment-analysis",
	        model="tahamueed23/roman-urdu-sentiment",
	    )

	    # Language identification classifier
	    lang_detector = pipeline(
	        task="text-classification",
	        model="papluca/xlm-roberta-base-language-detection",
	    )

	    print("✅ All models loaded successfully!")

	except Exception as load_error:
	    # Report the failure, then re-raise: nothing below works without models.
	    print(f"❌ Error loading models: {load_error}")
	    raise

	# -----------------------------
	# Roman Urdu Word Databases
	# -----------------------------
	# Roman Urdu lexicons used for rule-based sentiment cues.
	# NOTE: a word must live in exactly one polarity set, otherwise a single
	# occurrence is counted as both a positive and a negative hit.
	ROMAN_URDU_POSITIVE_WORDS = {
	    'acha', 'achy', 'achay', 'achi', 'behtar', 'zabardast', 'shandaar', 'umdah', 'umda',
	    'behtareen', 'kamaal', 'lajawab', 'mazedar', 'khush', 'khushi', 'pasand', 'pasandida',
	    'pyaara', 'pyaari', 'dilchasp', 'mufeed', 'pursukoon', 'roshan', 'saaf', 'suthri',
	    'tareef', 'targheeb', 'madadgar', 'dostana', 'jawab', 'khoob', 'khoobsurat', 'heran',
	    'mast', 'rangeen', 'sundar', 'sohna', 'sohni', 'pyara', 'pyari', 'meetha', 'meethi',
	    'mitha', 'mithi', 'azhar', 'accha', 'acchi', 'acche',
	}

	ROMAN_URDU_NEGATIVE_WORDS = {
	    'kharab', 'bura', 'ganda', 'sust', 'kamzor', 'mushkil', 'naqis', 'namukammal',
	    'mayus', 'nakara', 'bekaar', 'bemisi', 'bepanah', 'beparwah', 'behos', 'bekhauf',
	    'bekhudi', 'bekhabar', 'bekasoor', 'bekar', 'bemari', 'bezaar', 'badsurat', 'badtameez',
	    'kameena', 'nalaiq', 'ghatiya', 'bakwas', 'bewakoof', 'ahmaq', 'murda',
	    'zaleel', 'kambakht', 'laanat', 'harami', 'bad', 'worst', 'waste', 'rubbish',
	}

	ROMAN_URDU_NEUTRAL_WORDS = {
	    'hai', 'hain', 'tha', 'thi', 'ho', 'hun', 'hein', 'main', 'tum', 'wo', 'ye', 'unhon',
	    'inhon', 'sath', 'lekin', 'kyun', 'jaisa', 'waisa', 'jese', 'wese', 'phir', 'ab', 'toh',
	    'ka', 'ki', 'ke', 'ko', 'se', 'mein', 'par', 'aur', 'ya', 'kya', 'kuch', 'sab', 'apna',
	}


	def _compile_word_pattern(words):
	    """Return a case-insensitive regex matching any of ``words`` as a whole word."""
	    # Sorting makes the compiled pattern deterministic (set order is not);
	    # re.escape keeps any future entry with regex metacharacters literal.
	    alternatives = '|'.join(re.escape(word) for word in sorted(words))
	    return re.compile(r'\b(' + alternatives + r')\b', re.IGNORECASE)


	# Compiled once at import time; reused by detection and sentiment correction.
	roman_urdu_positive_pattern = _compile_word_pattern(ROMAN_URDU_POSITIVE_WORDS)
	roman_urdu_negative_pattern = _compile_word_pattern(ROMAN_URDU_NEGATIVE_WORDS)

	# -----------------------------
	# Enhanced Language Detection
	# -----------------------------
	def detect_language_advanced(text):
	    """Classify ``text`` as "English", "Urdu" or "Roman Urdu".

	    Detection order:
	      1. Blank or missing input defaults to "English".
	      2. Any character in the Arabic Unicode block (U+0600-U+06FF) means "Urdu".
	      3. Otherwise the language-detection pipeline labels the first 250
	         characters, and the Roman Urdu rules may override that label.

	    Args:
	        text: Raw user input (may be ``None`` or blank).

	    Returns:
	        One of "English", "Urdu" or "Roman Urdu".
	    """
	    if not text or not text.strip():
	        return "English"

	    text_clean = text.strip()

	    # Step 1: Urdu script detection (most reliable)
	    if re.search(r'[\u0600-\u06FF]', text_clean):
	        return "Urdu"

	    # Step 2: transformer-based detection on a truncated sample, to stay
	    # inside the model's input limits for very long texts.
	    try:
	        lang_result = lang_detector(text_clean[:250])[0]
	        # Labels are compared upper-cased, so every key below must be
	        # upper-case too or it can never match.
	        lang_label = str(lang_result['label']).upper()

	        # Anything not listed falls back to "English". Hindi stays on that
	        # path because it is often mixed up with Roman Urdu; the rule check
	        # below makes the final call.
	        # NOTE(review): assumes the detector emits ISO 639-1 codes
	        # ('ur', 'en', 'hi') - confirm against the model card.
	        lang_map = {
	            'UR': 'Urdu',
	            'EN': 'English',
	            'HI': 'English',
	        }
	        detected_lang = lang_map.get(lang_label, 'English')

	        # Step 3: Latin-script text may still be Roman Urdu whatever the
	        # model said, so the rule-based check has the last word.
	        if is_likely_roman_urdu(text_clean):
	            return "Roman Urdu"

	        return detected_lang

	    except Exception as e:
	        print(f"Language detection model error: {e}")
	        # Fallback to rule-based detection
	        return detect_language_fallback(text_clean)

	# Sentence shapes typical of Roman Urdu. Keywords are word-bounded so that
	# English words merely containing them ("who", "home", "would") do not match.
	_ROMAN_URDU_STRUCTURE_PATTERNS = tuple(
	    re.compile(pattern)
	    for pattern in (
	        # ... ends with a copula/auxiliary
	        r"^[a-z ]*\b(hai|hain|tha|thi|ho|hun|hein)[\s.!]*$",
	        # pronoun ... auxiliary
	        r"^[a-z ]*\b(main|tum|wo|ye|unhon|inhon)\b[a-z ]*\b(hun|hein|ho|hai)\b[a-z ]*$",
	        # adjective ... auxiliary
	        r"^[a-z ]*\b(acha|bura|kharab|behtar|zabardast)\b[a-z ]*\b(hai|hain|tha)\b[a-z ]*$",
	        # question word ... ?
	        r"^[a-z ]*\b(kyun|kese|kaise|kisne|kisliye)\b[a-z ]*\?$",
	        # intensifier followed by an adjective
	        r"^[a-z ]*\b(bohat|bahut|zyada|zyda)\b[a-z ]+\b(acha|bura|kharab|behtar)\b",
	    )
	)

	# Short grammatical words whose presence signals Roman Urdu.
	_ROMAN_URDU_FUNCTION_WORDS = frozenset(
	    ('hai', 'hain', 'tha', 'thi', 'ka', 'ki', 'ke', 'ko', 'se', 'ne')
	)


	def is_likely_roman_urdu(text):
	    """Heuristically decide whether Latin-script ``text`` is Roman Urdu.

	    The text qualifies when any one rule fires:
	      1. More than 30% of its words are in the positive/negative lexicons.
	      2. It matches one of the Roman Urdu sentence-structure patterns.
	      3. It has at most 8 words and at least 2 of them are function words.

	    Args:
	        text: Input string; case is ignored.

	    Returns:
	        True when the text looks like Roman Urdu, False otherwise
	        (including when the text contains no words at all).
	    """
	    text_lower = text.lower()

	    words = re.findall(r'\b\w+\b', text_lower)
	    total_words = len(words)
	    if total_words == 0:
	        return False

	    # Rule 1: high share of known Roman Urdu sentiment words
	    total_hits = (
	        len(roman_urdu_positive_pattern.findall(text_lower))
	        + len(roman_urdu_negative_pattern.findall(text_lower))
	    )
	    if total_hits / total_words > 0.3:  # 30% threshold
	        return True

	    # Rule 2: characteristic sentence structures
	    if any(pattern.search(text_lower) for pattern in _ROMAN_URDU_STRUCTURE_PATTERNS):
	        return True

	    # Rule 3: several function words in a short sentence
	    function_word_count = sum(1 for word in words if word in _ROMAN_URDU_FUNCTION_WORDS)
	    return function_word_count >= 2 and total_words <= 8

	def detect_language_fallback(text):
	    """Rule-based language detection used when the detection model fails.

	    Args:
	        text: Input string to classify.

	    Returns:
	        "Urdu" if the text has any character in U+0600-U+06FF, "Roman Urdu"
	        if the Roman Urdu heuristics accept it, otherwise "English".
	    """
	    # Urdu script check
	    if re.search(r'[\u0600-\u06FF]', text):
	        return "Urdu"

	    # Roman Urdu detection; everything else is treated as English
	    return "Roman Urdu" if is_likely_roman_urdu(text) else "English"

	# -----------------------------
	# Roman Urdu Text Processing
	# -----------------------------
	# Common Roman Urdu spelling variants mapped to one canonical spelling.
	# Only real rewrites are listed; words already canonical need no entry.
	_ROMAN_URDU_SPELLING_FIXES = {
	    'hy': 'hai', 'h': 'hai', 'he': 'hai',
	    'nhi': 'nahi', 'nai': 'nahi', 'na': 'nahi',
	    'boht': 'bohot', 'bhot': 'bohot', 'bahut': 'bohot',
	    'zada': 'zyada', 'zyda': 'zyada',
	    'acchi': 'achi', 'acche': 'achay',
	    'thy': 'thay',
	    'mje': 'mujhe', 'tuje': 'tujhe',
	    'usi': 'ussi', 'esi': 'essi',
	    'me': 'main', 'mai': 'main', 'tu': 'tum',
	    'uss': 'us', 'iss': 'is',
	}

	# One whole-word alternation, so the text is scanned a single time. The
	# trailing \b forces a full-word match, so alternative order is irrelevant.
	_ROMAN_URDU_SPELLING_PATTERN = re.compile(
	    r'\b(' + '|'.join(re.escape(word) for word in _ROMAN_URDU_SPELLING_FIXES) + r')\b'
	)


	def normalize_roman_urdu(text):
	    """Lower-case, strip and canonicalise Roman Urdu spelling variants.

	    Args:
	        text: Roman Urdu input string.

	    Returns:
	        The lower-cased, stripped text with each whole-word spelling variant
	        replaced by its canonical form; all other words are left unchanged.
	    """
	    text = text.lower().strip()
	    return _ROMAN_URDU_SPELLING_PATTERN.sub(
	        lambda match: _ROMAN_URDU_SPELLING_FIXES[match.group(0)], text
	    )

	# -----------------------------
	# Roman Urdu Sentiment Correction
	# -----------------------------
	def correct_roman_urdu_sentiment(text, current_sentiment, current_score):
	    """Adjust a model's sentiment for Roman Urdu text using lexicon rules.

	    Rules are tried in order; the first that fires decides the result:
	      1. An unambiguous strong indicator word contradicts the model's label.
	      2. The lexicon word counts contradict the model's label.
	      3. At least two sentiment words with a 70/30 majority disagree with
	         the model's label (this is the only rule that can change "Neutral").

	    Args:
	        text: Original Roman Urdu text.
	        current_sentiment: "Positive", "Negative" or "Neutral" from the model.
	        current_score: The model's confidence for that label.

	    Returns:
	        ``(sentiment, score)`` - the corrected pair, or the inputs unchanged
	        when no rule applies.
	    """
	    normalized_text = normalize_roman_urdu(text)

	    # Count positive and negative lexicon words
	    positive_count = len(roman_urdu_positive_pattern.findall(normalized_text))
	    negative_count = len(roman_urdu_negative_pattern.findall(normalized_text))

	    strong_positive_indicators = {'acha', 'achy', 'achay', 'achi', 'zabardast', 'shandaar', 'kamaal'}
	    strong_negative_indicators = {'kharab', 'bura', 'ganda', 'bekaar', 'badtameez'}

	    # Compare whole words, not substrings: 'achi' must not fire on "machine".
	    tokens = set(re.findall(r'\w+', normalized_text))
	    has_strong_positive = not tokens.isdisjoint(strong_positive_indicators)
	    has_strong_negative = not tokens.isdisjoint(strong_negative_indicators)

	    # Rule 1: a strong indicator overrides the model only when the opposite
	    # strong polarity is absent; with both present the evidence conflicts
	    # and the count-based rules below decide instead.
	    if has_strong_positive and not has_strong_negative and current_sentiment == "Negative":
	        return "Positive", max(current_score, 0.85)

	    if has_strong_negative and not has_strong_positive and current_sentiment == "Positive":
	        return "Negative", max(current_score, 0.85)

	    # Rule 2: word-count based correction, confidence capped at 0.95
	    if positive_count > negative_count and current_sentiment == "Negative":
	        return "Positive", min(0.8 + (positive_count * 0.05), 0.95)

	    if negative_count > positive_count and current_sentiment == "Positive":
	        return "Negative", min(0.8 + (negative_count * 0.05), 0.95)

	    # Rule 3: mixed sentiments with a clear majority
	    total_sentiment_words = positive_count + negative_count
	    if total_sentiment_words >= 2:
	        positive_ratio = positive_count / total_sentiment_words

	        if positive_ratio >= 0.7 and current_sentiment != "Positive":
	            return "Positive", 0.8
	        if positive_ratio <= 0.3 and current_sentiment != "Negative":
	            return "Negative", 0.8

	    return current_sentiment, current_score

	# -----------------------------
	# Enhanced Ensemble for Roman Urdu
	# -----------------------------
	def ensemble_roman_urdu_sentiment(text):
	    """Combine the Roman Urdu and Urdu models into one sentiment verdict.

	    Both models score the normalised text, and each result goes through the
	    lexicon-based correction. If the corrected labels agree, that label is
	    returned with the higher score; otherwise the label with the larger
	    weighted score wins (Roman Urdu x1.6, Urdu x1.2).

	    Args:
	        text: Roman Urdu input text.

	    Returns:
	        ``{"label": str, "score": float}``. If the ensemble fails, the Roman
	        Urdu model alone is used; if that fails too, the result is
	        ``{"label": "Neutral", "score": 0.5}``.
	    """
	    # Cap the model input at 512 characters, the same limit the English and
	    # Urdu paths apply. The lexicon correction still sees the full text.
	    model_input = normalize_roman_urdu(text)[:512]

	    try:
	        # Get predictions from both Roman Urdu and Urdu models
	        ru_result = roman_urdu_model(model_input)[0]
	        ur_result = urdu_model(model_input)[0]

	        # Apply Roman Urdu corrections to both (label-normalised) results
	        ru_sentiment, ru_score = correct_roman_urdu_sentiment(
	            text, normalize_sentiment_label(ru_result["label"]), ru_result["score"]
	        )
	        ur_sentiment, ur_score = correct_roman_urdu_sentiment(
	            text, normalize_sentiment_label(ur_result["label"]), ur_result["score"]
	        )

	        # If both models agree after correction
	        if ru_sentiment == ur_sentiment:
	            return {"label": ru_sentiment, "score": max(ru_score, ur_score)}

	        # Weighted voting with higher weight for the Roman Urdu model
	        if ru_score * 1.6 > ur_score * 1.2:
	            return {"label": ru_sentiment, "score": ru_score}
	        return {"label": ur_sentiment, "score": ur_score}

	    except Exception as e:
	        print(f"Ensemble error: {e}")
	        # Fallback to Roman Urdu model with correction
	        try:
	            result = roman_urdu_model(model_input)[0]
	            corrected_sentiment, corrected_score = correct_roman_urdu_sentiment(
	                text, normalize_sentiment_label(result["label"]), result["score"]
	            )
	            return {"label": corrected_sentiment, "score": corrected_score}
	        except Exception as fallback_error:
	            # Best-effort default; the failure is logged rather than hidden,
	            # and KeyboardInterrupt/SystemExit are no longer swallowed.
	            print(f"Roman Urdu fallback error: {fallback_error}")
	            return {"label": "Neutral", "score": 0.5}

	# -----------------------------
	# Sentiment Analysis Core Functions
	# -----------------------------
	def normalize_sentiment_label(label):
	    """Map a model-specific label onto "Positive", "Negative" or "Neutral".

	    Matching is by case-insensitive substring; the positive check runs
	    first, and anything unrecognised is "Neutral".
	    """
	    lowered = str(label).lower()

	    # NOTE(review): "lab" also matches generic ids such as "LABEL_0", so every
	    # LABEL_n is reported as Positive - confirm the models' id2label mapping.
	    if "pos" in lowered or "lab" in lowered:
	        return "Positive"
	    if "neg" in lowered:
	        return "Negative"
	    return "Neutral"

	def get_strong_sentiment_words(text, language):
	    """Return up to five distinct strong sentiment words found in ``text``.

	    Args:
	        text: The analysed text.
	        language: "Roman Urdu" uses the Roman Urdu lexicon patterns, "Urdu"
	            a small Urdu word list (substring match), and any other value an
	            English word list (whole-word match).

	    Returns:
	        A list of at most five unique words in a stable order: order of
	        appearance for Roman Urdu, word-list order for Urdu and English.
	    """
	    text_lower = text.lower()

	    if language == "Roman Urdu":
	        # Use our Roman Urdu word databases
	        strong_words = (
	            roman_urdu_positive_pattern.findall(text_lower)
	            + roman_urdu_negative_pattern.findall(text_lower)
	        )
	    elif language == "Urdu":
	        # Urdu strong words (you can expand this list)
	        urdu_positive = ['زبردست', 'شاندار', 'عمدہ', 'بہترین', 'اچھا']
	        urdu_negative = ['خراب', 'برا', 'مایوس کن', 'بیکار']
	        strong_words = [word for word in urdu_positive + urdu_negative if word in text]
	    else:  # English
	        english_positive = ['excellent', 'outstanding', 'amazing', 'wonderful', 'perfect', 'great']
	        english_negative = ['terrible', 'awful', 'horrible', 'disappointing', 'poor', 'bad']
	        strong_words = [
	            word
	            for word in english_positive + english_negative
	            if re.search(r'\b' + re.escape(word) + r'\b', text_lower)
	        ]

	    # dict.fromkeys de-duplicates while keeping first-seen order, so both the
	    # order and the choice of the five kept words are reproducible.
	    return list(dict.fromkeys(strong_words))[:5]

	def generate_detailed_explanation(text, sentiment, score, language, strong_words):
	    """Build the human-readable explanation shown next to a result.

	    The message starts with a sentence chosen by sentiment and confidence
	    band (High >= 0.8, Medium >= 0.6, otherwise Low), then appends optional
	    Roman Urdu notes, the key sentiment words, and the score formatted to
	    three decimals.
	    """
	    if score >= 0.8:
	        band = "High"
	    elif score >= 0.6:
	        band = "Medium"
	    else:
	        band = "Low"

	    templates = {
	        "Positive": {
	            "High": "Strong positive sentiment with clear positive expressions.",
	            "Medium": "Moderately positive sentiment with favorable tone.",
	            "Low": "Slightly positive leaning with some positive indicators.",
	        },
	        "Negative": {
	            "High": "Strong negative sentiment with clear criticism.",
	            "Medium": "Moderately negative sentiment with critical tone.",
	            "Low": "Slightly negative leaning with some concerning indicators.",
	        },
	        "Neutral": {
	            "High": "Clearly neutral or factual statement.",
	            "Medium": "Mostly neutral with balanced perspective.",
	            "Low": "Weak sentiment leaning neutral.",
	        },
	    }

	    # Collect the fragments and join once at the end.
	    pieces = [templates[sentiment][band]]

	    # Language specific notes
	    if language == "Roman Urdu":
	        pieces.append(" Analyzed with Roman Urdu specific rules.")

	        # Extra note when a form of 'acha' was present and judged positive
	        mentions_acha = any(word in text.lower() for word in ['acha', 'achy', 'achay', 'achi'])
	        if mentions_acha and sentiment == "Positive":
	            pieces.append(" Words like 'acha' correctly identified as positive.")

	    # Strong words information
	    if strong_words:
	        pieces.append(f" Key sentiment words: {', '.join(strong_words)}.")

	    pieces.append(f" Confidence: {score:.3f}")

	    return "".join(pieces)

	# -----------------------------
	# Main Analysis Function
	# -----------------------------
	# CSV log of every analysis, plus the lock file guarding writes to it.
	SAVE_FILE = "sentiment_logs.csv"
	LOCK_FILE = f"{SAVE_FILE}.lock"

	# Seed an empty, header-only log on first start so the file always exists.
	if not os.path.exists(SAVE_FILE):
	    pd.DataFrame(
	        columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"],
	    ).to_csv(path_or_buf=SAVE_FILE, index=False, encoding="utf-8-sig")

	def analyze_sentiment_complete(text, lang_hint):
	    """Run the full pipeline for one input: detect, score, explain, log.

	    Args:
	        text: Raw user input. ``None`` or blank input is rejected with a
	            prompt instead of being analysed.
	        lang_hint: "Auto Detect" to detect the language, or a language name
	            that bypasses detection. Any value other than "English" or
	            "Urdu" is handled by the Roman Urdu ensemble.

	    Returns:
	        A 5-tuple ``(sentiment, confidence, explanation, log_file, strong_words)``.
	        On blank input the first element is a prompt; on failure it is
	        "Error" with the message in ``explanation``. ``log_file`` is the CSV
	        path, or ``None`` when the log does not exist, so the download
	        widget is never handed a missing file.
	    """
	    log_columns = ["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]

	    def existing_log_file():
	        # The log can be missing on the early-exit paths (e.g. removed
	        # externally); only offer a download when the file is really there.
	        return SAVE_FILE if os.path.exists(SAVE_FILE) else None

	    if not text or not text.strip():
	        return "⚠️ Please enter a sentence.", "", "", existing_log_file(), ""

	    # Detect language
	    language = lang_hint if lang_hint != "Auto Detect" else detect_language_advanced(text)

	    try:
	        # Perform sentiment analysis based on language; single-model paths
	        # cap the input at 512 characters.
	        if language == "English":
	            result = english_model(text[:512])[0]
	            sentiment = normalize_sentiment_label(result["label"])
	        elif language == "Urdu":
	            result = urdu_model(text[:512])[0]
	            sentiment = normalize_sentiment_label(result["label"])
	        else:  # Roman Urdu
	            result = ensemble_roman_urdu_sentiment(text)
	            sentiment = result["label"]  # already normalised by the ensemble
	        score = round(float(result["score"]), 3)

	        # Get strong words
	        strong_words = get_strong_sentiment_words(text, language)
	        strong_words_str = ", ".join(strong_words) if strong_words else "None"

	        # Generate explanation
	        explanation = generate_detailed_explanation(text, sentiment, score, language, strong_words)

	        # Save to CSV; the lock serialises the read-modify-write cycle
	        with FileLock(LOCK_FILE):
	            if os.path.exists(SAVE_FILE):
	                df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
	            else:
	                df = pd.DataFrame(columns=log_columns)
	            new_row = pd.DataFrame(
	                [[text, language, sentiment, score, strong_words_str, pd.Timestamp.now()]],
	                columns=log_columns,
	            )
	            df = pd.concat([df, new_row], ignore_index=True)
	            df.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")

	        return sentiment, str(score), explanation, SAVE_FILE, strong_words_str

	    except Exception as e:
	        # UI boundary: report the failure in the explanation box, not a crash.
	        error_msg = f"Analysis error: {str(e)}"
	        return "Error", "0", error_msg, existing_log_file(), ""

	# -----------------------------
	# Gradio Interface
	# -----------------------------
	def show_logs():
	    """Return the last 20 logged analyses, or an empty table if no log exists."""
	    if not os.path.exists(SAVE_FILE):
	        return pd.DataFrame(
	            columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]
	        )

	    history = pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
	    return history.tail(20)

	def clear_logs():
	    """Erase the analysis history and return an empty history table.

	    The log is reset to a header-only CSV rather than deleted, so the file
	    keeps existing for the download widget. The reset holds the same lock
	    as the writer, so a concurrent analysis cannot write the old rows back.

	    Returns:
	        An empty DataFrame with the log's column headers.
	    """
	    empty_log = pd.DataFrame(
	        columns=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]
	    )
	    with FileLock(LOCK_FILE):
	        empty_log.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")
	    return empty_log

	# Page layout: a header, an input/results row, then a history/help row.
	with gr.Blocks(title="Multilingual Sentiment Analysis") as demo:
	    gr.Markdown("""
	# 🌍 Advanced Multilingual Sentiment Analysis
	English • Urdu • Roman Urdu

	Uses transformer models for accurate language detection and sentiment analysis with specialized Roman Urdu handling.

	Used models:
	- English: siebert/sentiment-roberta-large-english
	- Urdu: tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu
	- Roman Urdu: tahamueed23/roman-urdu-sentiment
	- Language detection: papluca/xlm-roberta-base-language-detection
	""")

	    # --- Top row: input on the left, results on the right ---
	    with gr.Row():
	        # Left column: text entry, language choice and action buttons
	        with gr.Column(scale=1):
	            gr.Markdown("### 📥 Input Section")
	            user_text = gr.Textbox(
	                label="✍️ Enter Text",
	                placeholder="Type in English, Urdu, or Roman Urdu...",
	                lines=3,
	            )
	            lang_dropdown = gr.Dropdown(
	                choices=["Auto Detect", "English", "Urdu", "Roman Urdu"],
	                value="Auto Detect",
	                label="🌐 Language Selection",
	            )

	            with gr.Row():
	                btn_analyze = gr.Button("🔍 Analyze Sentiment", variant="primary")
	                btn_show = gr.Button("📂 Show Logs")
	                btn_clear = gr.Button("🗑️ Clear Logs")

	        # Right column: sentiment, confidence, strong words, download, explanation
	        with gr.Column(scale=1):
	            gr.Markdown("### 📊 Results")
	            with gr.Row():
	                with gr.Column():
	                    out_sent = gr.Textbox(label="🎭 Sentiment")
	                    out_conf = gr.Textbox(label="📊 Confidence Score")
	                with gr.Column():
	                    out_strong = gr.Textbox(label="💪 Strong Words")
	                    out_file = gr.File(label="⬇️ Download Logs")

	            out_exp = gr.Textbox(label="💡 Detailed Explanation", lines=3)

	    # --- Bottom row: history table (3/4 width) and usage notes (1/4 width) ---
	    with gr.Row():
	        with gr.Column(scale=3):
	            gr.Markdown("### 📋 Analysis History")
	            logs_df = gr.Dataframe(
	                headers=["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"],
	                label="",
	                interactive=False,
	                wrap=True,
	            )
	        with gr.Column(scale=1):
	            gr.Markdown("### ℹ️ Information")
	            gr.Markdown("""
	How to use:
	1. Enter text in any supported language
	2. Select language or use Auto Detect
	3. Click Analyze Sentiment
	4. View results and history

	Supported Languages:
	- English
	- Urdu (Script)
	- Roman Urdu (Latin script)

	Note: Auto Detect works best with clear text samples.
	""")

	    # --- Event wiring ---
	    # Output order must match the 5-tuple returned by analyze_sentiment_complete.
	    btn_analyze.click(
	        fn=analyze_sentiment_complete,
	        inputs=[user_text, lang_dropdown],
	        outputs=[out_sent, out_conf, out_exp, out_file, out_strong],
	    )
	    btn_show.click(fn=show_logs, outputs=[logs_df])
	    btn_clear.click(fn=clear_logs, outputs=[logs_df])

	# Start the web server only when run as a script; no public share link.
	if __name__ == "__main__":
	    demo.launch(share=False)