import gradio as gr
from transformers import pipeline
import pandas as pd
import os
import re
from filelock import FileLock

# -----------------------------
# Load Models with Error Handling
# -----------------------------
try:
    # English sentiment model
    english_model = pipeline(
        "sentiment-analysis",
        model="siebert/sentiment-roberta-large-english"
    )

    # Urdu sentiment model
    urdu_model = pipeline(
        "sentiment-analysis",
        model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
    )

    # Roman Urdu sentiment model
    roman_urdu_model = pipeline(
        "sentiment-analysis",
        model="tahamueed23/roman-urdu-sentiment"
    )

    # Language detection model
    lang_detector = pipeline(
        "text-classification",
        model="papluca/xlm-roberta-base-language-detection"
    )

    print("✅ All models loaded successfully!")
except Exception as e:
    print(f"❌ Error loading models: {e}")
    raise

# -----------------------------
# Roman Urdu Word Databases
# -----------------------------
ROMAN_URDU_POSITIVE_WORDS = {
    'acha', 'achy', 'achay', 'achi', 'behtar', 'zabardast', 'shandaar', 'umdah',
    'umda', 'behtareen', 'kamaal', 'lajawab', 'mazedar', 'khush', 'khushi',
    'pasand', 'pasandida', 'pyaara', 'pyaari', 'dilchasp', 'mufeed', 'pursukoon',
    'roshan', 'saaf', 'suthri', 'tareef', 'targheeb', 'madadgar', 'dostana',
    'jawab', 'khoob', 'khoobsurat', 'heran', 'mast', 'rangeen', 'sundar',
    'sohna', 'sohni', 'pyara', 'pyari', 'meetha', 'meethi', 'mitha', 'mithi',
    'azhar', 'accha', 'acchi', 'acche'
}

ROMAN_URDU_NEGATIVE_WORDS = {
    'kharab', 'bura', 'ganda', 'sust', 'kamzor', 'mushkil', 'naqis',
    'namukammal', 'mayus', 'nakara', 'bekaar', 'bemisi', 'bepanah', 'beparwah',
    'behos', 'bekhauf', 'bekhudi', 'bekhabar', 'bekasoor', 'bekar', 'bemari',
    'bezaar', 'badsurat', 'badtameez', 'kameena', 'nalaiq', 'ghatiya', 'bakwas',
    'bewakoof', 'ahmaq', 'murda', 'zaleel', 'kambakht', 'laanat', 'harami',
    'bad', 'worst', 'waste', 'rubbish'
}

ROMAN_URDU_NEUTRAL_WORDS = {
    'hai', 'hain', 'tha', 'thi', 'ho', 'hun', 'hein', 'main', 'tum', 'wo',
    'ye', 'unhon', 'inhon', 'sath', 'lekin', 'kyun', 'jaisa', 'waisa', 'jese',
    'wese', 'phir', 'ab', 'toh', 'ka', 'ki', 'ke', 'ko', 'se', 'mein', 'par',
    'aur', 'ya', 'kya', 'kuch', 'sab', 'apna'
}

# Compile regex patterns once for faster matching
roman_urdu_positive_pattern = re.compile(r'\b(' + '|'.join(ROMAN_URDU_POSITIVE_WORDS) + r')\b', re.IGNORECASE)
roman_urdu_negative_pattern = re.compile(r'\b(' + '|'.join(ROMAN_URDU_NEGATIVE_WORDS) + r')\b', re.IGNORECASE)

# -----------------------------
# Enhanced Language Detection
# -----------------------------
def detect_language_advanced(text):
    """Advanced language detection using model + rules"""
    if not text.strip():
        return "English"

    text_clean = text.strip()

    # Step 1: Urdu script detection (most reliable)
    if re.search(r'[\u0600-\u06FF]', text_clean):
        return "Urdu"

    # Step 2: Use the transformer model for language detection
    try:
        # Truncate very long texts to avoid model limits
        truncated_text = text_clean[:250]
        lang_result = lang_detector(truncated_text)[0]
        lang_label = lang_result['label'].upper()

        # Map model outputs to our language categories
        lang_map = {
            'UR': 'Urdu',
            'EN': 'English',
            'HI': 'English',  # Hindi predictions are often Roman Urdu in Latin script
        }

        detected_lang = lang_map.get(lang_label, 'English')

        # Step 3: For Urdu/English detections, apply Roman Urdu rules
        if detected_lang in ['Urdu', 'English']:
            if is_likely_roman_urdu(text_clean):
                return "Roman Urdu"

        return detected_lang

    except Exception as e:
        print(f"Language detection model error: {e}")
        # Fallback to rule-based detection
        return detect_language_fallback(text_clean)
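# A quick sketch of how the detection cascade behaves (hypothetical inputs;
# results assume the models above loaded successfully):
#
#   detect_language_advanced("یہ بہت اچھا ہے")  # -> "Urdu" (script rule, no model call)
#   detect_language_advanced("ye acha hai")      # -> "Roman Urdu" (word-ratio rule overrides the model)
#   detect_language_advanced("This is great")    # -> "English" (model prediction, no Roman Urdu signals)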
def is_likely_roman_urdu(text):
    """Check if text is likely Roman Urdu using comprehensive rules"""
    text_lower = text.lower()

    # Count Roman Urdu specific words
    positive_hits = len(roman_urdu_positive_pattern.findall(text_lower))
    negative_hits = len(roman_urdu_negative_pattern.findall(text_lower))
    total_hits = positive_hits + negative_hits

    # Count total words
    words = re.findall(r'\b\w+\b', text_lower)
    total_words = len(words)

    if total_words == 0:
        return False

    # Rule 1: High percentage of Roman Urdu words
    roman_urdu_ratio = total_hits / total_words
    if roman_urdu_ratio > 0.3:  # 30% threshold
        return True

    # Rule 2: Specific Roman Urdu sentence structures
    roman_urdu_patterns = [
        r"^[a-z ]*(hai|hain|tha|thi|ho|hun|hein)[\s\.\!]*$",
        r"^[a-z ]*(main|tum|wo|ye|unhon|inhon)[a-z ]*(hun|hein|ho|hai)[a-z ]*$",
        r"^[a-z ]*(acha|bura|kharab|behtar|zabardast)[a-z ]*(hai|hain|tha)[a-z ]*$",
        r"^[a-z ]*(kyun|kese|kaise|kisne|kisliye)[a-z ]*\?$",
        r"^[a-z ]*(bohat|bahut|zyada|zyda)[a-z ]+(acha|bura|kharab|behtar)"
    ]

    for pattern in roman_urdu_patterns:
        if re.search(pattern, text_lower):
            return True

    # Rule 3: Presence of key Roman Urdu function words in short texts
    function_words = ['hai', 'hain', 'tha', 'thi', 'ka', 'ki', 'ke', 'ko', 'se', 'ne']
    function_word_count = sum(1 for word in words if word in function_words)

    if function_word_count >= 2 and total_words <= 8:
        return True

    return False


def detect_language_fallback(text):
    """Rule-based fallback language detection"""
    # Urdu script check
    if re.search(r'[\u0600-\u06FF]', text):
        return "Urdu"

    # Roman Urdu detection
    if is_likely_roman_urdu(text):
        return "Roman Urdu"

    return "English"


# -----------------------------
# Roman Urdu Text Processing
# -----------------------------
def normalize_roman_urdu(text):
    """Normalize Roman Urdu text variations"""
    text = text.lower().strip()

    # Common Roman Urdu spelling variations
    variations = {
        r'\bhy\b': 'hai', r'\bh\b': 'hai', r'\bhe\b': 'hai',
        r'\bnhi\b': 'nahi', r'\bnai\b': 'nahi', r'\bna\b': 'nahi',
        r'\bboht\b': 'bohot', r'\bbhot\b': 'bohot', r'\bbahut\b': 'bohot',
        r'\bzada\b': 'zyada', r'\bzyda\b': 'zyada',
        r'\bacchi\b': 'achi', r'\bacche\b': 'achay',
        r'\bthy\b': 'thay',
        r'\bmje\b': 'mujhe', r'\btuje\b': 'tujhe',
        r'\busi\b': 'ussi', r'\besi\b': 'essi',
        r'\bme\b': 'main', r'\bmai\b': 'main', r'\btu\b': 'tum',
        r'\buss\b': 'us', r'\biss\b': 'is'
    }

    for pattern, replacement in variations.items():
        text = re.sub(pattern, replacement, text)

    return text
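# Example of what the normalizer does with common shorthand (hypothetical input):
#
#   normalize_roman_urdu("Mje ye boht achi lagi hy")
#   # -> "mujhe ye bohot achi lagi hai"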
# -----------------------------
# Roman Urdu Sentiment Correction
# -----------------------------
def correct_roman_urdu_sentiment(text, current_sentiment, current_score):
    """Apply Roman Urdu specific sentiment corrections"""
    normalized_text = normalize_roman_urdu(text.lower())

    # Count positive and negative words
    positive_count = len(roman_urdu_positive_pattern.findall(normalized_text))
    negative_count = len(roman_urdu_negative_pattern.findall(normalized_text))

    # Strong sentiment indicators
    strong_positive_indicators = ['acha', 'achy', 'achay', 'achi', 'zabardast', 'shandaar', 'kamaal']
    strong_negative_indicators = ['kharab', 'bura', 'ganda', 'bekaar', 'badtameez']

    # Rule 1: If text contains strong positive words but the model says
    # negative (or vice versa), correct it
    has_strong_positive = any(indicator in normalized_text for indicator in strong_positive_indicators)
    has_strong_negative = any(indicator in normalized_text for indicator in strong_negative_indicators)

    if has_strong_positive and current_sentiment == "Negative":
        return "Positive", max(current_score, 0.85)
    if has_strong_negative and current_sentiment == "Positive":
        return "Negative", max(current_score, 0.85)

    # Rule 2: Word-count-based correction
    if positive_count > negative_count and current_sentiment == "Negative":
        new_score = min(0.8 + (positive_count * 0.05), 0.95)
        return "Positive", new_score
    if negative_count > positive_count and current_sentiment == "Positive":
        new_score = min(0.8 + (negative_count * 0.05), 0.95)
        return "Negative", new_score

    # Rule 3: Mixed sentiments with a clear majority
    total_sentiment_words = positive_count + negative_count
    if total_sentiment_words >= 2:
        positive_ratio = positive_count / total_sentiment_words
        if positive_ratio >= 0.7 and current_sentiment != "Positive":
            return "Positive", 0.8
        elif positive_ratio <= 0.3 and current_sentiment != "Negative":
            return "Negative", 0.8

    return current_sentiment, current_score


# -----------------------------
# Enhanced Ensemble for Roman Urdu
# -----------------------------
def ensemble_roman_urdu_sentiment(text):
    """Advanced ensemble method for Roman Urdu sentiment analysis"""
    normalized_text = normalize_roman_urdu(text)

    try:
        # Get predictions from both the Roman Urdu and the Urdu model
        ru_result = roman_urdu_model(normalized_text)[0]
        ur_result = urdu_model(normalized_text)[0]

        # Normalize labels
        ru_sentiment = normalize_sentiment_label(ru_result["label"])
        ur_sentiment = normalize_sentiment_label(ur_result["label"])
        ru_score = ru_result["score"]
        ur_score = ur_result["score"]

        # Apply Roman Urdu corrections to both results
        ru_sentiment_corrected, ru_score_corrected = correct_roman_urdu_sentiment(text, ru_sentiment, ru_score)
        ur_sentiment_corrected, ur_score_corrected = correct_roman_urdu_sentiment(text, ur_sentiment, ur_score)

        # If both models agree after correction, take the higher confidence
        if ru_sentiment_corrected == ur_sentiment_corrected:
            final_score = max(ru_score_corrected, ur_score_corrected)
            return {"label": ru_sentiment_corrected, "score": final_score}

        # Weighted voting, with a higher weight for the Roman Urdu model
        ru_weight = ru_score_corrected * 1.6
        ur_weight = ur_score_corrected * 1.2

        if ru_weight > ur_weight:
            return {"label": ru_sentiment_corrected, "score": ru_score_corrected}
        else:
            return {"label": ur_sentiment_corrected, "score": ur_score_corrected}

    except Exception as e:
        print(f"Ensemble error: {e}")
        # Fallback to the Roman Urdu model with correction
        try:
            result = roman_urdu_model(normalized_text)[0]
            corrected_sentiment, corrected_score = correct_roman_urdu_sentiment(
                text, normalize_sentiment_label(result["label"]), result["score"]
            )
            return {"label": corrected_sentiment, "score": corrected_score}
        except Exception:
            return {"label": "Neutral", "score": 0.5}


# -----------------------------
# Sentiment Analysis Core Functions
# -----------------------------
def normalize_sentiment_label(label):
    """Normalize sentiment labels from different models"""
    label = str(label).lower()
    # 'lab' maps raw 'LABEL_*' outputs to Positive
    if any(word in label for word in ["pos", "lab"]):
        return "Positive"
    elif "neg" in label:
        return "Negative"
    else:
        return "Neutral"
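# Hypothetical sanity checks for the two helpers above:
#
#   correct_roman_urdu_sentiment("ye acha hai", "Negative", 0.7)
#   # -> ("Positive", 0.85): Rule 1 flips a model prediction that contradicts
#   #    a strong positive indicator ('acha')
#
#   normalize_sentiment_label("LABEL_1")   # -> "Positive" (via the 'lab' catch-all)
#   normalize_sentiment_label("negative")  # -> "Negative"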
def get_strong_sentiment_words(text, language):
    """Extract strong sentiment-bearing words"""
    text_lower = text.lower()
    strong_words = []

    if language == "Roman Urdu":
        # Use the Roman Urdu word databases
        positive_matches = roman_urdu_positive_pattern.findall(text_lower)
        negative_matches = roman_urdu_negative_pattern.findall(text_lower)
        strong_words = positive_matches + negative_matches

    elif language == "Urdu":
        # Urdu strong words (this list can be expanded)
        urdu_positive = ['زبردست', 'شاندار', 'عمدہ', 'بہترین', 'اچھا']
        urdu_negative = ['خراب', 'برا', 'مایوس کن', 'بیکار']

        for word in urdu_positive + urdu_negative:
            if word in text:
                strong_words.append(word)

    else:  # English
        english_positive = ['excellent', 'outstanding', 'amazing', 'wonderful', 'perfect', 'great']
        english_negative = ['terrible', 'awful', 'horrible', 'disappointing', 'poor', 'bad']

        for word in english_positive + english_negative:
            if re.search(r'\b' + re.escape(word) + r'\b', text_lower):
                strong_words.append(word)

    return list(set(strong_words))[:5]  # Unique words, at most 5


def generate_detailed_explanation(text, sentiment, score, language, strong_words):
    """Generate a detailed explanation for the sentiment analysis"""
    confidence_level = "High" if score >= 0.8 else "Medium" if score >= 0.6 else "Low"

    base_explanations = {
        "Positive": {
            "High": "Strong positive sentiment with clear positive expressions.",
            "Medium": "Moderately positive sentiment with favorable tone.",
            "Low": "Slightly positive leaning with some positive indicators."
        },
        "Negative": {
            "High": "Strong negative sentiment with clear criticism.",
            "Medium": "Moderately negative sentiment with critical tone.",
            "Low": "Slightly negative leaning with some concerning indicators."
        },
        "Neutral": {
            "High": "Clearly neutral or factual statement.",
            "Medium": "Mostly neutral with balanced perspective.",
            "Low": "Weak sentiment leaning neutral."
        }
    }

    explanation = base_explanations[sentiment][confidence_level]

    # Add language-specific notes
    if language == "Roman Urdu":
        explanation += " Analyzed with Roman Urdu specific rules."

        # Special note for common corrections
        if any(word in text.lower() for word in ['acha', 'achy', 'achay', 'achi']):
            if sentiment == "Positive":
                explanation += " Words like 'acha' correctly identified as positive."

    # Add strong-words information
    if strong_words:
        explanation += f" Key sentiment words: {', '.join(strong_words)}."

    explanation += f" Confidence: {score:.3f}"

    return explanation
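# Continuing the hypothetical case above, the generated explanation would read
# (shown wrapped; the actual return value is a single string):
#
#   generate_detailed_explanation("ye acha hai", "Positive", 0.85, "Roman Urdu", ["acha"])
#   # -> "Strong positive sentiment with clear positive expressions.
#   #     Analyzed with Roman Urdu specific rules. Words like 'acha' correctly
#   #     identified as positive. Key sentiment words: acha. Confidence: 0.850"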
# -----------------------------
# Main Analysis Function
# -----------------------------
SAVE_FILE = "sentiment_logs.csv"
LOCK_FILE = SAVE_FILE + ".lock"

LOG_COLUMNS = ["Sentence", "Language", "Sentiment", "Confidence", "Strong_Words", "Timestamp"]

if not os.path.exists(SAVE_FILE):
    pd.DataFrame(columns=LOG_COLUMNS).to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")


def analyze_sentiment_complete(text, lang_hint):
    """Complete sentiment analysis pipeline"""
    if not text.strip():
        return "⚠️ Please enter a sentence.", "", "", SAVE_FILE, ""

    # Detect language
    language = lang_hint if lang_hint != "Auto Detect" else detect_language_advanced(text)

    try:
        # Perform sentiment analysis based on language
        # (text[:512] is a rough character-level guard against over-long inputs)
        if language == "English":
            result = english_model(text[:512])[0]
            sentiment = normalize_sentiment_label(result["label"])
            score = round(float(result["score"]), 3)
        elif language == "Urdu":
            result = urdu_model(text[:512])[0]
            sentiment = normalize_sentiment_label(result["label"])
            score = round(float(result["score"]), 3)
        else:  # Roman Urdu
            result = ensemble_roman_urdu_sentiment(text)
            sentiment = result["label"]
            score = round(float(result["score"]), 3)

        # Get strong words
        strong_words = get_strong_sentiment_words(text, language)
        strong_words_str = ", ".join(strong_words) if strong_words else "None"

        # Generate explanation
        explanation = generate_detailed_explanation(text, sentiment, score, language, strong_words)

        # Save to CSV under a file lock so concurrent requests don't clobber the log
        with FileLock(LOCK_FILE):
            df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig") if os.path.exists(SAVE_FILE) else pd.DataFrame(columns=LOG_COLUMNS)
            new_row = pd.DataFrame(
                [[text, language, sentiment, score, strong_words_str, pd.Timestamp.now()]],
                columns=LOG_COLUMNS
            )
            df = pd.concat([df, new_row], ignore_index=True)
            df.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")

        return sentiment, str(score), explanation, SAVE_FILE, strong_words_str

    except Exception as e:
        error_msg = f"Analysis error: {str(e)}"
        return "Error", "0", error_msg, SAVE_FILE, ""


# -----------------------------
# Gradio Interface
# -----------------------------
def show_logs():
    """Return the 20 most recent log entries."""
    if os.path.exists(SAVE_FILE):
        df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
        return df.tail(20)
    return pd.DataFrame(columns=LOG_COLUMNS)


def clear_logs():
    """Delete the log file and return an empty history table."""
    if os.path.exists(SAVE_FILE):
        os.remove(SAVE_FILE)
    return pd.DataFrame(columns=LOG_COLUMNS)
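# The pipeline can also be exercised without the UI, e.g. from a REPL
# (hypothetical call; one row is appended to sentiment_logs.csv):
#
#   sentiment, confidence, explanation, csv_path, strong_words = \
#       analyze_sentiment_complete("ye acha hai", "Auto Detect")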
with gr.Blocks(title="Multilingual Sentiment Analysis") as demo:
    gr.Markdown("""
    # 🌍 Advanced Multilingual Sentiment Analysis
    **English • Urdu • Roman Urdu**

    Uses transformer models for accurate language detection and sentiment
    analysis, with specialized Roman Urdu handling.

    **Models used:**
    - English: siebert/sentiment-roberta-large-english
    - Urdu: tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu
    - Roman Urdu: tahamueed23/roman-urdu-sentiment
    - Language detection: papluca/xlm-roberta-base-language-detection
    """)

    # Top row with two columns
    with gr.Row():
        # Left column: input section
        with gr.Column(scale=1):
            gr.Markdown("### 📥 Input Section")
            user_text = gr.Textbox(
                label="✍️ Enter Text",
                placeholder="Type in English, Urdu, or Roman Urdu...",
                lines=3
            )
            lang_dropdown = gr.Dropdown(
                ["Auto Detect", "English", "Urdu", "Roman Urdu"],
                value="Auto Detect",
                label="🌐 Language Selection"
            )

            with gr.Row():
                btn_analyze = gr.Button("🔍 Analyze Sentiment", variant="primary")
                btn_show = gr.Button("📂 Show Logs")
                btn_clear = gr.Button("🗑️ Clear Logs")

        # Right column: results section
        with gr.Column(scale=1):
            gr.Markdown("### 📊 Results")
            with gr.Row():
                with gr.Column():
                    out_sent = gr.Textbox(label="🎭 Sentiment")
                    out_conf = gr.Textbox(label="📊 Confidence Score")
                with gr.Column():
                    out_strong = gr.Textbox(label="💪 Strong Words")
                    out_file = gr.File(label="⬇️ Download Logs")
            out_exp = gr.Textbox(label="💡 Detailed Explanation", lines=3)

    # Bottom row, with the analysis history taking most of the space
    with gr.Row():
        with gr.Column(scale=3):  # ~75% of the width
            gr.Markdown("### 📋 Analysis History")
            logs_df = gr.Dataframe(
                headers=LOG_COLUMNS,
                label="",
                interactive=False,
                wrap=True
            )
        with gr.Column(scale=1):  # ~25% of the width
            gr.Markdown("### ℹ️ Information")
            gr.Markdown("""
            **How to use:**
            1. Enter text in any supported language
            2. Select a language or use Auto Detect
            3. Click Analyze Sentiment
            4. View the results and history

            **Supported languages:**
            - English
            - Urdu (Arabic script)
            - Roman Urdu (Latin script)

            **Note:** Auto Detect works best with clear text samples.
            """)

    # Event handlers
    btn_analyze.click(
        analyze_sentiment_complete,
        inputs=[user_text, lang_dropdown],
        outputs=[out_sent, out_conf, out_exp, out_file, out_strong]
    )
    btn_show.click(show_logs, outputs=[logs_df])
    btn_clear.click(clear_logs, outputs=[logs_df])


if __name__ == "__main__":
    demo.launch(share=False)
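# To try it locally, save this file (e.g. as app.py) and run:
#
#   python app.py
#
# Gradio serves the UI on http://127.0.0.1:7860 by default; pass share=True
# to demo.launch() for a temporary public link.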