# 1. INSTALL LIBRARIES
# Make sure to run this in its own cell first if you haven't already
# !pip install gradio transformers torch torchaudio librosa soundfile datasets accelerate

# 2. IMPORT EVERYTHING
import gradio as gr
import json
from transformers import pipeline
import warnings

# Suppress harmless warnings
warnings.filterwarnings("ignore")

print("Setting up the analysis pipelines... (This may take a moment)")

# 3. LOAD THE MODELS (This is our backend logic from before)
try:
    # Speech Recognition pipeline
    speech_recognizer = pipeline("automatic-speech-recognition", model="openai/whisper-base")
    print("✅ Speech recognizer loaded.")

    # Audio Classification pipeline
    sound_classifier = pipeline("audio-classification", model="MIT/ast-finetuned-audioset-10-10-0.4593")
    print("✅ Sound classifier loaded.")
except Exception as e:
    print(f"❌ Error loading models: {e}")

# 4. DEFINE THE CORE ANALYSIS FUNCTION
def analyze_audio_holistically(audio_path):
    """
    Takes an audio file path and returns a holistic analysis dictionary.
    """
    if audio_path is None:
        return {"Error": "No audio file provided. Please upload a file."}

    print(f"Analysing audio file: {audio_path}...")
    try:
        # Get transcription
        transcription_result = speech_recognizer(audio_path)
        transcription = transcription_result['text'].strip()

        # Get acoustic events
        acoustic_results = sound_classifier(audio_path, top_k=3)
        detected_sounds = {item['label']: round(item['score'], 2) for item in acoustic_results}

        # Fuse the results
        holistic_understanding = {
            "Transcribed Text": transcription,
            "Detected Sounds": detected_sounds,
            "Insight": f"The model detected speech saying '{transcription}' in an environment with sounds like: {', '.join(detected_sounds.keys())}."
        }
        return holistic_understanding
    except Exception as e:
        return {"Error": f"Could not process the audio file. Details: {str(e)}"}

# 5. CREATE AND LAUNCH THE GRADIO INTERFACE
print("🚀 Launching Gradio Web Demo...")

iface = gr.Interface(
    fn=analyze_audio_holistically,
    inputs=gr.Audio(type="filepath", label="Upload your Audio File (.wav, .mp3)"),
    outputs=gr.JSON(label="Holistic Analysis Result"),
    title="🔊 Audio Language Model (ALM) Demo By Algo Rangers",
    description="""
    This demo showcases a holistic Audio Language Model. Upload an audio file to see a combined
    analysis of both spoken words (speech) and background sounds (non-speech).
    Built by combining OpenAI's Whisper and MIT's AST model.
    """,
    # Make sure you have a file named "example.wav" in the same directory.
    examples=[["example.wav"]]
)

# This will create the web UI
iface.launch(debug=True)
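
# 6. OPTIONAL: QUICK SANITY CHECK WITHOUT THE UI
# A minimal sketch for testing the backend function directly before (or instead of)
# launching Gradio. It assumes an "example.wav" file exists in the working directory,
# as referenced in the examples above. Uncomment to run.
# result = analyze_audio_holistically("example.wav")
# print(json.dumps(result, indent=2, ensure_ascii=False))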