# 1. INSTALL LIBRARIES
# Make sure to run this in its own cell first if you haven't already
# !pip install gradio transformers torch torchaudio librosa soundfile datasets accelerate

# 2. IMPORT EVERYTHING
import gradio as gr
import json
from transformers import pipeline
import warnings

# Suppress harmless warnings
warnings.filterwarnings("ignore")

print("Setting up the analysis pipelines... (This may take a moment)")

# 3. LOAD THE MODELS (This is our backend logic from before)
try:
    # Speech Recognition pipeline
    speech_recognizer = pipeline("automatic-speech-recognition", model="openai/whisper-base")
    print("✅ Speech recognizer loaded.")

    # Audio Classification pipeline
    sound_classifier = pipeline("audio-classification", model="MIT/ast-finetuned-audioset-10-10-0.4593")
    print("✅ Sound classifier loaded.")
except Exception as e:
    print(f"❌ Error loading models: {e}")

# 4. DEFINE THE CORE ANALYSIS FUNCTION
def analyze_audio_holistically(audio_path):
    """
    Takes an audio file path and returns a holistic analysis dictionary.
    """
    if audio_path is None:
        return {"Error": "No audio file provided. Please upload a file."}

    print(f"Analysing audio file: {audio_path}...")
    try:
        # Get transcription
        transcription_result = speech_recognizer(audio_path)
        transcription = transcription_result['text'].strip()

        # Get acoustic events
        acoustic_results = sound_classifier(audio_path, top_k=3)
        detected_sounds = {item['label']: round(item['score'], 2) for item in acoustic_results}

        # Fuse the results
        holistic_understanding = {
            "Transcribed Text": transcription,
            "Detected Sounds": detected_sounds,
            "Insight": f"The model detected speech saying '{transcription}' in an environment with sounds like: {', '.join(detected_sounds.keys())}."
        }
        return holistic_understanding
    except Exception as e:
        return {"Error": f"Could not process the audio file. Details: {str(e)}"}

# 5. CREATE AND LAUNCH THE GRADIO INTERFACE
print("🚀 Launching Gradio Web Demo...")

iface = gr.Interface(
    fn=analyze_audio_holistically,
    inputs=gr.Audio(type="filepath", label="Upload your Audio File (.wav, .mp3)"),
    outputs=gr.JSON(label="Holistic Analysis Result"),
    title="🔊 Audio Language Model (ALM) Demo By Algo Rangers",
    description="""
    This demo showcases a holistic Audio Language Model. Upload an audio file to see a combined
    analysis of both spoken words (speech) and background sounds (non-speech).
    Built by combining OpenAI's Whisper and MIT's AST model.
    """,
    # Make sure you have a file named "example.wav" in the same directory.
    examples=[["example.wav"]]
)

# This will create the web UI
iface.launch(debug=True)
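
# 6. OPTIONAL: QUICK SANITY CHECK WITHOUT THE UI
# A minimal sketch for testing the backend function directly before (or instead of)
# launching Gradio. It assumes an "example.wav" file exists in the working directory,
# as referenced in the examples above. Uncomment to run.
# result = analyze_audio_holistically("example.wav")
# print(json.dumps(result, indent=2, ensure_ascii=False))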