# 1. INSTALL LIBRARIES
# Make sure to run this in its own cell first if you haven't already
# !pip install gradio transformers torch torchaudio librosa soundfile datasets accelerate

# 2. IMPORT EVERYTHING
import gradio as gr
import json
from transformers import pipeline
import warnings

# Suppress harmless warnings
warnings.filterwarnings("ignore")

print("Setting up the analysis pipelines... (This may take a moment)")
# 3. LOAD THE MODELS (This is our backend logic from before)
try:
    # Speech Recognition pipeline
    speech_recognizer = pipeline("automatic-speech-recognition", model="openai/whisper-base")
    print("Speech recognizer loaded.")

    # Audio Classification pipeline
    sound_classifier = pipeline("audio-classification", model="MIT/ast-finetuned-audioset-10-10-0.4593")
    print("Sound classifier loaded.")
except Exception as e:
    print(f"Error loading models: {e}")
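# Note: by default these pipelines run on the CPU. If you happen to have a CUDA
# GPU available (an assumption, not a requirement), you could pass device=0 to
# each pipeline(...) call above to speed up inference.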
# 4. DEFINE THE CORE ANALYSIS FUNCTION
def analyze_audio_holistically(audio_path):
    """
    Takes an audio file path and returns a holistic analysis dictionary.
    """
    if audio_path is None:
        return {"Error": "No audio file provided. Please upload a file."}

    print(f"Analysing audio file: {audio_path}...")
    try:
        # Get transcription
        transcription_result = speech_recognizer(audio_path)
        transcription = transcription_result['text'].strip()

        # Get acoustic events
        acoustic_results = sound_classifier(audio_path, top_k=3)
        detected_sounds = {item['label']: round(item['score'], 2) for item in acoustic_results}

        # Fuse the results
        holistic_understanding = {
            "Transcribed Text": transcription,
            "Detected Sounds": detected_sounds,
            "Insight": f"The model detected speech saying '{transcription}' in an environment with sounds like: {', '.join(detected_sounds.keys())}."
        }
        return holistic_understanding
    except Exception as e:
        return {"Error": f"Could not process the audio file. Details: {str(e)}"}
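# (Optional) Quick sanity check before launching the UI -- a minimal sketch that
# assumes you have a local audio file such as "example.wav" (the same file used
# for the Gradio examples below). Uncomment to run:
# result = analyze_audio_holistically("example.wav")
# print(json.dumps(result, indent=2))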
# 5. CREATE AND LAUNCH THE GRADIO INTERFACE
print("Launching Gradio Web Demo...")

iface = gr.Interface(
    fn=analyze_audio_holistically,
    inputs=gr.Audio(type="filepath", label="Upload your Audio File (.wav, .mp3)"),
    outputs=gr.JSON(label="Holistic Analysis Result"),
    title="Audio Language Model (ALM) Demo by Algo Rangers",
    description="""
    This demo showcases a holistic Audio Language Model.
    Upload an audio file to see a combined analysis of both spoken words (speech) and background sounds (non-speech).
    Built by combining OpenAI's Whisper and MIT's AST model.
    """,
    # Make sure you have a file named "example.wav" in the same directory.
    examples=[["example.wav"]]
)

# This will create the web UI
iface.launch(debug=True)
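# Tip: when running in a hosted notebook (e.g. Colab), you can also pass
# share=True to launch() to get a temporary public link:
# iface.launch(debug=True, share=True)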