# 1. INSTALL LIBRARIES
# Make sure to run this in its own cell first if you haven't already
# !pip install gradio transformers torch torchaudio librosa soundfile datasets accelerate
# 2. IMPORT EVERYTHING
import gradio as gr
from transformers import pipeline
import warnings
# Suppress harmless warnings
warnings.filterwarnings("ignore")
print("Setting up the analysis pipelines... (This may take a moment)")
# 3. LOAD THE MODELS (This is our backend logic from before)
try:
    # Speech Recognition pipeline
    speech_recognizer = pipeline("automatic-speech-recognition", model="openai/whisper-base")
    print("✅ Speech recognizer loaded.")

    # Audio Classification pipeline
    sound_classifier = pipeline("audio-classification", model="MIT/ast-finetuned-audioset-10-10-0.4593")
    print("✅ Sound classifier loaded.")
except Exception as e:
    print(f"❌ Error loading models: {e}")
    raise  # Fail fast: the demo cannot run without both models
# 4. DEFINE THE CORE ANALYSIS FUNCTION
def analyze_audio_holistically(audio_path):
    """
    Takes an audio file path and returns a holistic analysis dictionary.
    """
    if audio_path is None:
        return {"Error": "No audio file provided. Please upload a file."}

    print(f"Analyzing audio file: {audio_path}...")
    try:
        # Get transcription
        transcription_result = speech_recognizer(audio_path)
        transcription = transcription_result['text'].strip()

        # Get the top acoustic events
        acoustic_results = sound_classifier(audio_path, top_k=3)
        detected_sounds = {item['label']: round(item['score'], 2) for item in acoustic_results}

        # Fuse the results
        holistic_understanding = {
            "Transcribed Text": transcription,
            "Detected Sounds": detected_sounds,
            "Insight": f"The model detected speech saying '{transcription}' in an environment with sounds like: {', '.join(detected_sounds.keys())}."
        }
        return holistic_understanding
    except Exception as e:
        return {"Error": f"Could not process the audio file. Details: {str(e)}"}
# 5. CREATE AND LAUNCH THE GRADIO INTERFACE
print("πŸš€ Launching Gradio Web Demo...")
iface = gr.Interface(
    fn=analyze_audio_holistically,
    inputs=gr.Audio(type="filepath", label="Upload your Audio File (.wav, .mp3)"),
    outputs=gr.JSON(label="Holistic Analysis Result"),
    title="🔊 Audio Language Model (ALM) Demo By Algo Rangers",
    description="""
    This demo showcases a holistic Audio Language Model.
    Upload an audio file to see a combined analysis of both spoken words (speech) and background sounds (non-speech).
    Built by combining OpenAI's Whisper and MIT's AST model.
    """,
    # Make sure a file named "example.wav" exists in the same directory.
    examples=[["example.wav"]]
)
# This will create the web UI
iface.launch(debug=True)
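# Optional (assumption): passing share=True to launch() would also generate a
# temporary public URL, handy when running outside a hosted environment.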