Spaces:

Sridhar44
/

Algo_Rangers_ALM

Sleeping

App Files Files Community

Sridhar44 committed on Sep 28

Commit

e131c23

verified ·

1 Parent(s): 56813a3

Create app.py

Browse files

Files changed (1) hide show

app.py +74 -0

app.py ADDED Viewed

	@@ -0,0 +1,74 @@

+# 1. INSTALL LIBRARIES
+# In a notebook, run the command below in its own cell first; on Hugging Face Spaces, list these packages in requirements.txt instead
+# !pip install gradio transformers torch torchaudio librosa soundfile datasets accelerate
+# 2. IMPORT EVERYTHING
+import gradio as gr
+import json
+from transformers import pipeline
+import warnings
+# Suppress harmless warnings
+warnings.filterwarnings("ignore")
+print("Setting up the analysis pipelines... (This may take a moment)")
+# 3. LOAD THE MODELS (This is our backend logic from before)
+try:
+    # Speech Recognition pipeline
+    speech_recognizer = pipeline("automatic-speech-recognition", model="openai/whisper-base")
+    print("✅ Speech recognizer loaded.")
+    # Audio Classification pipeline
+    sound_classifier = pipeline("audio-classification", model="MIT/ast-finetuned-audioset-10-10-0.4593")
+    print("✅ Sound classifier loaded.")
+except Exception as e:
+    print(f"❌ Error loading models: {e}")
+# 4. DEFINE THE CORE ANALYSIS FUNCTION
+def analyze_audio_holistically(audio_path):
+    """
+    Takes an audio file path and returns a holistic analysis dictionary.
+    """
+    if audio_path is None:
+        return {"Error": "No audio file provided. Please upload a file."}
+    print(f"Analysing audio file: {audio_path}...")
+    try:
+        # Get transcription
+        transcription_result = speech_recognizer(audio_path)
+        transcription = transcription_result['text'].strip()
+        # Get acoustic events
+        acoustic_results = sound_classifier(audio_path, top_k=3)
+        detected_sounds = {item['label']: round(item['score'], 2) for item in acoustic_results}
+        # Fuse the results
+        holistic_understanding = {
+            "Transcribed Text": transcription,
+            "Detected Sounds": detected_sounds,
+            "Insight": f"The model detected speech saying '{transcription}' in an environment with sounds like: {', '.join(detected_sounds.keys())}."
+        }
+        return holistic_understanding
+    except Exception as e:
+        return {"Error": f"Could not process the audio file. Details: {str(e)}"}
+# 5. CREATE AND LAUNCH THE GRADIO INTERFACE
+print("🚀 Launching Gradio Web Demo...")
+iface = gr.Interface(
+    fn=analyze_audio_holistically,
+    inputs=gr.Audio(type="filepath", label="Upload your Audio File (.wav, .mp3)"),
+    outputs=gr.JSON(label="Holistic Analysis Result"),
+    title="🔊 Audio Language Model (ALM) Demo",
+    description="""
+    This demo showcases a holistic Audio Language Model.
+    Upload an audio file to see a combined analysis of both spoken words (speech) and background sounds (non-speech).
+    Built by combining OpenAI's Whisper and MIT's AST model.
+    """,
+    examples=[["path/to/your/example.wav"]] # Optional: add paths to example files
+)
+# This will create the web UI
+iface.launch(debug=True)