Sridhar44 committed
Commit e131c23 · verified · 1 Parent(s): 56813a3

Create app.py

Files changed (1)
  1. app.py +74 -0
app.py ADDED
@@ -0,0 +1,74 @@
+ # 1. INSTALL LIBRARIES
+ # Make sure to run this in its own cell first if you haven't already
+ # !pip install gradio transformers torch torchaudio librosa soundfile datasets accelerate
+
+ # 2. IMPORT EVERYTHING
+ import gradio as gr
+ import json
+ from transformers import pipeline
+ import warnings
+
+ # Suppress harmless warnings
+ warnings.filterwarnings("ignore")
+
+ print("Setting up the analysis pipelines... (This may take a moment)")
+
+ # 3. LOAD THE MODELS (This is our backend logic from before)
+ try:
+     # Speech Recognition pipeline
+     speech_recognizer = pipeline("automatic-speech-recognition", model="openai/whisper-base")
+     print("✅ Speech recognizer loaded.")
+
+     # Audio Classification pipeline
+     sound_classifier = pipeline("audio-classification", model="MIT/ast-finetuned-audioset-10-10-0.4593")
+     print("✅ Sound classifier loaded.")
+ except Exception as e:
+     print(f"❌ Error loading models: {e}")
+
+ # 4. DEFINE THE CORE ANALYSIS FUNCTION
+ def analyze_audio_holistically(audio_path):
+     """
+     Takes an audio file path and returns a holistic analysis dictionary.
+     """
+     if audio_path is None:
+         return {"Error": "No audio file provided. Please upload a file."}
+
+     print(f"Analysing audio file: {audio_path}...")
+     try:
+         # Get transcription
+         transcription_result = speech_recognizer(audio_path)
+         transcription = transcription_result['text'].strip()
+
+         # Get acoustic events
+         acoustic_results = sound_classifier(audio_path, top_k=3)
+         detected_sounds = {item['label']: round(item['score'], 2) for item in acoustic_results}
+
+         # Fuse the results
+         holistic_understanding = {
+             "Transcribed Text": transcription,
+             "Detected Sounds": detected_sounds,
+             "Insight": f"The model detected speech saying '{transcription}' in an environment with sounds like: {', '.join(detected_sounds.keys())}."
+         }
+
+         return holistic_understanding
+     except Exception as e:
+         return {"Error": f"Could not process the audio file. Details: {str(e)}"}
+
+ # 5. CREATE AND LAUNCH THE GRADIO INTERFACE
+ print("🚀 Launching Gradio Web Demo...")
+
+ iface = gr.Interface(
+     fn=analyze_audio_holistically,
+     inputs=gr.Audio(type="filepath", label="Upload your Audio File (.wav, .mp3)"),
+     outputs=gr.JSON(label="Holistic Analysis Result"),
+     title="🔊 Audio Language Model (ALM) Demo",
+     description="""
+     This demo showcases a holistic Audio Language Model.
+     Upload an audio file to see a combined analysis of both spoken words (speech) and background sounds (non-speech).
+     Built by combining OpenAI's Whisper and MIT's AST model.
+     """,
+     examples=[["path/to/your/example.wav"]]  # Optional: add paths to example files
+ )
+
+ # This will create the web UI
+ iface.launch(debug=True)
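
A quick way to sanity-check the backend without launching the web UI is to call the analysis function directly. The snippet below is a minimal sketch, not part of the commit; it assumes the two pipelines above loaded successfully and that a local test file named sample.wav exists (a hypothetical filename).

# Minimal sketch (not part of the commit): exercise the backend directly,
# assuming the pipelines above loaded and "sample.wav" is a real local file.
import json
result = analyze_audio_holistically("sample.wav")  # hypothetical test file
print(json.dumps(result, indent=2, ensure_ascii=False))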