fix voice of tone2
mrrrme/audio/voice_emotion.py
CHANGED
@@ -70,7 +70,7 @@ class VoiceEmotionWorker:
         self.last_error_message = ""
         self.last_error_time = 0
 
-        # Check audio device (informational)
+        # Check audio device (informational only)
         try:
             devices = sd.query_devices()
             if self.device is not None:
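Note: the device probe above is purely informational; enumeration failures are caught and never stop the worker. A standalone sketch of the same check with the sounddevice package (names here are illustrative, not the worker's code):

    import sounddevice as sd

    # List capture-capable devices; failure is logged, never fatal.
    try:
        for idx, dev in enumerate(sd.query_devices()):
            if dev["max_input_channels"] > 0:
                print(f"[{idx}] {dev['name']} ({dev['max_input_channels']} ch in)")
    except Exception as e:
        print(f"Audio device query failed (non-fatal): {e}")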
@@ -137,13 +137,18 @@ class VoiceEmotionWorker:
         """
         # Lazy start: Ensure inference loop is running
        if not self.running:
+            print("[VoiceEmotion] Auto-starting inference thread (Server Mode detected)")
+            # CRITICAL FIX: Set running=True BEFORE starting the thread
+            self.running = True
             self._start_inference_thread()
-            self.running = True
 
         try:
             self._process_audio_chunk_logic(audio_data)
         except Exception as e:
             self._log_error(f"External audio processing error: {e}")
+
+    # Alias for compatibility with backend/processing/audio.py
+    process_external_audio = add_audio
 
     def _process_audio_chunk_logic(self, mono_data):
         """Internal logic to slice audio into blocks and run VAD"""
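Note: the ordering fix matters if the inference loop gates on the flag, e.g. with a `while self.running:` check. When the flag is set only after `Thread.start()`, the new thread can observe `running == False` on its first check and exit immediately, which is the silent failure this hunk addresses. A minimal sketch of the pattern (hypothetical class, not the worker's actual loop body):

    import threading, time

    class Worker:
        def __init__(self):
            self.running = False

        def _loop(self):
            # Exits instantly if self.running is still False here.
            while self.running:
                time.sleep(0.01)  # stand-in for real inference work

        def start(self):
            self.running = True  # must happen BEFORE the thread starts
            threading.Thread(target=self._loop, daemon=True).start()

The `process_external_audio = add_audio` line binds a second class-level name to the same method, so callers written against the old name keep working.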
@@ -154,6 +159,10 @@
         self.audio_chunks_received += 1
         hop = int(AUDIO_SR * AUDIO_BLOCK)
 
+        # Handle small chunks by buffering if necessary (basic handling here)
+        # Note: If chunks are consistently smaller than hop, they might be skipped.
+        # WebSocket usually sends ~4096 bytes, which is larger than hop (320 samples).
+
         for i in range(0, len(mono_data) - hop + 1, hop):
             chunk = mono_data[i:i+hop]
 
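Note: the slicing loop emits only full hop-sized blocks, so a chunk shorter than `hop` produces no blocks and any tail remainder is dropped. If dropped tails ever matter, the usual fix is a carry-over buffer; a sketch under that assumption (the `pending` state is hypothetical, not something the worker keeps):

    import numpy as np

    HOP = 320  # one 20 ms VAD block at 16 kHz

    def slice_with_carry(pending, new_data):
        """Yield full HOP-sized blocks; return the leftover tail to
        prepend to the next chunk instead of discarding it."""
        data = np.concatenate([pending, new_data])
        n = len(data) // HOP
        blocks = [data[i * HOP:(i + 1) * HOP] for i in range(n)]
        return blocks, data[n * HOP:]

    # Usage: blocks, pending = slice_with_carry(pending, chunk)
    # starting from pending = np.zeros(0, dtype=np.float32)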
@@ -162,6 +171,13 @@
             try:
                 pcm16 = np.clip(chunk * 32768, -32768, 32767).astype(np.int16).tobytes()
                 is_speech = self.vad.is_speech(pcm16, sample_rate=AUDIO_SR)
+
+                # Debug VAD once every ~100 chunks to check signal
+                if self.audio_chunks_received % 100 == 0:
+                    amp = np.max(np.abs(chunk))
+                    if amp > 0.05:
+                        print(f"[VoiceEmotion] VAD Check: Amp={amp:.3f}, Speech={is_speech}")
+
             except Exception as e:
                 self._log_error(f"VAD error: {e}")
 
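Note: the float-to-int16 round trip is what WebRTC-style VADs expect: 16-bit little-endian PCM at 8/16/32/48 kHz in 10/20/30 ms frames. Assuming `self.vad` is a `webrtcvad.Vad` (consistent with this calling convention, though the diff does not show the import), the check in isolation looks like:

    import numpy as np
    import webrtcvad  # assumption: the VAD backing self.vad

    SR = 16000
    vad = webrtcvad.Vad(2)  # aggressiveness 0 (lenient) to 3 (strict)

    def frame_is_speech(chunk: np.ndarray) -> bool:
        """chunk: float32 mono in [-1, 1], exactly 10/20/30 ms long."""
        pcm16 = np.clip(chunk * 32768, -32768, 32767).astype(np.int16).tobytes()
        return vad.is_speech(pcm16, SR)

The new debug print is throttled by chunk count, and the `amp > 0.05` gate keeps near-silent frames out of the log.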
mrrrme/backend/processing/audio.py
CHANGED
@@ -17,14 +17,18 @@ async def process_audio_chunk(audio_data_b64: str) -> dict:
     # 2. Push to Voice Worker (VITAL FIX for Server Mode)
     # This triggers the VAD, speech tracking, and Ring Buffer update
     if voice_worker:
-
+        # Convert raw bytes to numpy float32 for consistency if not handled elsewhere
+        # Assuming raw PCM 16-bit here
+        if len(audio_data) % 2 == 0:
+            audio_float = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
+            voice_worker.add_audio(audio_float)
 
     # 3. Maintain legacy buffer for throttling response rate
     audio_buffer.append(audio_data)
 
     if len(audio_buffer) >= AUDIO_BUFFER_SIZE:
         # Now get_probs() will actually return fresh data because
-        #
+        # add_audio has been feeding the worker
         voice_probs, voice_emotion = voice_worker.get_probs()
         audio_buffer = audio_buffer[-AUDIO_BUFFER_KEEP:]
 
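Note: this hunk assumes `audio_data` already holds the base64-decoded bytes (the handler receives `audio_data_b64`) and that they are raw little-endian 16-bit PCM; the `% 2` guard skips odd-length buffers that cannot be reinterpreted as int16. End to end, the conversion the handler relies on looks like this sketch (function name is illustrative):

    import base64
    import numpy as np

    def decode_pcm16_b64(audio_data_b64: str):
        """base64 text -> float32 mono samples in [-1, 1], or None
        if the payload cannot be int16 (odd byte count)."""
        audio_data = base64.b64decode(audio_data_b64)
        if len(audio_data) % 2 != 0:
            return None
        return np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0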