fix voice of tone2
mrrrme/audio/voice_emotion.py
CHANGED
@@ -70,7 +70,7 @@ class VoiceEmotionWorker:
         self.last_error_message = ""
         self.last_error_time = 0
 
-        # Check audio device (informational)
+        # Check audio device (informational only)
         try:
             devices = sd.query_devices()
             if self.device is not None:
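Note: the device probe above is purely informational; enumeration failures are caught and never stop the worker. A standalone sketch of the same check with the sounddevice package (names here are illustrative, not the worker's code):

    import sounddevice as sd

    # List capture-capable devices; failure is logged, never fatal.
    try:
        for idx, dev in enumerate(sd.query_devices()):
            if dev["max_input_channels"] > 0:
                print(f"[{idx}] {dev['name']} ({dev['max_input_channels']} ch in)")
    except Exception as e:
        print(f"Audio device query failed (non-fatal): {e}")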
@@ -137,13 +137,18 @@ class VoiceEmotionWorker:
         """
         # Lazy start: Ensure inference loop is running
        if not self.running:
+            print("[VoiceEmotion] Auto-starting inference thread (Server Mode detected)")
+            # CRITICAL FIX: Set running=True BEFORE starting the thread
+            self.running = True
             self._start_inference_thread()
-            self.running = True
 
         try:
             self._process_audio_chunk_logic(audio_data)
         except Exception as e:
             self._log_error(f"External audio processing error: {e}")
+
+    # Alias for compatibility with backend/processing/audio.py
+    process_external_audio = add_audio
 
     def _process_audio_chunk_logic(self, mono_data):
         """Internal logic to slice audio into blocks and run VAD"""
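Note: the ordering fix matters if the inference loop gates on the flag, e.g. with a `while self.running:` check. When the flag is set only after `Thread.start()`, the new thread can observe `running == False` on its first check and exit immediately, which is the silent failure this hunk addresses. A minimal sketch of the pattern (hypothetical class, not the worker's actual loop body):

    import threading, time

    class Worker:
        def __init__(self):
            self.running = False

        def _loop(self):
            # Exits instantly if self.running is still False here.
            while self.running:
                time.sleep(0.01)  # stand-in for real inference work

        def start(self):
            self.running = True  # must happen BEFORE the thread starts
            threading.Thread(target=self._loop, daemon=True).start()

The `process_external_audio = add_audio` line binds a second class-level name to the same method, so callers written against the old name keep working.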
@@ -154,6 +159,10 @@
         self.audio_chunks_received += 1
         hop = int(AUDIO_SR * AUDIO_BLOCK)
 
+        # Handle small chunks by buffering if necessary (basic handling here)
+        # Note: If chunks are consistently smaller than hop, they might be skipped.
+        # WebSocket usually sends ~4096 bytes, which is larger than hop (320 samples).
+
         for i in range(0, len(mono_data) - hop + 1, hop):
             chunk = mono_data[i:i+hop]
 
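Note: the slicing loop emits only full hop-sized blocks, so a chunk shorter than `hop` produces no blocks and any tail remainder is dropped. If dropped tails ever matter, the usual fix is a carry-over buffer; a sketch under that assumption (the `pending` state is hypothetical, not something the worker keeps):

    import numpy as np

    HOP = 320  # one 20 ms VAD block at 16 kHz

    def slice_with_carry(pending, new_data):
        """Yield full HOP-sized blocks; return the leftover tail to
        prepend to the next chunk instead of discarding it."""
        data = np.concatenate([pending, new_data])
        n = len(data) // HOP
        blocks = [data[i * HOP:(i + 1) * HOP] for i in range(n)]
        return blocks, data[n * HOP:]

    # Usage: blocks, pending = slice_with_carry(pending, chunk)
    # starting from pending = np.zeros(0, dtype=np.float32)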
@@ -162,6 +171,13 @@
             try:
                 pcm16 = np.clip(chunk * 32768, -32768, 32767).astype(np.int16).tobytes()
                 is_speech = self.vad.is_speech(pcm16, sample_rate=AUDIO_SR)
+
+                # Debug VAD once every ~100 chunks to check signal
+                if self.audio_chunks_received % 100 == 0:
+                    amp = np.max(np.abs(chunk))
+                    if amp > 0.05:
+                        print(f"[VoiceEmotion] VAD Check: Amp={amp:.3f}, Speech={is_speech}")
+
             except Exception as e:
                 self._log_error(f"VAD error: {e}")
 
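Note: the float-to-int16 round trip is what WebRTC-style VADs expect: 16-bit little-endian PCM at 8/16/32/48 kHz in 10/20/30 ms frames. Assuming `self.vad` is a `webrtcvad.Vad` (consistent with this calling convention, though the diff does not show the import), the check in isolation looks like:

    import numpy as np
    import webrtcvad  # assumption: the VAD backing self.vad

    SR = 16000
    vad = webrtcvad.Vad(2)  # aggressiveness 0 (lenient) to 3 (strict)

    def frame_is_speech(chunk: np.ndarray) -> bool:
        """chunk: float32 mono in [-1, 1], exactly 10/20/30 ms long."""
        pcm16 = np.clip(chunk * 32768, -32768, 32767).astype(np.int16).tobytes()
        return vad.is_speech(pcm16, SR)

The new debug print is throttled by chunk count, and the `amp > 0.05` gate keeps near-silent frames out of the log.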
mrrrme/backend/processing/audio.py
CHANGED
@@ -17,14 +17,18 @@ async def process_audio_chunk(audio_data_b64: str) -> dict:
     # 2. Push to Voice Worker (VITAL FIX for Server Mode)
     # This triggers the VAD, speech tracking, and Ring Buffer update
     if voice_worker:
-
+        # Convert raw bytes to numpy float32 for consistency if not handled elsewhere
+        # Assuming raw PCM 16-bit here
+        if len(audio_data) % 2 == 0:
+            audio_float = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
+            voice_worker.add_audio(audio_float)
 
     # 3. Maintain legacy buffer for throttling response rate
     audio_buffer.append(audio_data)
 
     if len(audio_buffer) >= AUDIO_BUFFER_SIZE:
         # Now get_probs() will actually return fresh data because
-        #
+        # add_audio has been feeding the worker
         voice_probs, voice_emotion = voice_worker.get_probs()
         audio_buffer = audio_buffer[-AUDIO_BUFFER_KEEP:]
 
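Note: this hunk assumes `audio_data` already holds the base64-decoded bytes (the handler receives `audio_data_b64`) and that they are raw little-endian 16-bit PCM; the `% 2` guard skips odd-length buffers that cannot be reinterpreted as int16. End to end, the conversion the handler relies on looks like this sketch (function name is illustrative):

    import base64
    import numpy as np

    def decode_pcm16_b64(audio_data_b64: str):
        """base64 text -> float32 mono samples in [-1, 1], or None
        if the payload cannot be int16 (odd byte count)."""
        audio_data = base64.b64decode(audio_data_b64)
        if len(audio_data) % 2 != 0:
            return None
        return np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0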