michon commited on
Commit
df33afb
·
1 Parent(s): d423871

fix voice of tone2

Browse files
mrrrme/audio/voice_emotion.py CHANGED
@@ -70,7 +70,7 @@ class VoiceEmotionWorker:
70
  self.last_error_message = ""
71
  self.last_error_time = 0
72
 
73
- # Check audio device (informational)
74
  try:
75
  devices = sd.query_devices()
76
  if self.device is not None:
@@ -137,13 +137,18 @@ class VoiceEmotionWorker:
137
  """
138
  # Lazy start: Ensure inference loop is running
139
  if not self.running:
 
 
 
140
  self._start_inference_thread()
141
- self.running = True
142
 
143
  try:
144
  self._process_audio_chunk_logic(audio_data)
145
  except Exception as e:
146
  self._log_error(f"External audio processing error: {e}")
 
 
 
147
 
148
  def _process_audio_chunk_logic(self, mono_data):
149
  """Internal logic to slice audio into blocks and run VAD"""
@@ -154,6 +159,10 @@ class VoiceEmotionWorker:
154
  self.audio_chunks_received += 1
155
  hop = int(AUDIO_SR * AUDIO_BLOCK)
156
 
 
 
 
 
157
  for i in range(0, len(mono_data) - hop + 1, hop):
158
  chunk = mono_data[i:i+hop]
159
 
@@ -162,6 +171,13 @@ class VoiceEmotionWorker:
162
  try:
163
  pcm16 = np.clip(chunk * 32768, -32768, 32767).astype(np.int16).tobytes()
164
  is_speech = self.vad.is_speech(pcm16, sample_rate=AUDIO_SR)
 
 
 
 
 
 
 
165
  except Exception as e:
166
  self._log_error(f"VAD error: {e}")
167
 
 
70
  self.last_error_message = ""
71
  self.last_error_time = 0
72
 
73
+ # Check audio device (informational only)
74
  try:
75
  devices = sd.query_devices()
76
  if self.device is not None:
 
137
  """
138
  # Lazy start: Ensure inference loop is running
139
  if not self.running:
140
+ print("[VoiceEmotion] 🚀 Auto-starting inference thread (Server Mode detected)")
141
+ # ✅ CRITICAL FIX: Set running=True BEFORE starting the thread
142
+ self.running = True
143
  self._start_inference_thread()
 
144
 
145
  try:
146
  self._process_audio_chunk_logic(audio_data)
147
  except Exception as e:
148
  self._log_error(f"External audio processing error: {e}")
149
+
150
+ # Alias for compatibility with backend/processing/audio.py
151
+ process_external_audio = add_audio
152
 
153
  def _process_audio_chunk_logic(self, mono_data):
154
  """Internal logic to slice audio into blocks and run VAD"""
 
159
  self.audio_chunks_received += 1
160
  hop = int(AUDIO_SR * AUDIO_BLOCK)
161
 
162
+ # Handle small chunks by buffering if necessary (basic handling here)
163
+ # Note: If chunks are consistently smaller than hop, they might be skipped.
164
+ # WebSocket usually sends ~4096 bytes which is larger than hop (320 samples).
165
+
166
  for i in range(0, len(mono_data) - hop + 1, hop):
167
  chunk = mono_data[i:i+hop]
168
 
 
171
  try:
172
  pcm16 = np.clip(chunk * 32768, -32768, 32767).astype(np.int16).tobytes()
173
  is_speech = self.vad.is_speech(pcm16, sample_rate=AUDIO_SR)
174
+
175
+ # Debug VAD once every ~100 chunks to check signal
176
+ if self.audio_chunks_received % 100 == 0:
177
+ amp = np.max(np.abs(chunk))
178
+ if amp > 0.05:
179
+ print(f"[VoiceEmotion] 🔍 VAD Check: Amp={amp:.3f}, Speech={is_speech}")
180
+
181
  except Exception as e:
182
  self._log_error(f"VAD error: {e}")
183
 
mrrrme/backend/processing/audio.py CHANGED
@@ -17,14 +17,18 @@ async def process_audio_chunk(audio_data_b64: str) -> dict:
17
  # 2. Push to Voice Worker (VITAL FIX for Server Mode)
18
  # This triggers the VAD, speech tracking, and Ring Buffer update
19
  if voice_worker:
20
- voice_worker.process_external_audio(audio_data)
 
 
 
 
21
 
22
  # 3. Maintain legacy buffer for throttling response rate
23
  audio_buffer.append(audio_data)
24
 
25
  if len(audio_buffer) >= AUDIO_BUFFER_SIZE:
26
  # Now get_probs() will actually return fresh data because
27
- # process_external_audio has been feeding the worker
28
  voice_probs, voice_emotion = voice_worker.get_probs()
29
  audio_buffer = audio_buffer[-AUDIO_BUFFER_KEEP:]
30
 
 
17
  # 2. Push to Voice Worker (VITAL FIX for Server Mode)
18
  # This triggers the VAD, speech tracking, and Ring Buffer update
19
  if voice_worker:
20
+ # Convert raw bytes to numpy float32 for consistency if not handled elsewhere
21
+ # Assuming raw PCM 16-bit here
22
+ if len(audio_data) % 2 == 0:
23
+ audio_float = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
24
+ voice_worker.add_audio(audio_float)
25
 
26
  # 3. Maintain legacy buffer for throttling response rate
27
  audio_buffer.append(audio_data)
28
 
29
  if len(audio_buffer) >= AUDIO_BUFFER_SIZE:
30
  # Now get_probs() will actually return fresh data because
31
+ # add_audio has been feeding the worker
32
  voice_probs, voice_emotion = voice_worker.get_probs()
33
  audio_buffer = audio_buffer[-AUDIO_BUFFER_KEEP:]
34