Alina committed on
Commit 6a4b7f3 · unverified · 2 parents: b30c470 e006190

Merge pull request #57 from pollen-robotics/add-github-workflows

.github/workflows/tests.yml ADDED
@@ -0,0 +1,39 @@
+ name: Tests
+ on:
+   push:
+   pull_request:
+
+ permissions:
+   contents: read
+
+ concurrency:
+   group: ${{ github.workflow }}-${{ github.ref }}
+   cancel-in-progress: true
+
+ jobs:
+   tests:
+     name: pytest (py${{ matrix.python-version }})
+     runs-on: ubuntu-latest
+     timeout-minutes: 15
+     strategy:
+       fail-fast: false
+       matrix:
+         python-version: ["3.12"]
+
+     steps:
+       - uses: actions/checkout@v4
+
+       - uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python-version }}
+
+       - uses: astral-sh/setup-uv@v5
+
+       - name: Install (locked)
+         env:
+           GIT_LFS_SKIP_SMUDGE: "1"
+         run: |
+           uv sync --frozen --group dev --extra all_vision
+
+       - name: Run tests
+         run: uv run pytest -q
.github/workflows/typecheck.yml ADDED
@@ -0,0 +1,29 @@
+ name: Type check
+
+ on: [push, pull_request]
+
+ permissions:
+   contents: read
+
+ concurrency:
+   group: ${{ github.workflow }}-${{ github.ref }}
+   cancel-in-progress: true
+
+ jobs:
+   mypy:
+     runs-on: ubuntu-latest
+     timeout-minutes: 10
+     steps:
+       - uses: actions/checkout@v4
+
+       - uses: actions/setup-python@v5
+         with:
+           python-version: "3.12"
+
+       - uses: astral-sh/setup-uv@v5
+
+       - name: Install deps (locked) incl. vision extras
+         run: uv sync --frozen --group dev --extra all_vision
+
+       - name: Run mypy
+         run: uv run mypy --pretty --show-error-codes .
.gitignore CHANGED
@@ -29,6 +29,7 @@ coverage.xml
 
 # Linting and formatting
 .ruff_cache/
+ .mypy_cache/
 
 # IDE
 .vscode/
pyproject.toml CHANGED
@@ -12,7 +12,7 @@ requires-python = ">=3.10"
 dependencies = [
 #Media
 "aiortc>=1.13.0",
- "fastrtc@git+ssh://git@github.com/gradio-app/fastrtc.git@main",
+ "fastrtc>=0.0.33",
 "gradio>=5.49.0",
 "huggingface_hub>=0.34.4",
 "opencv-python>=4.12.0.88",
@@ -23,7 +23,7 @@ dependencies = [
 #OpenAI
 "openai>=2.1",
 
- #Reachy mini
+ #Reachy mini
 "reachy_mini_dances_library",
 "reachy_mini_toolbox",
 "reachy_mini>=1.0.0.rc4",
@@ -40,7 +40,11 @@ all_vision = [
 ]
 
 [dependency-groups]
- dev = ["pytest", "ruff==0.12.0"]
+ dev = [
+ "pytest",
+ "ruff==0.12.0",
+ "mypy==1.18.2",
+ ]
 
 [project.scripts]
 reachy-mini-conversation-demo = "reachy_mini_conversation_demo.main:main"
@@ -88,3 +92,11 @@ quote-style = "double"
 indent-style = "space"
 skip-magic-trailing-comma = false
 line-ending = "auto"
+
+ [tool.mypy]
+ python_version = "3.12"
+ files = ["src/"]
+ ignore_missing_imports = true
+ strict = true
+ show_error_codes = true
+ warn_unused_ignores = true
src/reachy_mini_conversation_demo/audio/head_wobbler.py CHANGED
@@ -5,9 +5,11 @@ import queue
 import base64
 import logging
 import threading
- from typing import Tuple, Optional
+ from typing import Tuple
+ from collections.abc import Callable
 
 import numpy as np
+ from numpy.typing import NDArray
 
 from reachy_mini_conversation_demo.audio.speech_tapper import HOP_MS, SwayRollRT
 
@@ -20,13 +22,13 @@ logger = logging.getLogger(__name__)
 class HeadWobbler:
 """Converts audio deltas (base64) into head movement offsets."""
 
- def __init__(self, set_speech_offsets):
+ def __init__(self, set_speech_offsets: Callable[[Tuple[float, float, float, float, float, float]], None]) -> None:
 """Initialize the head wobbler."""
 self._apply_offsets = set_speech_offsets
- self._base_ts: Optional[float] = None
+ self._base_ts: float | None = None
 self._hops_done: int = 0
 
- self.audio_queue: queue.Queue[Tuple[int, int, np.ndarray]] = queue.Queue()
+ self.audio_queue: "queue.Queue[Tuple[int, int, NDArray[np.int16]]]" = queue.Queue()
 self.sway = SwayRollRT()
 
 # Synchronization primitives
@@ -35,7 +37,7 @@ class HeadWobbler:
 self._generation = 0
 
 self._stop_event = threading.Event()
- self._thread: Optional[threading.Thread] = None
+ self._thread: threading.Thread | None = None
 
 def feed(self, delta_b64: str) -> None:
 """Thread-safe: push audio into the consumer queue."""
src/reachy_mini_conversation_demo/audio/speech_tapper.py CHANGED
@@ -1,10 +1,11 @@
 from __future__ import annotations
 import math
- from typing import Dict, List, Optional
+ from typing import Any, Dict, List
 from itertools import islice
 from collections import deque
 
 import numpy as np
+ from numpy.typing import NDArray
 
 
 # Tunables
@@ -48,7 +49,7 @@ SWAY_ATTACK_FR = max(1, int(SWAY_ATTACK_MS / HOP_MS))
 SWAY_RELEASE_FR = max(1, int(SWAY_RELEASE_MS / HOP_MS))
 
 
- def _rms_dbfs(x: np.ndarray) -> float:
+ def _rms_dbfs(x: NDArray[np.float32]) -> float:
 """Root-mean-square in dBFS for float32 mono array in [-1,1]."""
 # numerically stable rms (avoid overflow)
 x = x.astype(np.float32, copy=False)
@@ -66,7 +67,7 @@ def _loudness_gain(db: float, offset: float = SENS_DB_OFFSET) -> float:
 return t**LOUDNESS_GAMMA if LOUDNESS_GAMMA != 1.0 else t
 
 
- def _to_float32_mono(x: np.ndarray) -> np.ndarray:
+ def _to_float32_mono(x: NDArray[Any]) -> NDArray[np.float32]:
 """Convert arbitrary PCM array to float32 mono in [-1,1].
 
 Accepts shapes: (N,), (1,N), (N,1), (C,N), (N,C).
@@ -94,7 +95,7 @@ def _to_float32_mono(x: np.ndarray) -> np.ndarray:
 return a.astype(np.float32) / (scale if scale != 0.0 else 1.0)
 
 
- def _resample_linear(x: np.ndarray, sr_in: int, sr_out: int) -> np.ndarray:
+ def _resample_linear(x: NDArray[np.float32], sr_in: int, sr_out: int) -> NDArray[np.float32]:
 """Lightweight linear resampler for short buffers."""
 if sr_in == sr_out or x.size == 0:
 return x
@@ -118,8 +119,8 @@ class SwayRollRT:
 def __init__(self, rng_seed: int = 7):
 """Initialize state."""
 self._seed = int(rng_seed)
- self.samples = deque(maxlen=10 * SR) # sliding window for VAD/env
- self.carry = np.zeros(0, dtype=np.float32)
+ self.samples: deque[float] = deque(maxlen=10 * SR) # sliding window for VAD/env
+ self.carry: NDArray[np.float32] = np.zeros(0, dtype=np.float32)
 
 self.vad_on = False
 self.vad_above = 0
@@ -150,7 +151,7 @@ class SwayRollRT:
 self.sway_down = 0
 self.t = 0.0
 
- def feed(self, pcm: np.ndarray, sr: Optional[int]) -> List[Dict[str, float]]:
+ def feed(self, pcm: NDArray[Any], sr: int | None) -> List[Dict[str, float]]:
 """Stream in PCM chunk. Returns a list of sway dicts, one per hop (HOP_MS).
 
 Args:
@@ -177,7 +178,8 @@
 
 while self.carry.size >= HOP:
 hop = self.carry[:HOP]
- self.carry = self.carry[HOP:]
+ remaining: NDArray[np.float32] = self.carry[HOP:]
+ self.carry = remaining
 
 # keep sliding window for VAD/env computation
 # (deque accepts any iterable; list() for small HOP is fine)
@@ -260,7 +262,7 @@ class SwayRollRT:
 "x_mm": x_mm,
 "y_mm": y_mm,
 "z_mm": z_mm,
- }
+ },
 )
 
 return out
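Given the new `feed(pcm: NDArray[Any], sr: int | None) -> List[Dict[str, float]]` signature, a minimal usage sketch (buffer length and sample rate below are illustrative assumptions) might look like:

    import numpy as np

    tapper = SwayRollRT()
    pcm = np.zeros(1600, dtype=np.int16)  # assumed ~100 ms of silence at an assumed 16 kHz rate
    for sway in tapper.feed(pcm, sr=16000):
        # one dict per HOP_MS hop; keys include "x_mm", "y_mm", "z_mm" as shown above
        print(sway["x_mm"], sway["y_mm"], sway["z_mm"])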
src/reachy_mini_conversation_demo/camera_worker.py CHANGED
@@ -9,10 +9,11 @@ Ported from main_works.py camera_worker() function to provide:
 import time
 import logging
 import threading
- from typing import Tuple, Optional
+ from typing import Any, List, Tuple
 
 import cv2
 import numpy as np
+ from numpy.typing import NDArray
 from scipy.spatial.transform import Rotation as R
 
 from reachy_mini import ReachyMini
@@ -25,20 +26,20 @@ logger = logging.getLogger(__name__)
 class CameraWorker:
 """Thread-safe camera worker with frame buffering and face tracking."""
 
- def __init__(self, reachy_mini: ReachyMini, head_tracker=None):
+ def __init__(self, reachy_mini: ReachyMini, head_tracker: Any = None) -> None:
 """Initialize."""
 self.reachy_mini = reachy_mini
 self.head_tracker = head_tracker
 
 # Thread-safe frame storage
- self.latest_frame: Optional[np.ndarray] = None
+ self.latest_frame: NDArray[np.uint8] | None = None
 self.frame_lock = threading.Lock()
 self._stop_event = threading.Event()
- self._thread: Optional[threading.Thread] = None
+ self._thread: threading.Thread | None = None
 
 # Face tracking state
 self.is_head_tracking_enabled = True
- self.face_tracking_offsets = [
+ self.face_tracking_offsets: List[float] = [
 0.0,
 0.0,
 0.0,
@@ -49,31 +50,31 @@ class CameraWorker:
 self.face_tracking_lock = threading.Lock()
 
 # Face tracking timing variables (same as main_works.py)
- self.last_face_detected_time: Optional[float] = None
- self.interpolation_start_time: Optional[float] = None
- self.interpolation_start_pose: Optional[np.ndarray] = None
+ self.last_face_detected_time: float | None = None
+ self.interpolation_start_time: float | None = None
+ self.interpolation_start_pose: NDArray[np.float32] | None = None
 self.face_lost_delay = 2.0 # seconds to wait before starting interpolation
 self.interpolation_duration = 1.0 # seconds to interpolate back to neutral
 
 # Track state changes
 self.previous_head_tracking_state = self.is_head_tracking_enabled
 
- def get_latest_frame(self) -> Optional[np.ndarray]:
+ def get_latest_frame(self) -> NDArray[np.uint8] | None:
 """Get the latest frame (thread-safe)."""
 with self.frame_lock:
 if self.latest_frame is None:
 return None
- else:
- frame = self.latest_frame.copy()
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
- return frame
+ frame = self.latest_frame.copy()
+ frame_rgb: NDArray[np.uint8] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) # type: ignore[assignment]
+ return frame_rgb
 
 def get_face_tracking_offsets(
 self,
 ) -> Tuple[float, float, float, float, float, float]:
 """Get current face tracking offsets (thread-safe)."""
 with self.face_tracking_lock:
- return tuple(self.face_tracking_offsets)
+ offsets = self.face_tracking_offsets
+ return (offsets[0], offsets[1], offsets[2], offsets[3], offsets[4], offsets[5])
 
 def set_head_tracking_enabled(self, enabled: bool) -> None:
 """Enable/disable head tracking."""
@@ -168,12 +169,11 @@ class CameraWorker:
 rotation[2], # roll, pitch, yaw
 ]
 
- else:
- # No face detected while tracking enabled - set face lost timestamp
- if self.last_face_detected_time is None or self.last_face_detected_time == current_time:
- # Only update if we haven't already set a face lost time
- # (current_time check prevents overriding the disable-triggered timestamp)
- pass
+ # No face detected while tracking enabled - set face lost timestamp
+ elif self.last_face_detected_time is None or self.last_face_detected_time == current_time:
+ # Only update if we haven't already set a face lost time
+ # (current_time check prevents overriding the disable-triggered timestamp)
+ pass
 
 # Handle smooth interpolation (works for both face-lost and tracking-disabled cases)
 if self.last_face_detected_time is not None:
@@ -188,11 +188,12 @@
 current_translation = self.face_tracking_offsets[:3]
 current_rotation_euler = self.face_tracking_offsets[3:]
 # Convert to 4x4 pose matrix
- self.interpolation_start_pose = np.eye(4)
- self.interpolation_start_pose[:3, 3] = current_translation
- self.interpolation_start_pose[:3, :3] = R.from_euler(
- "xyz", current_rotation_euler
+ pose_matrix = np.eye(4, dtype=np.float32)
+ pose_matrix[:3, 3] = current_translation
+ pose_matrix[:3, :3] = R.from_euler(
+ "xyz", current_rotation_euler,
 ).as_matrix()
+ self.interpolation_start_pose = pose_matrix
 
 # Calculate interpolation progress (t from 0 to 1)
 elapsed_interpolation = current_time - self.interpolation_start_time
@@ -200,7 +201,7 @@
 
 # Interpolate between current pose and neutral pose
 interpolated_pose = linear_pose_interpolation(
- self.interpolation_start_pose, neutral_pose, t
+ self.interpolation_start_pose, neutral_pose, t,
 )
 
 # Extract translation and rotation from interpolated pose
src/reachy_mini_conversation_demo/config.py CHANGED
@@ -13,13 +13,13 @@ if not env_file.exists():
 raise RuntimeError(
 ".env file not found. Please create one based on .env.example:\n"
 " cp .env.example .env\n"
- "Then add your OPENAI_API_KEY to the .env file."
+ "Then add your OPENAI_API_KEY to the .env file.",
 )
 
 # Load .env and verify it was loaded successfully
 if not load_dotenv():
 raise RuntimeError(
- "Failed to load .env file. Please ensure the file is readable and properly formatted."
+ "Failed to load .env file. Please ensure the file is readable and properly formatted.",
 )
 
 logger.info("Configuration loaded from .env file")
@@ -33,11 +33,11 @@ class Config:
 if OPENAI_API_KEY is None:
 raise RuntimeError(
 "OPENAI_API_KEY is not set in .env file. Please add it:\n"
- " OPENAI_API_KEY=your_api_key_here"
+ " OPENAI_API_KEY=your_api_key_here",
 )
 if not OPENAI_API_KEY.strip():
 raise RuntimeError(
- "OPENAI_API_KEY is empty in .env file. Please provide a valid API key."
+ "OPENAI_API_KEY is empty in .env file. Please provide a valid API key.",
 )
 
 # Optional
src/reachy_mini_conversation_demo/console.py CHANGED
@@ -5,6 +5,7 @@ records mic frames to the handler and plays handler audio frames to the speaker.
 
 import asyncio
 import logging
+ from typing import List
 
 import librosa
 from fastrtc import AdditionalOutputs, audio_to_int16, audio_to_float32
@@ -24,9 +25,9 @@ class LocalStream:
 self.handler = handler
 self._robot = robot
 self._stop_event = asyncio.Event()
- self._tasks = []
+ self._tasks: List[asyncio.Task[None]] = []
 # Allow the handler to flush the player queue when appropriate.
- self.handler._clear_queue = self.clear_audio_queue # type: ignore[assignment]
+ self.handler._clear_queue = self.clear_audio_queue
 
 def launch(self) -> None:
 """Start the recorder/player and run the async processing loops."""
@@ -105,12 +106,12 @@ class LocalStream:
 elif isinstance(handler_output, tuple):
 input_sample_rate, audio_frame = handler_output
 device_sample_rate = self._robot.media.get_audio_samplerate()
- audio_frame = audio_to_float32(audio_frame.squeeze())
+ audio_frame_float = audio_to_float32(audio_frame.squeeze())
 if input_sample_rate != device_sample_rate:
- audio_frame = librosa.resample(
- audio_frame, orig_sr=input_sample_rate, target_sr=device_sample_rate
+ audio_frame_float = librosa.resample(
+ audio_frame_float, orig_sr=input_sample_rate, target_sr=device_sample_rate,
 )
- self._robot.media.push_audio_sample(audio_frame)
+ self._robot.media.push_audio_sample(audio_frame_float)
 
 else:
 logger.debug("Ignoring output type=%s", type(handler_output).__name__)
src/reachy_mini_conversation_demo/dance_emotion_moves.py CHANGED
@@ -9,6 +9,7 @@ import logging
 from typing import Tuple
 
 import numpy as np
+ from numpy.typing import NDArray
 
 from reachy_mini.motion.move import Move
 from reachy_mini.motion.recorded_move import RecordedMoves
@@ -18,7 +19,7 @@ from reachy_mini_dances_library.dance_move import DanceMove
 logger = logging.getLogger(__name__)
 
 
- class DanceQueueMove(Move):
+ class DanceQueueMove(Move): # type: ignore
 """Wrapper for dance moves to work with the movement queue system."""
 
 def __init__(self, move_name: str):
@@ -29,9 +30,9 @@ class DanceQueueMove(Move):
 @property
 def duration(self) -> float:
 """Duration property required by official Move interface."""
- return self.dance_move.duration
+ return float(self.dance_move.duration)
 
- def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+ def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
 """Evaluate dance move at time t."""
 try:
 # Get the pose from the dance move
@@ -49,10 +50,10 @@ class DanceQueueMove(Move):
 from reachy_mini.utils import create_head_pose
 
 neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
- return (neutral_head_pose, np.array([0.0, 0.0]), 0.0)
+ return (neutral_head_pose, np.array([0.0, 0.0], dtype=np.float64), 0.0)
 
 
- class EmotionQueueMove(Move):
+ class EmotionQueueMove(Move): # type: ignore
 """Wrapper for emotion moves to work with the movement queue system."""
 
 def __init__(self, emotion_name: str, recorded_moves: RecordedMoves):
@@ -63,9 +64,9 @@ class EmotionQueueMove(Move):
 @property
 def duration(self) -> float:
 """Duration property required by official Move interface."""
- return self.emotion_move.duration
+ return float(self.emotion_move.duration)
 
- def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+ def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
 """Evaluate emotion move at time t."""
 try:
 # Get the pose from the emotion move
@@ -83,20 +84,20 @@ class EmotionQueueMove(Move):
 from reachy_mini.utils import create_head_pose
 
 neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
- return (neutral_head_pose, np.array([0.0, 0.0]), 0.0)
+ return (neutral_head_pose, np.array([0.0, 0.0], dtype=np.float64), 0.0)
 
 
- class GotoQueueMove(Move):
+ class GotoQueueMove(Move): # type: ignore
 """Wrapper for goto moves to work with the movement queue system."""
 
 def __init__(
 self,
- target_head_pose: np.ndarray,
- start_head_pose: np.ndarray = None,
+ target_head_pose: NDArray[np.float32],
+ start_head_pose: NDArray[np.float32] | None = None,
 target_antennas: Tuple[float, float] = (0, 0),
- start_antennas: Tuple[float, float] = None,
+ start_antennas: Tuple[float, float] | None = None,
 target_body_yaw: float = 0,
- start_body_yaw: float = None,
+ start_body_yaw: float | None = None,
 duration: float = 1.0,
 ):
 """Initialize a GotoQueueMove."""
@@ -113,7 +114,7 @@ class GotoQueueMove(Move):
 """Duration property required by official Move interface."""
 return self._duration
 
- def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+ def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
 """Evaluate goto move at time t using linear interpolation."""
 try:
 from reachy_mini.utils import create_head_pose
@@ -136,7 +137,8 @@ class GotoQueueMove(Move):
 [
 self.start_antennas[0] + (self.target_antennas[0] - self.start_antennas[0]) * t_clamped,
 self.start_antennas[1] + (self.target_antennas[1] - self.start_antennas[1]) * t_clamped,
- ]
+ ],
+ dtype=np.float64,
 )
 
 # Interpolate body yaw
@@ -146,6 +148,7 @@ class GotoQueueMove(Move):
 
 except Exception as e:
 logger.error(f"Error evaluating goto move at t={t}: {e}")
- # Return target pose on error - convert antennas to numpy array
- target_antennas_array = np.array([self.target_antennas[0], self.target_antennas[1]])
- return (self.target_head_pose, target_antennas_array, self.target_body_yaw)
+ # Return target pose on error - convert to float64
+ target_head_pose_f64 = self.target_head_pose.astype(np.float64)
+ target_antennas_array = np.array([self.target_antennas[0], self.target_antennas[1]], dtype=np.float64)
+ return (target_head_pose_f64, target_antennas_array, self.target_body_yaw)
src/reachy_mini_conversation_demo/main.py CHANGED
@@ -2,6 +2,7 @@
 
 import os
 import sys
+ from typing import Any, Dict, List
 
 import gradio as gr
 from fastapi import FastAPI
@@ -20,13 +21,13 @@ from reachy_mini_conversation_demo.openai_realtime import OpenaiRealtimeHandler
 from reachy_mini_conversation_demo.audio.head_wobbler import HeadWobbler
 
 
- def update_chatbot(chatbot: list[dict], response: dict):
+ def update_chatbot(chatbot: List[Dict[str, Any]], response: Dict[str, Any]) -> List[Dict[str, Any]]:
 """Update the chatbot with AdditionalOutputs."""
 chatbot.append(response)
 return chatbot
 
 
- def main():
+ def main() -> None:
 """Entrypoint for the Reachy Mini conversation demo."""
 args = parse_args()
 
@@ -41,7 +42,7 @@ def main():
 # Check if running in simulation mode without --gradio
 if robot.client.get_status()["simulation_enabled"] and not args.gradio:
 logger.error(
- "Simulation mode requires Gradio interface. Please use --gradio flag when running in simulation mode."
+ "Simulation mode requires Gradio interface. Please use --gradio flag when running in simulation mode.",
 )
 robot.client.disconnect()
 sys.exit(1)
@@ -76,7 +77,7 @@ def main():
 
 handler = OpenaiRealtimeHandler(deps)
 
- stream_manager = None
+ stream_manager: gr.Blocks | LocalStream | None = None
 
 if args.gradio:
 stream = Stream(
src/reachy_mini_conversation_demo/moves.py CHANGED
@@ -36,11 +36,12 @@ import time
 import logging
 import threading
 from queue import Empty, Queue
- from typing import Any, Tuple, Optional
+ from typing import Any, Dict, Tuple
 from collections import deque
 from dataclasses import dataclass
 
 import numpy as np
+ from numpy.typing import NDArray
 
 from reachy_mini import ReachyMini
 from reachy_mini.utils import create_head_pose
@@ -57,15 +58,15 @@ logger = logging.getLogger(__name__)
 CONTROL_LOOP_FREQUENCY_HZ = 100.0 # Hz - Target frequency for the movement control loop
 
 # Type definitions
- FullBodyPose = Tuple[np.ndarray, Tuple[float, float], float] # (head_pose_4x4, antennas, body_yaw)
+ FullBodyPose = Tuple[NDArray[np.float32], Tuple[float, float], float] # (head_pose_4x4, antennas, body_yaw)
 
 
- class BreathingMove(Move):
+ class BreathingMove(Move): # type: ignore
 """Breathing move with interpolation to neutral and then continuous breathing patterns."""
 
 def __init__(
 self,
- interpolation_start_pose: np.ndarray,
+ interpolation_start_pose: NDArray[np.float32],
 interpolation_start_antennas: Tuple[float, float],
 interpolation_duration: float = 1.0,
 ):
@@ -96,7 +97,7 @@ class BreathingMove(Move):
 """Duration property required by official Move interface."""
 return float("inf") # Continuous breathing (never ends naturally)
 
- def evaluate(self, t: float) -> tuple[np.ndarray | None, np.ndarray | None, float | None]:
+ def evaluate(self, t: float) -> tuple[NDArray[np.float64] | None, NDArray[np.float64] | None, float | None]:
 """Evaluate breathing move at time t."""
 if t < self.interpolation_duration:
 # Phase 1: Interpolate to neutral base position
@@ -104,13 +105,14 @@ class BreathingMove(Move):
 
 # Interpolate head pose
 head_pose = linear_pose_interpolation(
- self.interpolation_start_pose, self.neutral_head_pose, interpolation_t
+ self.interpolation_start_pose, self.neutral_head_pose, interpolation_t,
 )
 
 # Interpolate antennas
- antennas = (
+ antennas_interp = (
 1 - interpolation_t
 ) * self.interpolation_start_antennas + interpolation_t * self.neutral_antennas
+ antennas = antennas_interp.astype(np.float64)
 
 else:
 # Phase 2: Breathing patterns from neutral base
@@ -122,7 +124,7 @@
 
 # Antenna sway (opposite directions)
 antenna_sway = self.antenna_sway_amplitude * np.sin(2 * np.pi * self.antenna_frequency * breathing_time)
- antennas = np.array([antenna_sway, -antenna_sway])
+ antennas = np.array([antenna_sway, -antenna_sway], dtype=np.float64)
 
 # Return in official Move interface format: (head_pose, antennas_array, body_yaw)
 return (head_pose, antennas, 0.0)
@@ -168,8 +170,8 @@
 """State tracking for the movement system."""
 
 # Primary move state
- current_move: Optional[Move] = None
- move_start_time: Optional[float] = None
+ current_move: Move | None = None
+ move_start_time: float | None = None
 last_activity_time: float = 0.0
 
 # Secondary move state (offsets)
@@ -191,7 +193,7 @@
 )
 
 # Status flags
- last_primary_pose: Optional[FullBodyPose] = None
+ last_primary_pose: FullBodyPose | None = None
 
 def update_activity(self) -> None:
 """Update the last activity time."""
@@ -242,7 +244,7 @@
 def __init__(
 self,
 current_robot: ReachyMini,
- camera_worker=None,
+ camera_worker: "Any" = None,
 ):
 """Initialize movement manager."""
 self.current_robot = current_robot
@@ -258,7 +260,7 @@
 self.state.last_primary_pose = (neutral_pose, (0.0, 0.0), 0.0)
 
 # Move queue (primary moves)
- self.move_queue = deque()
+ self.move_queue: deque[Move] = deque()
 
 # Configuration
 self.idle_inactivity_delay = 0.3 # seconds
@@ -266,7 +268,7 @@
 self.target_period = 1.0 / self.target_frequency
 
 self._stop_event = threading.Event()
- self._thread: Optional[threading.Thread] = None
+ self._thread: threading.Thread | None = None
 self._is_listening = False
 self._last_commanded_pose: FullBodyPose = clone_full_body_pose(self.state.last_primary_pose)
 self._listening_antennas: Tuple[float, float] = self._last_commanded_pose[1]
@@ -281,7 +283,7 @@
 self._set_target_err_suppressed = 0
 
 # Cross-thread signalling
- self._command_queue: Queue[tuple[str, Any]] = Queue()
+ self._command_queue: "Queue[Tuple[str, Any]]" = Queue()
 self._speech_offsets_lock = threading.Lock()
 self._pending_speech_offsets: Tuple[float, float, float, float, float, float] = (
 0.0,
@@ -383,7 +385,7 @@
 
 def _apply_pending_offsets(self) -> None:
 """Apply the most recent speech/face offset updates."""
- speech_offsets: Optional[Tuple[float, float, float, float, float, float]] = None
+ speech_offsets: Tuple[float, float, float, float, float, float] | None = None
 with self._speech_offsets_lock:
 if self._speech_offsets_dirty:
 speech_offsets = self._pending_speech_offsets
@@ -393,7 +395,7 @@
 self.state.speech_offsets = speech_offsets
 self.state.update_activity()
 
- face_offsets: Optional[Tuple[float, float, float, float, float, float]] = None
+ face_offsets: Tuple[float, float, float, float, float, float] | None = None
 with self._face_offsets_lock:
 if self._face_offsets_dirty:
 face_offsets = self._pending_face_offsets
@@ -549,14 +551,13 @@
 )
 
 self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
+ # Otherwise reuse the last primary pose so we avoid jumps between moves
+ elif self.state.last_primary_pose is not None:
+ primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
 else:
- # Otherwise reuse the last primary pose so we avoid jumps between moves
- if self.state.last_primary_pose is not None:
- primary_full_body_pose = clone_full_body_pose(self.state.last_primary_pose)
- else:
- neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
- primary_full_body_pose = (neutral_head_pose, (0.0, 0.0), 0.0)
- self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
+ neutral_head_pose = create_head_pose(0, 0, 0, 0, 0, 0, degrees=True)
+ primary_full_body_pose = (neutral_head_pose, (0.0, 0.0), 0.0)
+ self.state.last_primary_pose = clone_full_body_pose(primary_full_body_pose)
 
 return primary_full_body_pose
 
@@ -631,7 +632,7 @@
 
 return antennas_cmd
 
- def _issue_control_command(self, head: np.ndarray, antennas: Tuple[float, float], body_yaw: float) -> None:
+ def _issue_control_command(self, head: NDArray[np.float32], antennas: Tuple[float, float], body_yaw: float) -> None:
 """Send the fused pose to the robot with throttled error logging."""
 try:
 self.current_robot.set_target(head=head, antennas=antennas, body_yaw=body_yaw)
@@ -651,7 +652,7 @@
 self._last_commanded_pose = clone_full_body_pose((head, antennas, body_yaw))
 
 def _update_frequency_stats(
- self, loop_start: float, prev_loop_start: float, stats: LoopFrequencyStats
+ self, loop_start: float, prev_loop_start: float, stats: LoopFrequencyStats,
 ) -> LoopFrequencyStats:
 """Update frequency statistics based on the current loop start time."""
 period = loop_start - prev_loop_start
@@ -664,7 +665,7 @@
 stats.min_freq = min(stats.min_freq, stats.last_freq)
 return stats
 
- def _schedule_next_tick(self, loop_start: float, stats: LoopFrequencyStats) -> tuple[float, LoopFrequencyStats]:
+ def _schedule_next_tick(self, loop_start: float, stats: LoopFrequencyStats) -> Tuple[float, LoopFrequencyStats]:
 """Compute sleep time to maintain target frequency and update potential freq."""
 computation_time = self._now() - loop_start
 stats.potential_freq = 1.0 / computation_time if computation_time > 0 else float("inf")
@@ -729,7 +730,7 @@
 self._thread = None
 logger.debug("Move worker stopped")
 
- def get_status(self) -> dict[str, Any]:
+ def get_status(self) -> Dict[str, Any]:
 """Return a lightweight status snapshot for observability."""
 with self._status_lock:
 pose_snapshot = clone_full_body_pose(self._last_commanded_pose)
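Since FullBodyPose is now an explicit alias, a value matching it can be sketched as follows (identity pose and zero offsets are illustrative placeholders):

    import numpy as np

    # (head_pose_4x4, antennas, body_yaw) per the alias above
    pose: FullBodyPose = (np.eye(4, dtype=np.float32), (0.0, 0.0), 0.0)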
src/reachy_mini_conversation_demo/openai_realtime.py CHANGED
@@ -2,12 +2,14 @@ import json
2
  import base64
3
  import asyncio
4
  import logging
 
5
  from datetime import datetime
6
 
7
  import numpy as np
8
  import gradio as gr
9
  from openai import AsyncOpenAI
10
  from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
 
11
 
12
  from reachy_mini_conversation_demo.tools import (
13
  ALL_TOOL_SPECS,
@@ -33,18 +35,18 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
33
  )
34
  self.deps = deps
35
 
36
- self.connection = None
37
- self.output_queue = asyncio.Queue()
38
 
39
  self.last_activity_time = asyncio.get_event_loop().time()
40
  self.start_time = asyncio.get_event_loop().time()
41
  self.is_idle_tool_call = False
42
 
43
- def copy(self):
44
  """Create a copy of the handler."""
45
  return OpenaiRealtimeHandler(self.deps)
46
 
47
- async def start_up(self):
48
  """Start the handler."""
49
  self.client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)
50
  async with self.client.beta.realtime.connect(model=config.MODEL_NAME) as conn:
@@ -59,10 +61,10 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
59
  },
60
  "voice": "ballad",
61
  "instructions": SESSION_INSTRUCTIONS,
62
- "tools": ALL_TOOL_SPECS,
63
  "tool_choice": "auto",
64
  "temperature": 0.7,
65
- }
66
  )
67
 
68
  # Manage event received from the openai server
@@ -70,9 +72,10 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
70
  async for event in self.connection:
71
  logger.debug(f"OpenAI event: {event.type}")
72
  if event.type == "input_audio_buffer.speech_started":
73
- if hasattr(self, '_clear_queue'):
74
  self._clear_queue()
75
- self.deps.head_wobbler.reset()
 
76
  self.deps.movement_manager.set_listening(True)
77
  logger.debug("User speech started")
78
 
@@ -83,7 +86,8 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
83
  if event.type in ("response.audio.completed", "response.completed"):
84
  # Doesn't seem to be called
85
  logger.debug("response completed")
86
- self.deps.head_wobbler.reset()
 
87
 
88
  if event.type == "response.created":
89
  logger.debug("Response created")
@@ -91,7 +95,6 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
91
  if event.type == "response.done":
92
  # Doesn't mean the audio is done playing
93
  logger.debug("Response done")
94
- pass
95
 
96
  if event.type == "conversation.item.input_audio_transcription.completed":
97
  logger.debug(f"User transcript: {event.transcript}")
@@ -102,7 +105,8 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
102
  await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))
103
 
104
  if event.type == "response.audio.delta":
105
- self.deps.head_wobbler.feed(event.delta)
 
106
  self.last_activity_time = asyncio.get_event_loop().time()
107
  logger.debug("last activity time updated to %s", self.last_activity_time)
108
  await self.output_queue.put(
@@ -118,6 +122,10 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
118
  args_json_str = getattr(event, "arguments", None)
119
  call_id = getattr(event, "call_id", None)
120
 
 
 
 
 
121
  try:
122
  tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
123
  logger.debug("Tool '%s' executed successfully", tool_name)
@@ -127,22 +135,23 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
127
  tool_result = {"error": str(e)}
128
 
129
  # send the tool result back
130
- await self.connection.conversation.item.create(
131
- item={
132
- "type": "function_call_output",
133
- "call_id": call_id,
134
- "output": json.dumps(tool_result),
135
- }
136
- )
 
137
 
138
  await self.output_queue.put(
139
  AdditionalOutputs(
140
  {
141
  "role": "assistant",
142
  "content": json.dumps(tool_result),
143
- "metadata": {"title": "🛠️ Used tool " + tool_name, "status": "done"},
144
  },
145
- )
146
  )
147
 
148
  if tool_name == "camera" and "b64_im" in tool_result:
@@ -157,37 +166,39 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
157
  "role": "user",
158
  "content": [
159
  {
160
- "type": "input_image",
161
  "image_url": f"data:image/jpeg;base64,{b64_im}",
162
- }
163
  ],
164
- }
165
  )
166
  logger.info("Added camera image to conversation")
167
 
168
- np_img = self.deps.camera_worker.get_latest_frame()
169
- img = gr.Image(value=np_img)
 
170
 
171
- await self.output_queue.put(
172
- AdditionalOutputs(
173
- {
174
- "role": "assistant",
175
- "content": img,
176
- }
 
177
  )
178
- )
179
 
180
  if not self.is_idle_tool_call:
181
  await self.connection.response.create(
182
  response={
183
- "instructions": "Use the tool result just returned and answer concisely in speech."
184
- }
185
  )
186
  else:
187
  self.is_idle_tool_call = False
188
 
189
  # re synchronize the head wobble after a tool call that may have taken some time
190
- self.deps.head_wobbler.reset()
 
191
 
192
  # server error
193
  if event.type == "error":
@@ -197,7 +208,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
197
  await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))
198
 
199
  # Microphone receive
200
- async def receive(self, frame: tuple[int, np.ndarray]) -> None:
201
  """Receive audio frame from the microphone and send it to the openai server."""
202
  if not self.connection:
203
  return
@@ -205,9 +216,9 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
205
  array = array.squeeze()
206
  audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
207
  # Fills the input audio buffer to be sent to the server
208
- await self.connection.input_audio_buffer.append(audio=audio_message) # type: ignore
209
 
210
- async def emit(self) -> tuple[int, np.ndarray] | AdditionalOutputs | None:
211
  """Emit audio frame to be played by the speaker."""
212
  # sends to the stream the stuff put in the output queue by the openai event handler
213
  # This is called periodically by the fastrtc Stream
@@ -219,7 +230,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
219
 
220
  self.last_activity_time = asyncio.get_event_loop().time() # avoid repeated resets
221
 
222
- return await wait_for_item(self.output_queue)
223
 
224
  async def shutdown(self) -> None:
225
  """Shutdown the handler."""
@@ -227,7 +238,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
227
  await self.connection.close()
228
  self.connection = None
229
 
230
- def format_timestamp(self):
231
  """Format current timestamp with date, time and elapsed seconds."""
232
  current_time = asyncio.get_event_loop().time()
233
  elapsed_seconds = current_time - self.start_time
@@ -236,7 +247,7 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
236
 
237
 
238
 
239
- async def send_idle_signal(self, idle_duration) -> None:
240
  """Send an idle signal to the openai server."""
241
  logger.debug("Sending idle signal")
242
  self.is_idle_tool_call = True
@@ -249,12 +260,12 @@ class OpenaiRealtimeHandler(AsyncStreamHandler):
249
  "type": "message",
250
  "role": "user",
251
  "content": [{"type": "input_text", "text": timestamp_msg}],
252
- }
253
  )
254
  await self.connection.response.create(
255
  response={
256
  "modalities": ["text"],
257
  "instructions": "You MUST respond with function calls only - no speech or text. Choose appropriate actions for idle behavior.",
258
  "tool_choice": "required",
259
- }
260
  )
 
2
  import base64
3
  import asyncio
4
  import logging
5
+ from typing import Any, Tuple
6
  from datetime import datetime
7
 
8
  import numpy as np
9
  import gradio as gr
10
  from openai import AsyncOpenAI
11
  from fastrtc import AdditionalOutputs, AsyncStreamHandler, wait_for_item
12
+ from numpy.typing import NDArray
13
 
14
  from reachy_mini_conversation_demo.tools import (
15
  ALL_TOOL_SPECS,
 
35
  )
36
  self.deps = deps
37
 
38
+ self.connection: Any | None = None
39
+ self.output_queue: "asyncio.Queue[Tuple[int, NDArray[np.int16]] | AdditionalOutputs]" = asyncio.Queue()
40
 
41
  self.last_activity_time = asyncio.get_event_loop().time()
42
  self.start_time = asyncio.get_event_loop().time()
43
  self.is_idle_tool_call = False
44
 
45
+ def copy(self) -> "OpenaiRealtimeHandler":
46
  """Create a copy of the handler."""
47
  return OpenaiRealtimeHandler(self.deps)
48
 
49
+ async def start_up(self) -> None:
50
  """Start the handler."""
51
  self.client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)
52
  async with self.client.beta.realtime.connect(model=config.MODEL_NAME) as conn:
 
61
  },
62
  "voice": "ballad",
63
  "instructions": SESSION_INSTRUCTIONS,
64
+ "tools": ALL_TOOL_SPECS, # type: ignore[typeddict-item]
65
  "tool_choice": "auto",
66
  "temperature": 0.7,
67
+ },
68
  )
69
 
70
  # Manage event received from the openai server
 
72
  async for event in self.connection:
73
  logger.debug(f"OpenAI event: {event.type}")
74
  if event.type == "input_audio_buffer.speech_started":
75
+ if hasattr(self, "_clear_queue") and callable(self._clear_queue):
76
  self._clear_queue()
77
+ if self.deps.head_wobbler is not None:
78
+ self.deps.head_wobbler.reset()
79
  self.deps.movement_manager.set_listening(True)
80
  logger.debug("User speech started")
81
 
 
86
  if event.type in ("response.audio.completed", "response.completed"):
87
  # Doesn't seem to be called
88
  logger.debug("response completed")
89
+ if self.deps.head_wobbler is not None:
90
+ self.deps.head_wobbler.reset()
91
 
92
  if event.type == "response.created":
93
  logger.debug("Response created")
 
95
  if event.type == "response.done":
96
  # Doesn't mean the audio is done playing
97
  logger.debug("Response done")
 
98
 
99
  if event.type == "conversation.item.input_audio_transcription.completed":
100
  logger.debug(f"User transcript: {event.transcript}")
 
105
  await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": event.transcript}))
106
 
107
  if event.type == "response.audio.delta":
108
+ if self.deps.head_wobbler is not None:
109
+ self.deps.head_wobbler.feed(event.delta)
110
  self.last_activity_time = asyncio.get_event_loop().time()
111
  logger.debug("last activity time updated to %s", self.last_activity_time)
112
  await self.output_queue.put(
 
122
  args_json_str = getattr(event, "arguments", None)
123
  call_id = getattr(event, "call_id", None)
124
 
125
+ if not isinstance(tool_name, str) or not isinstance(args_json_str, str):
126
+ logger.error("Invalid tool call: tool_name=%s, args=%s", tool_name, args_json_str)
127
+ continue
128
+
129
  try:
130
  tool_result = await dispatch_tool_call(tool_name, args_json_str, self.deps)
131
  logger.debug("Tool '%s' executed successfully", tool_name)
 
135
  tool_result = {"error": str(e)}
136
 
137
  # send the tool result back
138
+ if isinstance(call_id, str):
139
+ await self.connection.conversation.item.create(
140
+ item={
141
+ "type": "function_call_output",
142
+ "call_id": call_id,
143
+ "output": json.dumps(tool_result),
144
+ },
145
+ )
146
 
147
  await self.output_queue.put(
148
  AdditionalOutputs(
149
  {
150
  "role": "assistant",
151
  "content": json.dumps(tool_result),
152
+ "metadata": {"title": f"🛠️ Used tool {tool_name}", "status": "done"},
153
  },
154
+ ),
155
  )
156
 
157
  if tool_name == "camera" and "b64_im" in tool_result:
 
166
  "role": "user",
167
  "content": [
168
  {
169
+ "type": "input_image", # type: ignore[typeddict-item]
170
  "image_url": f"data:image/jpeg;base64,{b64_im}",
171
+ },
172
  ],
173
+ },
174
  )
175
  logger.info("Added camera image to conversation")
176
 
177
+ if self.deps.camera_worker is not None:
178
+ np_img = self.deps.camera_worker.get_latest_frame()
179
+ img = gr.Image(value=np_img)
180
 
181
+ await self.output_queue.put(
182
+ AdditionalOutputs(
183
+ {
184
+ "role": "assistant",
185
+ "content": img,
186
+ },
187
+ ),
188
  )
 
189
 
190
  if not self.is_idle_tool_call:
191
  await self.connection.response.create(
192
  response={
193
+ "instructions": "Use the tool result just returned and answer concisely in speech.",
194
+ },
195
  )
196
  else:
197
  self.is_idle_tool_call = False
198
 
199
  # re synchronize the head wobble after a tool call that may have taken some time
200
+ if self.deps.head_wobbler is not None:
201
+ self.deps.head_wobbler.reset()
202
 
203
  # server error
204
  if event.type == "error":
 
208
  await self.output_queue.put(AdditionalOutputs({"role": "assistant", "content": f"[error] {msg}"}))
209
 
210
  # Microphone receive
211
+ async def receive(self, frame: Tuple[int, NDArray[np.int16]]) -> None:
212
  """Receive audio frame from the microphone and send it to the openai server."""
213
  if not self.connection:
214
  return
 
216
  array = array.squeeze()
217
  audio_message = base64.b64encode(array.tobytes()).decode("utf-8")
218
  # Fills the input audio buffer to be sent to the server
219
+ await self.connection.input_audio_buffer.append(audio=audio_message)
220
 
221
+ async def emit(self) -> Tuple[int, NDArray[np.int16]] | AdditionalOutputs | None:
222
  """Emit audio frame to be played by the speaker."""
223
  # sends to the stream whatever the openai event handler put in the output queue
224
  # This is called periodically by the fastrtc Stream
 
230
 
231
  self.last_activity_time = asyncio.get_event_loop().time() # avoid repeated resets
232
 
233
+ return await wait_for_item(self.output_queue) # type: ignore[no-any-return]
234
 
235
  async def shutdown(self) -> None:
236
  """Shutdown the handler."""
 
238
  await self.connection.close()
239
  self.connection = None
240
 
241
+ def format_timestamp(self) -> str:
242
  """Format current timestamp with date, time and elapsed seconds."""
243
  current_time = asyncio.get_event_loop().time()
244
  elapsed_seconds = current_time - self.start_time
 
247
 
248
 
249
 
250
+ async def send_idle_signal(self, idle_duration: float) -> None:
251
  """Send an idle signal to the openai server."""
252
  logger.debug("Sending idle signal")
253
  self.is_idle_tool_call = True
 
260
  "type": "message",
261
  "role": "user",
262
  "content": [{"type": "input_text", "text": timestamp_msg}],
263
+ },
264
  )
265
  await self.connection.response.create(
266
  response={
267
  "modalities": ["text"],
268
  "instructions": "You MUST respond with function calls only - no speech or text. Choose appropriate actions for idle behavior.",
269
  "tool_choice": "required",
270
+ },
271
  )
src/reachy_mini_conversation_demo/tools.py CHANGED
@@ -4,7 +4,7 @@ import json
4
  import asyncio
5
  import inspect
6
  import logging
7
- from typing import Any, Dict, Literal, Optional
8
  from dataclasses import dataclass
9
 
10
  from reachy_mini import ReachyMini
@@ -36,9 +36,9 @@ except ImportError as e:
36
  EMOTION_AVAILABLE = False
37
 
38
 
39
- def get_concrete_subclasses(base):
40
  """Recursively find all concrete (non-abstract) subclasses of a base class."""
41
- result = []
42
  for cls in base.__subclasses__():
43
  if not inspect.isabstract(cls):
44
  result.append(cls)
@@ -58,9 +58,9 @@ class ToolDependencies:
58
  reachy_mini: ReachyMini
59
  movement_manager: Any # MovementManager from moves.py
60
  # Optional deps
61
- camera_worker: Optional[Any] = None # CameraWorker for frame buffering
62
- vision_manager: Optional[Any] = None
63
- head_wobbler: Optional[Any] = None # HeadWobbler for audio-reactive motion
64
  motion_duration_s: float = 1.0
65
 
66
 
@@ -88,7 +88,7 @@ class Tool(abc.ABC):
88
  }
89
 
90
  @abc.abstractmethod
91
- async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
92
  """Async tool execution entrypoint."""
93
  raise NotImplementedError
94
 
@@ -113,7 +113,7 @@ class MoveHead(Tool):
113
  }
114
 
115
  # mapping: direction -> args for create_head_pose
116
- DELTAS: dict[str, tuple[int, int, int, int, int, int]] = {
117
  "left": (0, 0, 0, 0, 0, 40),
118
  "right": (0, 0, 0, 0, 0, -40),
119
  "up": (0, 0, 0, 0, -30, 0),
@@ -121,9 +121,12 @@ class MoveHead(Tool):
121
  "front": (0, 0, 0, 0, 0, 0),
122
  }
123
 
124
- async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
125
  """Move head in a given direction."""
126
- direction: Direction = kwargs.get("direction")
 
 
 
127
  logger.info("Tool call: move_head direction=%s", direction)
128
 
129
  deltas = self.DELTAS.get(direction, self.DELTAS["front"])
@@ -177,7 +180,7 @@ class Camera(Tool):
177
  "required": ["question"],
178
  }
179
 
180
- async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
181
  """Take a picture with the camera and ask a question about it."""
182
  image_query = (kwargs.get("question") or "").strip()
183
  if not image_query:
@@ -199,7 +202,7 @@ class Camera(Tool):
199
  # Use vision manager for processing if available
200
  if deps.vision_manager is not None:
201
  vision_result = await asyncio.to_thread(
202
- deps.vision_manager.processor.process_image, frame, image_query
203
  )
204
  if isinstance(vision_result, dict) and "error" in vision_result:
205
  return vision_result
@@ -208,17 +211,16 @@ class Camera(Tool):
208
  if isinstance(vision_result, str)
209
  else {"error": "vision returned non-string"}
210
  )
211
- else:
212
- # Return base64 encoded image like main_works.py camera tool
213
- import base64
214
 
215
- import cv2
216
 
217
- temp_path = "/tmp/camera_frame.jpg"
218
- cv2.imwrite(temp_path, frame)
219
- with open(temp_path, "rb") as f:
220
- b64_encoded = base64.b64encode(f.read()).decode("utf-8")
221
- return {"b64_im": b64_encoded}
222
 
223
 
224
  class HeadTracking(Tool):
@@ -232,7 +234,7 @@ class HeadTracking(Tool):
232
  "required": ["start"],
233
  }
234
 
235
- async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
236
  """Enable or disable head tracking."""
237
  enable = bool(kwargs.get("start"))
238
 
@@ -288,12 +290,12 @@ class Dance(Tool):
288
  "required": [],
289
  }
290
 
291
- async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
292
  """Play a named or random dance move once (or repeat). Non-blocking."""
293
  if not DANCE_AVAILABLE:
294
  return {"error": "Dance system not available"}
295
 
296
- move_name = kwargs.get("move", None)
297
  repeat = int(kwargs.get("repeat", 1))
298
 
299
  logger.info("Tool call: dance move=%s repeat=%d", move_name, repeat)
@@ -326,12 +328,12 @@ class StopDance(Tool):
326
  "dummy": {
327
  "type": "boolean",
328
  "description": "dummy boolean, set it to true",
329
- }
330
  },
331
  "required": ["dummy"],
332
  }
333
 
334
- async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
335
  """Stop the current dance move."""
336
  logger.info("Tool call: stop_dance")
337
  movement_manager = deps.movement_manager
@@ -373,7 +375,7 @@ class PlayEmotion(Tool):
373
  "required": ["emotion"],
374
  }
375
 
376
- async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
377
  """Play a pre-recorded emotion."""
378
  if not EMOTION_AVAILABLE:
379
  return {"error": "Emotion system not available"}
@@ -399,7 +401,7 @@ class PlayEmotion(Tool):
399
 
400
  except Exception as e:
401
  logger.exception("Failed to play emotion")
402
- return {"error": f"Failed to play emotion: {str(e)}"}
403
 
404
 
405
  class StopEmotion(Tool):
@@ -413,12 +415,12 @@ class StopEmotion(Tool):
413
  "dummy": {
414
  "type": "boolean",
415
  "description": "dummy boolean, set it to true",
416
- }
417
  },
418
  "required": ["dummy"],
419
  }
420
 
421
- async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
422
  """Stop the current emotion."""
423
  logger.info("Tool call: stop_emotion")
424
  movement_manager = deps.movement_manager
@@ -442,7 +444,7 @@ class DoNothing(Tool):
442
  "required": [],
443
  }
444
 
445
- async def __call__(self, deps: ToolDependencies, **kwargs) -> Dict[str, Any]:
446
  """Do nothing - stay still and silent."""
447
  reason = kwargs.get("reason", "just chilling")
448
  logger.info("Tool call: do_nothing reason=%s", reason)
@@ -452,12 +454,12 @@ class DoNothing(Tool):
452
  # Registry & specs (dynamic)
453
 
454
  # List of available tool classes
455
- ALL_TOOLS: Dict[str, Tool] = {cls.name: cls() for cls in get_concrete_subclasses(Tool)}
456
  ALL_TOOL_SPECS = [tool.spec() for tool in ALL_TOOLS.values()]
457
 
458
 
459
  # Dispatcher
460
- def _safe_load_obj(args_json: str) -> dict[str, Any]:
461
  try:
462
  parsed_args = json.loads(args_json or "{}")
463
  return parsed_args if isinstance(parsed_args, dict) else {}
 
4
  import asyncio
5
  import inspect
6
  import logging
7
+ from typing import Any, Dict, List, Tuple, Literal
8
  from dataclasses import dataclass
9
 
10
  from reachy_mini import ReachyMini
 
36
  EMOTION_AVAILABLE = False
37
 
38
 
39
+ def get_concrete_subclasses(base: type[Tool]) -> List[type[Tool]]:
40
  """Recursively find all concrete (non-abstract) subclasses of a base class."""
41
+ result: List[type[Tool]] = []
42
  for cls in base.__subclasses__():
43
  if not inspect.isabstract(cls):
44
  result.append(cls)
 
58
  reachy_mini: ReachyMini
59
  movement_manager: Any # MovementManager from moves.py
60
  # Optional deps
61
+ camera_worker: Any | None = None # CameraWorker for frame buffering
62
+ vision_manager: Any | None = None
63
+ head_wobbler: Any | None = None # HeadWobbler for audio-reactive motion
64
  motion_duration_s: float = 1.0
65
 
66
 
 
88
  }
89
 
90
  @abc.abstractmethod
91
+ async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
92
  """Async tool execution entrypoint."""
93
  raise NotImplementedError
94
 
 
113
  }
114
 
115
  # mapping: direction -> args for create_head_pose
116
+ DELTAS: Dict[str, Tuple[int, int, int, int, int, int]] = {
117
  "left": (0, 0, 0, 0, 0, 40),
118
  "right": (0, 0, 0, 0, 0, -40),
119
  "up": (0, 0, 0, 0, -30, 0),
 
121
  "front": (0, 0, 0, 0, 0, 0),
122
  }
123
 
124
+ async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
125
  """Move head in a given direction."""
126
+ direction_raw = kwargs.get("direction")
127
+ if not isinstance(direction_raw, str):
128
+ return {"error": "direction must be a string"}
129
+ direction: Direction = direction_raw # type: ignore[assignment]
130
  logger.info("Tool call: move_head direction=%s", direction)
131
 
132
  deltas = self.DELTAS.get(direction, self.DELTAS["front"])
 
180
  "required": ["question"],
181
  }
182
 
183
+ async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
184
  """Take a picture with the camera and ask a question about it."""
185
  image_query = (kwargs.get("question") or "").strip()
186
  if not image_query:
 
202
  # Use vision manager for processing if available
203
  if deps.vision_manager is not None:
204
  vision_result = await asyncio.to_thread(
205
+ deps.vision_manager.processor.process_image, frame, image_query,
206
  )
207
  if isinstance(vision_result, dict) and "error" in vision_result:
208
  return vision_result
 
211
  if isinstance(vision_result, str)
212
  else {"error": "vision returned non-string"}
213
  )
214
+ # Return base64 encoded image like main_works.py camera tool
215
+ import base64
 
216
 
217
+ import cv2
218
 
219
+ temp_path = "/tmp/camera_frame.jpg"
220
+ cv2.imwrite(temp_path, frame)
221
+ with open(temp_path, "rb") as f:
222
+ b64_encoded = base64.b64encode(f.read()).decode("utf-8")
223
+ return {"b64_im": b64_encoded}
224
 
225
 
226
  class HeadTracking(Tool):
 
234
  "required": ["start"],
235
  }
236
 
237
+ async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
238
  """Enable or disable head tracking."""
239
  enable = bool(kwargs.get("start"))
240
 
 
290
  "required": [],
291
  }
292
 
293
+ async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
294
  """Play a named or random dance move once (or repeat). Non-blocking."""
295
  if not DANCE_AVAILABLE:
296
  return {"error": "Dance system not available"}
297
 
298
+ move_name = kwargs.get("move")
299
  repeat = int(kwargs.get("repeat", 1))
300
 
301
  logger.info("Tool call: dance move=%s repeat=%d", move_name, repeat)
 
328
  "dummy": {
329
  "type": "boolean",
330
  "description": "dummy boolean, set it to true",
331
+ },
332
  },
333
  "required": ["dummy"],
334
  }
335
 
336
+ async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
337
  """Stop the current dance move."""
338
  logger.info("Tool call: stop_dance")
339
  movement_manager = deps.movement_manager
 
375
  "required": ["emotion"],
376
  }
377
 
378
+ async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
379
  """Play a pre-recorded emotion."""
380
  if not EMOTION_AVAILABLE:
381
  return {"error": "Emotion system not available"}
 
401
 
402
  except Exception as e:
403
  logger.exception("Failed to play emotion")
404
+ return {"error": f"Failed to play emotion: {e!s}"}
405
 
406
 
407
  class StopEmotion(Tool):
 
415
  "dummy": {
416
  "type": "boolean",
417
  "description": "dummy boolean, set it to true",
418
+ },
419
  },
420
  "required": ["dummy"],
421
  }
422
 
423
+ async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
424
  """Stop the current emotion."""
425
  logger.info("Tool call: stop_emotion")
426
  movement_manager = deps.movement_manager
 
444
  "required": [],
445
  }
446
 
447
+ async def __call__(self, deps: ToolDependencies, **kwargs: Any) -> Dict[str, Any]:
448
  """Do nothing - stay still and silent."""
449
  reason = kwargs.get("reason", "just chilling")
450
  logger.info("Tool call: do_nothing reason=%s", reason)
 
454
  # Registry & specs (dynamic)
455
 
456
  # List of available tool classes
457
+ ALL_TOOLS: Dict[str, Tool] = {cls.name: cls() for cls in get_concrete_subclasses(Tool)} # type: ignore[type-abstract]
458
  ALL_TOOL_SPECS = [tool.spec() for tool in ALL_TOOLS.values()]
459
 
460
 
461
  # Dispatcher
462
+ def _safe_load_obj(args_json: str) -> Dict[str, Any]:
463
  try:
464
  parsed_args = json.loads(args_json or "{}")
465
  return parsed_args if isinstance(parsed_args, dict) else {}
src/reachy_mini_conversation_demo/utils.py CHANGED
@@ -1,11 +1,13 @@
1
  import logging
2
  import argparse
3
  import warnings
 
4
 
 
5
  from reachy_mini_conversation_demo.camera_worker import CameraWorker
6
 
7
 
8
- def parse_args():
9
  """Parse command line arguments."""
10
  parser = argparse.ArgumentParser("Reachy Mini Conversation Demo")
11
  parser.add_argument(
@@ -26,7 +28,7 @@ def parse_args():
26
  return parser.parse_args()
27
 
28
 
29
- def handle_vision_stuff(args, current_robot):
30
  """Initialize camera, head tracker, camera worker, and vision manager.
31
 
32
  By default, vision is handled by gpt-realtime model when camera tool is used.
@@ -44,7 +46,7 @@ def handle_vision_stuff(args, current_robot):
44
 
45
  head_tracker = HeadTracker()
46
  elif args.head_tracker == "mediapipe":
47
- from reachy_mini_toolbox.vision import HeadTracker
48
 
49
  head_tracker = HeadTracker()
50
 
@@ -59,17 +61,17 @@ def handle_vision_stuff(args, current_robot):
59
  vision_manager = initialize_vision_manager(camera_worker)
60
  except ImportError as e:
61
  raise ImportError(
62
- "To use --local-vision, please install the extra dependencies: pip install '.[local_vision]'"
63
  ) from e
64
  else:
65
  logging.getLogger(__name__).info(
66
- "Using gpt-realtime for vision (default). Use --local-vision for local processing."
67
  )
68
 
69
  return camera_worker, head_tracker, vision_manager
70
 
71
 
72
- def setup_logger(debug):
73
  """Setups the logger."""
74
  log_level = "DEBUG" if debug else "INFO"
75
  logging.basicConfig(
 
1
  import logging
2
  import argparse
3
  import warnings
4
+ from typing import Any, Tuple
5
 
6
+ from reachy_mini import ReachyMini
7
  from reachy_mini_conversation_demo.camera_worker import CameraWorker
8
 
9
 
10
+ def parse_args() -> argparse.Namespace:
11
  """Parse command line arguments."""
12
  parser = argparse.ArgumentParser("Reachy Mini Conversation Demo")
13
  parser.add_argument(
 
28
  return parser.parse_args()
29
 
30
 
31
+ def handle_vision_stuff(args: argparse.Namespace, current_robot: ReachyMini) -> Tuple[CameraWorker | None, Any, Any]:
32
  """Initialize camera, head tracker, camera worker, and vision manager.
33
 
34
  By default, vision is handled by gpt-realtime model when camera tool is used.
 
46
 
47
  head_tracker = HeadTracker()
48
  elif args.head_tracker == "mediapipe":
49
+ from reachy_mini_toolbox.vision import HeadTracker # type: ignore[no-redef]
50
 
51
  head_tracker = HeadTracker()
52
 
 
61
  vision_manager = initialize_vision_manager(camera_worker)
62
  except ImportError as e:
63
  raise ImportError(
64
+ "To use --local-vision, please install the extra dependencies: pip install '.[local_vision]'",
65
  ) from e
66
  else:
67
  logging.getLogger(__name__).info(
68
+ "Using gpt-realtime for vision (default). Use --local-vision for local processing.",
69
  )
70
 
71
  return camera_worker, head_tracker, vision_manager
72
 
73
 
74
+ def setup_logger(debug: bool) -> logging.Logger:
75
  """Setups the logger."""
76
  log_level = "DEBUG" if debug else "INFO"
77
  logging.basicConfig(
src/reachy_mini_conversation_demo/vision/processors.py CHANGED
@@ -3,12 +3,13 @@ import time
3
  import base64
4
  import logging
5
  import threading
6
- from typing import Any, Dict, Optional
7
  from dataclasses import dataclass
8
 
9
  import cv2
10
  import numpy as np
11
  import torch
 
12
  from transformers import AutoProcessor, AutoModelForImageTextToText
13
  from huggingface_hub import snapshot_download
14
 
@@ -34,7 +35,7 @@ class VisionConfig:
34
  class VisionProcessor:
35
  """Handles SmolVLM2 model loading and inference."""
36
 
37
- def __init__(self, vision_config: VisionConfig = None):
38
  """Initialize the vision processor."""
39
  self.vision_config = vision_config or VisionConfig()
40
  self.model_path = self.vision_config.model_path
@@ -60,7 +61,7 @@ class VisionProcessor:
60
  """Load model and processor onto the selected device."""
61
  try:
62
  logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={config.HF_HOME})")
63
- self.processor = AutoProcessor.from_pretrained(self.model_path)
64
 
65
  # Select dtype depending on device
66
  if self.device == "cuda":
@@ -70,16 +71,17 @@ class VisionProcessor:
70
  else:
71
  dtype = torch.float32
72
 
73
- model_kwargs = {"dtype": dtype}
74
 
75
  # flash_attention_2 is CUDA-only; skip on MPS/CPU
76
  if self.device == "cuda":
77
  model_kwargs["_attn_implementation"] = "flash_attention_2"
78
 
79
  # Load model weights
80
- self.model = AutoModelForImageTextToText.from_pretrained(self.model_path, **model_kwargs).to(self.device)
81
 
82
- self.model.eval()
 
83
  self._initialized = True
84
  return True
85
 
@@ -89,11 +91,11 @@ class VisionProcessor:
89
 
90
  def process_image(
91
  self,
92
- cv2_image: np.ndarray,
93
  prompt: str = "Briefly describe what you see in one sentence.",
94
  ) -> str:
95
  """Process CV2 image and return description with retry logic."""
96
- if not self._initialized:
97
  return "Vision model not initialized"
98
 
99
  for attempt in range(self.vision_config.max_retries):
@@ -205,16 +207,16 @@ class VisionProcessor:
205
  class VisionManager:
206
  """Manages periodic vision processing and scene understanding."""
207
 
208
- def __init__(self, camera, vision_config: VisionConfig = None):
209
  """Initialize vision manager with camera and configuration."""
210
  self.camera = camera
211
  self.vision_config = vision_config or VisionConfig()
212
  self.vision_interval = self.vision_config.vision_interval
213
  self.processor = VisionProcessor(self.vision_config)
214
 
215
- self._last_processed_time = 0
216
  self._stop_event = threading.Event()
217
- self._thread: Optional[threading.Thread] = None
218
 
219
  # Initialize processor
220
  if not self.processor.initialize():
@@ -245,7 +247,7 @@ class VisionManager:
245
  frame = self.camera.get_latest_frame()
246
  if frame is not None:
247
  description = self.processor.process_image(
248
- frame, "Briefly describe what you see in one sentence."
249
  )
250
 
251
  # Only update if we got a valid response
@@ -274,7 +276,7 @@ class VisionManager:
274
  }
275
 
276
 
277
- def initialize_vision_manager(camera_worker) -> Optional[VisionManager]:
278
  """Initialize vision manager with model download and configuration.
279
 
280
  Args:
@@ -318,7 +320,7 @@ def initialize_vision_manager(camera_worker) -> Optional[VisionManager]:
318
  # Log device info
319
  device_info = vision_manager.processor.get_model_info()
320
  logger.info(
321
- f"Vision processing enabled: {device_info.get('model_path')} on {device_info.get('device')}"
322
  )
323
 
324
  return vision_manager
 
3
  import base64
4
  import logging
5
  import threading
6
+ from typing import Any, Dict
7
  from dataclasses import dataclass
8
 
9
  import cv2
10
  import numpy as np
11
  import torch
12
+ from numpy.typing import NDArray
13
  from transformers import AutoProcessor, AutoModelForImageTextToText
14
  from huggingface_hub import snapshot_download
15
 
 
35
  class VisionProcessor:
36
  """Handles SmolVLM2 model loading and inference."""
37
 
38
+ def __init__(self, vision_config: VisionConfig | None = None):
39
  """Initialize the vision processor."""
40
  self.vision_config = vision_config or VisionConfig()
41
  self.model_path = self.vision_config.model_path
 
61
  """Load model and processor onto the selected device."""
62
  try:
63
  logger.info(f"Loading SmolVLM2 model on {self.device} (HF_HOME={config.HF_HOME})")
64
+ self.processor = AutoProcessor.from_pretrained(self.model_path) # type: ignore[no-untyped-call]
65
 
66
  # Select dtype depending on device
67
  if self.device == "cuda":
 
71
  else:
72
  dtype = torch.float32
73
 
74
+ model_kwargs: Dict[str, Any] = {"dtype": dtype}
75
 
76
  # flash_attention_2 is CUDA-only; skip on MPS/CPU
77
  if self.device == "cuda":
78
  model_kwargs["_attn_implementation"] = "flash_attention_2"
79
 
80
  # Load model weights
81
+ self.model = AutoModelForImageTextToText.from_pretrained(self.model_path, **model_kwargs).to(self.device) # type: ignore[arg-type]
82
 
83
+ if self.model is not None:
84
+ self.model.eval()
85
  self._initialized = True
86
  return True
87
 
 
91
 
92
  def process_image(
93
  self,
94
+ cv2_image: NDArray[np.uint8],
95
  prompt: str = "Briefly describe what you see in one sentence.",
96
  ) -> str:
97
  """Process CV2 image and return description with retry logic."""
98
+ if not self._initialized or self.processor is None or self.model is None:
99
  return "Vision model not initialized"
100
 
101
  for attempt in range(self.vision_config.max_retries):
 
207
  class VisionManager:
208
  """Manages periodic vision processing and scene understanding."""
209
 
210
+ def __init__(self, camera: Any, vision_config: VisionConfig | None = None):
211
  """Initialize vision manager with camera and configuration."""
212
  self.camera = camera
213
  self.vision_config = vision_config or VisionConfig()
214
  self.vision_interval = self.vision_config.vision_interval
215
  self.processor = VisionProcessor(self.vision_config)
216
 
217
+ self._last_processed_time = 0.0
218
  self._stop_event = threading.Event()
219
+ self._thread: threading.Thread | None = None
220
 
221
  # Initialize processor
222
  if not self.processor.initialize():
 
247
  frame = self.camera.get_latest_frame()
248
  if frame is not None:
249
  description = self.processor.process_image(
250
+ frame, "Briefly describe what you see in one sentence.",
251
  )
252
 
253
  # Only update if we got a valid response
 
276
  }
277
 
278
 
279
+ def initialize_vision_manager(camera_worker: Any) -> VisionManager | None:
280
  """Initialize vision manager with model download and configuration.
281
 
282
  Args:
 
320
  # Log device info
321
  device_info = vision_manager.processor.get_model_info()
322
  logger.info(
323
+ f"Vision processing enabled: {device_info.get('model_path')} on {device_info.get('device')}",
324
  )
325
 
326
  return vision_manager
src/reachy_mini_conversation_demo/vision/yolo_head_tracker.py CHANGED
@@ -1,16 +1,17 @@
1
  from __future__ import annotations
2
  import logging
3
- from typing import Tuple, Optional
4
 
5
  import numpy as np
 
6
 
7
 
8
  try:
9
  from supervision import Detections
10
- from ultralytics import YOLO
11
  except ImportError as e:
12
  raise ImportError(
13
- "To use YOLO head tracker, please install the extra dependencies: pip install '.[yolo_vision]'"
14
  ) from e
15
  from huggingface_hub import hf_hub_download
16
 
@@ -48,7 +49,7 @@ class HeadTracker:
48
  logger.error(f"Failed to load YOLO model: {e}")
49
  raise
50
 
51
- def _select_best_face(self, detections: Detections) -> Optional[int]:
52
  """Select the best face based on confidence and area (largest face with highest confidence).
53
 
54
  Args:
@@ -61,6 +62,10 @@ class HeadTracker:
61
  if detections.xyxy.shape[0] == 0:
62
  return None
63
 
 
 
 
 
64
  # Filter by confidence threshold
65
  valid_mask = detections.confidence >= self.confidence_threshold
66
  if not np.any(valid_mask):
@@ -78,9 +83,9 @@ class HeadTracker:
78
 
79
  # Return index of best face
80
  best_idx = valid_indices[np.argmax(scores)]
81
- return best_idx
82
 
83
- def _bbox_to_mp_coords(self, bbox: np.ndarray, w: int, h: int) -> np.ndarray:
84
  """Convert bounding box center to MediaPipe-style coordinates [-1, 1].
85
 
86
  Args:
@@ -101,7 +106,7 @@ class HeadTracker:
101
 
102
  return np.array([norm_x, norm_y], dtype=np.float32)
103
 
104
- def get_head_position(self, img: np.ndarray) -> Tuple[Optional[np.ndarray], Optional[float]]:
105
  """Get head position from face detection.
106
 
107
  Args:
@@ -125,9 +130,10 @@ class HeadTracker:
125
  return None, None
126
 
127
  bbox = detections.xyxy[face_idx]
128
- confidence = detections.confidence[face_idx]
129
 
130
- logger.debug(f"Face detected with confidence: {confidence:.2f}")
 
 
131
 
132
  # Get face center in [-1, 1] coordinates
133
  face_center = self._bbox_to_mp_coords(bbox, w, h)
 
1
  from __future__ import annotations
2
  import logging
3
+ from typing import Tuple
4
 
5
  import numpy as np
6
+ from numpy.typing import NDArray
7
 
8
 
9
  try:
10
  from supervision import Detections
11
+ from ultralytics import YOLO # type: ignore[attr-defined]
12
  except ImportError as e:
13
  raise ImportError(
14
+ "To use YOLO head tracker, please install the extra dependencies: pip install '.[yolo_vision]'",
15
  ) from e
16
  from huggingface_hub import hf_hub_download
17
 
 
49
  logger.error(f"Failed to load YOLO model: {e}")
50
  raise
51
 
52
+ def _select_best_face(self, detections: Detections) -> int | None:
53
  """Select the best face based on confidence and area (largest face with highest confidence).
54
 
55
  Args:
 
62
  if detections.xyxy.shape[0] == 0:
63
  return None
64
 
65
+ # Check if confidence is available
66
+ if detections.confidence is None:
67
+ return None
68
+
69
  # Filter by confidence threshold
70
  valid_mask = detections.confidence >= self.confidence_threshold
71
  if not np.any(valid_mask):
 
83
 
84
  # Return index of best face
85
  best_idx = valid_indices[np.argmax(scores)]
86
+ return int(best_idx)
87
 
88
+ def _bbox_to_mp_coords(self, bbox: NDArray[np.float32], w: int, h: int) -> NDArray[np.float32]:
89
  """Convert bounding box center to MediaPipe-style coordinates [-1, 1].
90
 
91
  Args:
 
106
 
107
  return np.array([norm_x, norm_y], dtype=np.float32)
108
 
109
+ def get_head_position(self, img: NDArray[np.uint8]) -> Tuple[NDArray[np.float32] | None, float | None]:
110
  """Get head position from face detection.
111
 
112
  Args:
 
130
  return None, None
131
 
132
  bbox = detections.xyxy[face_idx]
 
133
 
134
+ if detections.confidence is not None:
135
+ confidence = detections.confidence[face_idx]
136
+ logger.debug(f"Face detected with confidence: {confidence:.2f}")
137
 
138
  # Get face center in [-1, 1] coordinates
139
  face_center = self._bbox_to_mp_coords(bbox, w, h)
tests/audio/test_head_wobbler.py CHANGED
@@ -4,7 +4,8 @@ import math
4
  import time
5
  import base64
6
  import threading
7
- from typing import List, Tuple, Callable
 
8
 
9
  import numpy as np
10
 
@@ -74,7 +75,7 @@ def test_reset_allows_future_offsets() -> None:
74
  wobbler.stop()
75
 
76
 
77
- def test_reset_during_inflight_chunk_keeps_worker(monkeypatch) -> None:
78
  """Simulate reset during chunk processing to ensure the worker survives."""
79
  wobbler, captured = _start_wobbler()
80
  ready = threading.Event()
 
4
  import time
5
  import base64
6
  import threading
7
+ from typing import Any, List, Tuple
8
+ from collections.abc import Callable
9
 
10
  import numpy as np
11
 
 
75
  wobbler.stop()
76
 
77
 
78
+ def test_reset_during_inflight_chunk_keeps_worker(monkeypatch: Any) -> None:
79
  """Simulate reset during chunk processing to ensure the worker survives."""
80
  wobbler, captured = _start_wobbler()
81
  ready = threading.Event()
uv.lock CHANGED
The diff for this file is too large to render. See raw diff