fix: send tts start before speaker audio so ESP32 plays it

The ESP32 ignores binary audio frames unless it has first received a tts start message. Also skip silent frames to reduce bandwidth.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 07:10:10 -07:00 · 2026-04-06 07:10:10 -07:00 · fc3c99d09c
parent b39dab2536
commit fc3c99d09c
1 changed files with 22 additions and 4 deletions
--- a/relay.py
+++ b/relay.py
@@ -141,6 +141,7 @@ class Relay:
        self._speaker_buf = np.array([], dtype=np.int16)
        self._audio_in_count = 0
        self._audio_out_count = 0
        self._tts_started = False  # track if we sent tts start to ESP32
    async def handle_esp32(self, websocket):
        """Handle one ESP32 WebSocket connection."""
@@ -236,12 +237,29 @@ class Relay:
            return
        try:
            self._audio_out_count += 1
            if self._audio_out_count <= 3 or self._audio_out_count % 100 == 0:
                samples_peek = np.frombuffer(pcm_bytes, dtype=np.int16)
                max_amp = int(np.max(np.abs(samples_peek)))
                log.info(f"Speaker frame #{self._audio_out_count}, size={len(pcm_bytes)}, max_amp={max_amp}")
            samples = np.frombuffer(pcm_bytes, dtype=np.int16)
            max_amp = int(np.max(np.abs(samples)))
            if self._audio_out_count <= 3 or self._audio_out_count % 100 == 0:
                log.info(f"Speaker frame #{self._audio_out_count}, size={len(pcm_bytes)}, max_amp={max_amp}")
            # Only send non-silent frames to ESP32
            if max_amp < 10:
                # If we were playing and now silent for a while, send tts stop
                if self._tts_started and self._audio_out_count % 50 == 0:
                    # Check later — don't stop immediately, silence gaps are normal
                    pass
                return
            # Send tts start before first audio frame
            if not self._tts_started:
                await self.ws.send(json.dumps({
                    "type": "tts", "state": "start",
                    "session_id": "relay-session"
                }))
                self._tts_started = True
                log.info("Sent tts start to ESP32")
            # Resample 48kHz → 16kHz
            downsampled = resample_poly(samples, DOWN_RATIO[0], DOWN_RATIO[1]).astype(np.int16)